Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:

 - Add the initial implementation of SCHED_DEADLINE support: a real-time
   scheduling policy where tasks that meet their deadlines and
   periodically execute their instances in less than their runtime quota
   see real-time scheduling and won't miss any of their deadlines.
   Tasks that go over their quota get delayed (Available to privileged
   users for now)

 - Clean up and fix preempt_enable_no_resched() abuse all around the
   tree

 - Do sched_clock() performance optimizations on x86 and elsewhere

 - Fix and improve auto-NUMA balancing

 - Fix and clean up the idle loop

 - Apply various cleanups and fixes

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  sched: Fix __sched_setscheduler() nice test
  sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
  sched: Fix up attr::sched_priority warning
  sched: Fix up scheduler syscall LTP fails
  sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
  sched/core: Fix htmldocs warnings
  sched/deadline: No need to check p if dl_se is valid
  sched/deadline: Remove unused variables
  sched/deadline: Fix sparse static warnings
  m68k: Fix build warning in mac_via.h
  sched, thermal: Clean up preempt_enable_no_resched() abuse
  sched, net: Fixup busy_loop_us_clock()
  sched, net: Clean up preempt_enable_no_resched() abuse
  sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
  sched/preempt, locking: Rework local_bh_{dis,en}able()
  sched/clock, x86: Avoid a runtime condition in native_sched_clock()
  sched/clock: Fix up clear_sched_clock_stable()
  sched/clock, x86: Use a static_key for sched_clock_stable
  sched/clock: Remove local_irq_disable() from the clocks
  sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
  ...
This commit is contained in:
Linus Torvalds 2014-01-20 10:42:08 -08:00
commit a0fa1dd3cd
63 changed files with 3782 additions and 633 deletions

View File

@ -428,11 +428,6 @@ rate for each task.
numa_balancing_scan_size_mb is how many megabytes worth of pages are
scanned for a given scan.
numa_balancing_settle_count is how many scan periods must complete before
the schedule balancer stops pushing the task towards a preferred node. This
gives the scheduler a chance to place the task on an alternative node if the
preferred node is overloaded.
numa_balancing_migrate_deferred is how many page migrations get skipped
unconditionally, after a page migration is skipped because a page is shared
with other tasks. This reduces page migration overhead, and determines

View File

@ -15,7 +15,7 @@
#include <uapi/asm/unistd.h>
#define __NR_syscalls (380)
#define __NR_syscalls (384)
#define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0)
#define __ARCH_WANT_STAT64

View File

@ -406,6 +406,8 @@
#define __NR_process_vm_writev (__NR_SYSCALL_BASE+377)
#define __NR_kcmp (__NR_SYSCALL_BASE+378)
#define __NR_finit_module (__NR_SYSCALL_BASE+379)
#define __NR_sched_setattr (__NR_SYSCALL_BASE+380)
#define __NR_sched_getattr (__NR_SYSCALL_BASE+381)
/*
* This may need to be greater than __NR_last_syscall+1 in order to

View File

@ -389,6 +389,8 @@
CALL(sys_process_vm_writev)
CALL(sys_kcmp)
CALL(sys_finit_module)
/* 380 */ CALL(sys_sched_setattr)
CALL(sys_sched_getattr)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted

View File

@ -254,6 +254,8 @@
extern volatile __u8 *via1,*via2;
extern int rbv_present,via_alt_mapping;
struct irq_desc;
extern void via_register_interrupts(void);
extern void via_irq_enable(int);
extern void via_irq_disable(int);

View File

@ -1,6 +1,8 @@
#ifndef _ASM_X86_MWAIT_H
#define _ASM_X86_MWAIT_H
#include <linux/sched.h>
#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
#define MWAIT_SUBSTATE_SIZE 4
@ -13,4 +15,45 @@
#define MWAIT_ECX_INTERRUPT_BREAK 0x1
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
asm volatile(".byte 0x0f, 0x01, 0xc8;"
:: "a" (eax), "c" (ecx), "d"(edx));
}
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*/
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (!current_set_polling_and_test()) {
if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
mb();
clflush((void *)&current_thread_info()->flags);
mb();
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
if (!need_resched())
__mwait(eax, ecx);
}
current_clr_polling();
}
#endif /* _ASM_X86_MWAIT_H */

View File

@ -700,29 +700,6 @@ static inline void sync_core(void)
#endif
}
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
asm volatile(".byte 0x0f, 0x01, 0xc8;"
:: "a" (eax), "c" (ecx), "d"(edx));
}
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
trace_hardirqs_on();
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void init_amd_e400_c1e_mask(void);

View File

@ -4,6 +4,7 @@
#include <linux/pm.h>
#include <linux/percpu.h>
#include <linux/interrupt.h>
#include <linux/math64.h>
#define TICK_SIZE (tick_nsec / 1000)
@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void);
extern int no_timer_check;
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
* ns = cycles / (freq / ns_per_sec)
* ns = cycles * (ns_per_sec / freq)
* ns = cycles * (10^9 / (cpu_khz * 10^3))
* ns = cycles * (10^6 / cpu_khz)
/*
* We use the full linear equation: f(x) = a + b*x, in order to allow
* a continuous function in the face of dynamic freq changes.
*
* Then we use scaling math (suggested by george@mvista.com) to get:
* ns = cycles * (10^6 * SC / cpu_khz) / SC
* ns = cycles * cyc2ns_scale / SC
* Continuity means that when our frequency changes our slope (b); we want to
* ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
*
* And since SC is a constant power of two, we can convert the div
* into a shift.
* Without an offset (a) the above would not be possible.
*
* We can use khz divisor instead of mhz to keep a better precision, since
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*
* In:
*
* ns = cycles * cyc2ns_scale / SC
*
* Although we may still have enough bits to store the value of ns,
* in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
* leading to an incorrect result.
*
* To avoid this, we can decompose 'cycles' into quotient and remainder
* of division by SC. Then,
*
* ns = (quot * SC + rem) * cyc2ns_scale / SC
* = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
*
* - sqazi@google.com
* See the comment near cycles_2_ns() for details on how we compute (b).
*/
struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
u32 __count;
/* u32 hole */
}; /* 24 bytes -- do not grow */
DECLARE_PER_CPU(unsigned long, cyc2ns);
DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
int cpu = smp_processor_id();
unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
(1UL << CYC2NS_SCALE_FACTOR));
return ns;
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
unsigned long long ns;
unsigned long flags;
local_irq_save(flags);
ns = __cycles_2_ns(cyc);
local_irq_restore(flags);
return ns;
}
extern struct cyc2ns_data *cyc2ns_read_begin(void);
extern void cyc2ns_read_end(struct cyc2ns_data *);
#endif /* _ASM_X86_TIMER_H */

View File

@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
if (!need_resched()) {
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(ax, cx);
}
}
void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();

View File

@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
sched_clock_stable = 1;
set_sched_clock_stable();
}
#ifdef CONFIG_X86_64

View File

@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
sched_clock_stable = 1;
set_sched_clock_stable();
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */

View File

@ -1883,21 +1883,27 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data *data;
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
userpg->pmc_width = x86_pmu.cntval_bits;
if (!sched_clock_stable)
if (!sched_clock_stable())
return;
data = cyc2ns_read_begin();
userpg->cap_user_time = 1;
userpg->time_mult = this_cpu_read(cyc2ns);
userpg->time_shift = CYC2NS_SCALE_FACTOR;
userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;
userpg->cap_user_time_zero = 1;
userpg->time_zero = this_cpu_read(cyc2ns_offset);
userpg->time_zero = data->cyc2ns_offset;
cyc2ns_read_end(data);
}
/*

View File

@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void)
* The WBINVD is insufficient due to the spurious-wakeup
* case where we return around the loop.
*/
mb();
clflush(mwait_ptr);
mb();
__monitor(mwait_ptr, 0, 0);
mb();
__mwait(eax, 0);

View File

@ -11,6 +11,7 @@
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
#include <asm/hpet.h>
#include <asm/timer.h>
@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;
static struct static_key __use_tsc = STATIC_KEY_INIT;
int tsc_clocksource_reliable;
/*
* Use a ring-buffer like data structure, where a writer advances the head by
* writing a new data entry and a reader advances the tail when it observes a
* new entry.
*
* Writers are made to wait on readers until there's space to write a new
* entry.
*
* This means that we can always use an {offset, mul} pair to compute a ns
* value that is 'roughly' in the right direction, even if we're writing a new
* {offset, mul} pair during the clock read.
*
* The down-side is that we can no longer guarantee strict monotonicity anymore
* (assuming the TSC was that to begin with), because while we compute the
* intersection point of the two clock slopes and make sure the time is
* continuous at the point of switching; we can no longer guarantee a reader is
* strictly before or after the switch point.
*
* It does mean a reader no longer needs to disable IRQs in order to avoid
* CPU-Freq updates messing with his times, and similarly an NMI reader will
* no longer run the risk of hitting half-written state.
*/
struct cyc2ns {
struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
struct cyc2ns_data *head; /* 48 + 8 = 56 */
struct cyc2ns_data *tail; /* 56 + 8 = 64 */
}; /* exactly fits one cacheline */
static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
struct cyc2ns_data *cyc2ns_read_begin(void)
{
struct cyc2ns_data *head;
preempt_disable();
head = this_cpu_read(cyc2ns.head);
/*
* Ensure we observe the entry when we observe the pointer to it.
* matches the wmb from cyc2ns_write_end().
*/
smp_read_barrier_depends();
head->__count++;
barrier();
return head;
}
void cyc2ns_read_end(struct cyc2ns_data *head)
{
barrier();
/*
* If we're the outer most nested read; update the tail pointer
* when we're done. This notifies possible pending writers
* that we've observed the head pointer and that the other
* entry is now free.
*/
if (!--head->__count) {
/*
* x86-TSO does not reorder writes with older reads;
* therefore once this write becomes visible to another
* cpu, we must be finished reading the cyc2ns_data.
*
* matches with cyc2ns_write_begin().
*/
this_cpu_write(cyc2ns.tail, head);
}
preempt_enable();
}
/*
* Begin writing a new @data entry for @cpu.
*
* Assumes some sort of write side lock; currently 'provided' by the assumption
* that cpufreq will call its notifiers sequentially.
*/
static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
{
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
struct cyc2ns_data *data = c2n->data;
if (data == c2n->head)
data++;
/* XXX send an IPI to @cpu in order to guarantee a read? */
/*
* When we observe the tail write from cyc2ns_read_end(),
* the cpu must be done with that entry and its safe
* to start writing to it.
*/
while (c2n->tail == data)
cpu_relax();
return data;
}
static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
{
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
/*
* Ensure the @data writes are visible before we publish the
* entry. Matches the data-depencency in cyc2ns_read_begin().
*/
smp_wmb();
ACCESS_ONCE(c2n->head) = data;
}
/*
* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
* ns = cycles / (freq / ns_per_sec)
* ns = cycles * (ns_per_sec / freq)
* ns = cycles * (10^9 / (cpu_khz * 10^3))
* ns = cycles * (10^6 / cpu_khz)
*
* Then we use scaling math (suggested by george@mvista.com) to get:
* ns = cycles * (10^6 * SC / cpu_khz) / SC
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
* into a shift.
*
* We can use khz divisor instead of mhz to keep a better precision, since
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
static void cyc2ns_data_init(struct cyc2ns_data *data)
{
data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
data->cyc2ns_offset = 0;
data->__count = 0;
}
static void cyc2ns_init(int cpu)
{
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
c2n->head = c2n->data;
c2n->tail = c2n->data;
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
struct cyc2ns_data *data, *tail;
unsigned long long ns;
/*
* See cyc2ns_read_*() for details; replicated in order to avoid
* an extra few instructions that came with the abstraction.
* Notable, it allows us to only do the __count and tail update
* dance when its actually needed.
*/
preempt_disable();
data = this_cpu_read(cyc2ns.head);
tail = this_cpu_read(cyc2ns.tail);
if (likely(data == tail)) {
ns = data->cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
} else {
data->__count++;
barrier();
ns = data->cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
barrier();
if (!--data->__count)
this_cpu_write(cyc2ns.tail, data);
}
preempt_enable();
return ns;
}
/* XXX surely we already have this someplace in the kernel?! */
#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
unsigned long long tsc_now, ns_now;
struct cyc2ns_data *data;
unsigned long flags;
local_irq_save(flags);
sched_clock_idle_sleep_event();
if (!cpu_khz)
goto done;
data = cyc2ns_write_begin(cpu);
rdtscll(tsc_now);
ns_now = cycles_2_ns(tsc_now);
/*
* Compute a new multiplier as per the above comment and ensure our
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
data->cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
cyc2ns_write_end(cpu, data);
done:
sched_clock_idle_wakeup_event(0);
local_irq_restore(flags);
}
/*
* Scheduler clock - returns current time in nanosec units.
*/
u64 native_sched_clock(void)
{
u64 this_offset;
u64 tsc_now;
/*
* Fall back to jiffies if there's no TSC available:
@ -53,16 +285,16 @@ u64 native_sched_clock(void)
* very important for it to be as fast as the platform
* can achieve it. )
*/
if (unlikely(tsc_disabled)) {
if (!static_key_false(&__use_tsc)) {
/* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}
/* read the Time Stamp Counter: */
rdtscll(this_offset);
rdtscll(tsc_now);
/* return the value in ns */
return __cycles_2_ns(this_offset);
return cycles_2_ns(tsc_now);
}
/* We need to define a real function for sched_clock, to override the
@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void)
EXPORT_SYMBOL(recalibrate_cpu_khz);
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
* ns = cycles / (freq / ns_per_sec)
* ns = cycles * (ns_per_sec / freq)
* ns = cycles * (10^9 / (cpu_khz * 10^3))
* ns = cycles * (10^6 / cpu_khz)
*
* Then we use scaling math (suggested by george@mvista.com) to get:
* ns = cycles * (10^6 * SC / cpu_khz) / SC
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
* into a shift.
*
* We can use khz divisor instead of mhz to keep a better precision, since
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
DEFINE_PER_CPU(unsigned long, cyc2ns);
DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
unsigned long long tsc_now, ns_now, *offset;
unsigned long flags, *scale;
local_irq_save(flags);
sched_clock_idle_sleep_event();
scale = &per_cpu(cyc2ns, cpu);
offset = &per_cpu(cyc2ns_offset, cpu);
rdtscll(tsc_now);
ns_now = __cycles_2_ns(tsc_now);
if (cpu_khz) {
*scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
cpu_khz / 2) / cpu_khz;
*offset = ns_now - mult_frac(tsc_now, *scale,
(1UL << CYC2NS_SCALE_FACTOR));
}
sched_clock_idle_wakeup_event(0);
local_irq_restore(flags);
}
static unsigned long long cyc2ns_suspend;
void tsc_save_sched_clock_state(void)
{
if (!sched_clock_stable)
if (!sched_clock_stable())
return;
cyc2ns_suspend = sched_clock();
@ -663,16 +845,26 @@ void tsc_restore_sched_clock_state(void)
unsigned long flags;
int cpu;
if (!sched_clock_stable)
if (!sched_clock_stable())
return;
local_irq_save(flags);
__this_cpu_write(cyc2ns_offset, 0);
/*
* We're comming out of suspend, there's no concurrency yet; don't
* bother being nice about the RCU stuff, just write to both
* data fields.
*/
this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
offset = cyc2ns_suspend - sched_clock();
for_each_possible_cpu(cpu)
per_cpu(cyc2ns_offset, cpu) = offset;
for_each_possible_cpu(cpu) {
per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
}
local_irq_restore(flags);
}
@ -795,7 +987,7 @@ void mark_tsc_unstable(char *reason)
{
if (!tsc_unstable) {
tsc_unstable = 1;
sched_clock_stable = 0;
clear_sched_clock_stable();
disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
@ -995,14 +1187,18 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
for_each_possible_cpu(cpu)
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
set_cyc2ns_scale(cpu_khz, cpu);
}
if (tsc_disabled > 0)
return;
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
static_key_slow_inc(&__use_tsc);
if (!no_sched_irq_time)
enable_sched_clock_irqtime();

View File

@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
return;
}
/*
* Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
* number, not an absolute. It converts a duration in cycles to a duration in
* ns.
*/
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
struct cyc2ns_data *data = cyc2ns_read_begin();
unsigned long long ns;
ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
cyc2ns_read_end(data);
return ns;
}
/*
* The reverse of the above; converts a duration in ns to a duration in cycles.
*/
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
struct cyc2ns_data *data = cyc2ns_read_begin();
unsigned long long cyc;
cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
cyc2ns_read_end(data);
return cyc;
}
static inline unsigned long cycles_2_us(unsigned long long cyc)
{
unsigned long long ns;
unsigned long us;
int cpu = smp_processor_id();
return cycles_2_ns(cyc) / NSEC_PER_USEC;
}
ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
us = ns / 1000;
return us;
static inline cycles_t sec_2_cycles(unsigned long sec)
{
return ns_2_cycles(sec * NSEC_PER_SEC);
}
static inline unsigned long long usec_2_cycles(unsigned long usec)
{
return ns_2_cycles(usec * NSEC_PER_USEC);
}
/*
@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc,
bcp, try);
}
static inline cycles_t sec_2_cycles(unsigned long sec)
{
unsigned long ns;
cycles_t cyc;
ns = sec * 1000000000;
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
return cyc;
}
/*
* Our retries are blocked by all destination sw ack resources being
* in use, and a timeout is pending. In that case hardware immediately
@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data)
{
}
static inline unsigned long long usec_2_cycles(unsigned long microsec)
{
unsigned long ns;
unsigned long long cyc;
ns = microsec * 1000;
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
return cyc;
}
/*
* Display the statistics thru /proc/sgi_uv/ptc_statistics
* 'data' points to the cpu number

View File

@ -357,3 +357,5 @@
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
349 i386 kcmp sys_kcmp
350 i386 finit_module sys_finit_module
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr

View File

@ -320,6 +320,8 @@
311 64 process_vm_writev sys_process_vm_writev
312 common kcmp sys_kcmp
313 common finit_module sys_finit_module
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
#
# x32-specific system call numbers start at 512 to avoid cache impact

View File

@ -193,10 +193,7 @@ static int power_saving_thread(void *data)
CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
stop_critical_timings();
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(power_saving_mwait_eax, 1);
mwait_idle_with_hints(power_saving_mwait_eax, 1);
start_critical_timings();
if (lapic_marked_unstable)

View File

@ -727,11 +727,6 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
if (unlikely(!pr))
return -EINVAL;
if (cx->entry_method == ACPI_CSTATE_FFH) {
if (current_set_polling_and_test())
return -EINVAL;
}
lapic_timer_state_broadcast(pr, cx, 1);
acpi_idle_do_entry(cx);
@ -785,11 +780,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
if (unlikely(!pr))
return -EINVAL;
if (cx->entry_method == ACPI_CSTATE_FFH) {
if (current_set_polling_and_test())
return -EINVAL;
}
/*
* Must be done before busmaster disable as we might need to
* access HPET !
@ -841,11 +831,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
}
}
if (cx->entry_method == ACPI_CSTATE_FFH) {
if (current_set_polling_and_test())
return -EINVAL;
}
acpi_unlazy_tlb(smp_processor_id());
/* Tell the scheduler that we are going deep-idle: */

View File

@ -377,16 +377,7 @@ static int intel_idle(struct cpuidle_device *dev,
if (!(lapic_timer_reliable_states & (1 << (cstate))))
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
if (!current_set_polling_and_test()) {
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(eax, ecx);
}
mwait_idle_with_hints(eax, ecx);
if (!(lapic_timer_reliable_states & (1 << (cstate))))
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

View File

@ -438,14 +438,12 @@ static int clamp_thread(void *arg)
*/
local_touch_nmi();
stop_critical_timings();
__monitor((void *)&current_thread_info()->flags, 0, 0);
cpu_relax(); /* allow HT sibling to run */
__mwait(eax, ecx);
mwait_idle_with_hints(eax, ecx);
start_critical_timings();
atomic_inc(&idle_wakeup_counter);
}
tick_nohz_idle_exit();
preempt_enable_no_resched();
preempt_enable();
}
del_timer_sync(&wakeup_timer);
clear_bit(cpunr, cpu_clamping_mask);

View File

@ -1,9 +1,35 @@
#ifndef _LINUX_BH_H
#define _LINUX_BH_H
extern void local_bh_disable(void);
#include <linux/preempt.h>
#include <linux/preempt_mask.h>
#ifdef CONFIG_TRACE_IRQFLAGS
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
preempt_count_add(cnt);
barrier();
}
#endif
static inline void local_bh_disable(void)
{
__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}
extern void _local_bh_enable(void);
extern void local_bh_enable(void);
extern void local_bh_enable_ip(unsigned long ip);
extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);
static inline void local_bh_enable_ip(unsigned long ip)
{
__local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}
static inline void local_bh_enable(void)
{
__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}
#endif /* _LINUX_BH_H */

View File

@ -5,6 +5,7 @@
#include <linux/lockdep.h>
#include <linux/ftrace_irq.h>
#include <linux/vtime.h>
#include <asm/hardirq.h>
extern void synchronize_irq(unsigned int irq);

View File

@ -11,6 +11,7 @@
#include <linux/user_namespace.h>
#include <linux/securebits.h>
#include <linux/seqlock.h>
#include <linux/rbtree.h>
#include <net/net_namespace.h>
#include <linux/sched/rt.h>
@ -154,6 +155,14 @@ extern struct task_group root_task_group;
#define INIT_TASK_COMM "swapper"
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = RB_ROOT, \
.pi_waiters_leftmost = NULL,
#else
# define INIT_RT_MUTEXES(tsk)
#endif
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@ -221,6 +230,7 @@ extern struct task_group root_task_group;
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ(tsk) \
INIT_RT_MUTEXES(tsk) \
INIT_VTIME(tsk) \
}

View File

@ -64,7 +64,11 @@ do { \
} while (0)
#else
#define preempt_enable() preempt_enable_no_resched()
#define preempt_enable() \
do { \
barrier(); \
preempt_count_dec(); \
} while (0)
#define preempt_check_resched() do { } while (0)
#endif
@ -93,7 +97,11 @@ do { \
__preempt_schedule_context(); \
} while (0)
#else
#define preempt_enable_notrace() preempt_enable_no_resched_notrace()
#define preempt_enable_notrace() \
do { \
barrier(); \
__preempt_count_dec(); \
} while (0)
#endif
#else /* !CONFIG_PREEMPT_COUNT */
@ -116,6 +124,31 @@ do { \
#endif /* CONFIG_PREEMPT_COUNT */
#ifdef MODULE
/*
* Modules have no business playing preemption tricks.
*/
#undef sched_preempt_enable_no_resched
#undef preempt_enable_no_resched
#undef preempt_enable_no_resched_notrace
#undef preempt_check_resched
#endif
#ifdef CONFIG_PREEMPT
#define preempt_set_need_resched() \
do { \
set_preempt_need_resched(); \
} while (0)
#define preempt_fold_need_resched() \
do { \
if (tif_need_resched()) \
set_preempt_need_resched(); \
} while (0)
#else
#define preempt_set_need_resched() do { } while (0)
#define preempt_fold_need_resched() do { } while (0)
#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
struct preempt_notifier;

View File

@ -2,7 +2,6 @@
#define LINUX_PREEMPT_MASK_H
#include <linux/preempt.h>
#include <asm/hardirq.h>
/*
* We put the hardirq and softirq counter into the preemption
@ -78,6 +77,21 @@
# define PREEMPT_CHECK_OFFSET 0
#endif
/*
* The preempt_count offset needed for things like:
*
* spin_lock_bh()
*
* Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
* softirqs, such that unlock sequences of:
*
* spin_unlock();
* local_bh_enable();
*
* Work as expected.
*/
#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
/*
* Are we running in atomic context? WARNING: this macro cannot
* always detect atomic context; in particular, it cannot know about

View File

@ -13,7 +13,7 @@
#define __LINUX_RT_MUTEX_H
#include <linux/linkage.h>
#include <linux/plist.h>
#include <linux/rbtree.h>
#include <linux/spinlock_types.h>
extern int max_lock_depth; /* for sysctl */
@ -22,12 +22,14 @@ extern int max_lock_depth; /* for sysctl */
* The rt_mutex structure
*
* @wait_lock: spinlock to protect the structure
* @wait_list: pilist head to enqueue waiters in priority order
* @waiters: rbtree root to enqueue waiters in priority order
* @waiters_leftmost: top waiter
* @owner: the mutex owner
*/
struct rt_mutex {
raw_spinlock_t wait_lock;
struct plist_head wait_list;
struct rb_root waiters;
struct rb_node *waiters_leftmost;
struct task_struct *owner;
#ifdef CONFIG_DEBUG_RT_MUTEXES
int save_state;
@ -66,7 +68,7 @@ struct hrtimer_sleeper;
#define __RT_MUTEX_INITIALIZER(mutexname) \
{ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
, .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \
, .waiters = RB_ROOT \
, .owner = NULL \
__DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
@ -98,12 +100,4 @@ extern int rt_mutex_trylock(struct rt_mutex *lock);
extern void rt_mutex_unlock(struct rt_mutex *lock);
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \
INIT_RT_MUTEX_DEBUG(tsk)
#else
# define INIT_RT_MUTEXES(tsk)
#endif
#endif

View File

@ -172,8 +172,7 @@ static inline void __raw_read_lock_irq(rwlock_t *lock)
static inline void __raw_read_lock_bh(rwlock_t *lock)
{
local_bh_disable();
preempt_disable();
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock);
}
@ -200,8 +199,7 @@ static inline void __raw_write_lock_irq(rwlock_t *lock)
static inline void __raw_write_lock_bh(rwlock_t *lock)
{
local_bh_disable();
preempt_disable();
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
}
@ -250,8 +248,7 @@ static inline void __raw_read_unlock_bh(rwlock_t *lock)
{
rwlock_release(&lock->dep_map, 1, _RET_IP_);
do_raw_read_unlock(lock);
preempt_enable_no_resched();
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
}
static inline void __raw_write_unlock_irqrestore(rwlock_t *lock,
@ -275,8 +272,7 @@ static inline void __raw_write_unlock_bh(rwlock_t *lock)
{
rwlock_release(&lock->dep_map, 1, _RET_IP_);
do_raw_write_unlock(lock);
preempt_enable_no_resched();
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
}
#endif /* __LINUX_RWLOCK_API_SMP_H */

View File

@ -16,6 +16,7 @@ struct sched_param {
#include <linux/types.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/plist.h>
#include <linux/rbtree.h>
#include <linux/thread_info.h>
#include <linux/cpumask.h>
@ -56,6 +57,70 @@ struct sched_param {
#include <asm/processor.h>
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
/*
* Extended scheduling parameters data structure.
*
* This is needed because the original struct sched_param can not be
* altered without introducing ABI issues with legacy applications
* (e.g., in sched_getparam()).
*
* However, the possibility of specifying more than just a priority for
* the tasks may be useful for a wide variety of application fields, e.g.,
* multimedia, streaming, automation and control, and many others.
*
* This variant (sched_attr) is meant at describing a so-called
* sporadic time-constrained task. In such model a task is specified by:
* - the activation period or minimum instance inter-arrival time;
* - the maximum (or average, depending on the actual scheduling
* discipline) computation time of all instances, a.k.a. runtime;
* - the deadline (relative to the actual activation time) of each
* instance.
* Very briefly, a periodic (sporadic) task asks for the execution of
* some specific computation --which is typically called an instance--
* (at most) every period. Moreover, each instance typically lasts no more
* than the runtime and must be completed by time instant t equal to
* the instance activation time + the deadline.
*
* This is reflected by the actual fields of the sched_attr structure:
*
* @size size of the structure, for fwd/bwd compat.
*
* @sched_policy task's scheduling policy
* @sched_flags for customizing the scheduler behaviour
* @sched_nice task's nice value (SCHED_NORMAL/BATCH)
* @sched_priority task's static priority (SCHED_FIFO/RR)
* @sched_deadline representative of the task's deadline
* @sched_runtime representative of the task's runtime
* @sched_period representative of the task's period
*
* Given this task model, there are a multiplicity of scheduling algorithms
* and policies, that can be used to ensure all the tasks will make their
* timing constraints.
*
* As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
* only user of this new interface. More information about the algorithm
* available in the scheduling class file or in Documentation/.
*/
struct sched_attr {
u32 size;
u32 sched_policy;
u64 sched_flags;
/* SCHED_NORMAL, SCHED_BATCH */
s32 sched_nice;
/* SCHED_FIFO, SCHED_RR */
u32 sched_priority;
/* SCHED_DEADLINE */
u64 sched_runtime;
u64 sched_deadline;
u64 sched_period;
};
struct exec_domain;
struct futex_pi_state;
struct robust_list_head;
@ -168,7 +233,6 @@ extern char ___assert_task_state[1 - 2*!!(
#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
#define task_is_dead(task) ((task)->exit_state != 0)
#define task_is_stopped_or_traced(task) \
((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
#define task_contributes_to_load(task) \
@ -1029,6 +1093,51 @@ struct sched_rt_entity {
#endif
};
struct sched_dl_entity {
struct rb_node rb_node;
/*
* Original scheduling parameters. Copied here from sched_attr
* during sched_setscheduler2(), they will remain the same until
* the next sched_setscheduler2().
*/
u64 dl_runtime; /* maximum runtime for each instance */
u64 dl_deadline; /* relative deadline of each instance */
u64 dl_period; /* separation of two instances (period) */
u64 dl_bw; /* dl_runtime / dl_deadline */
/*
* Actual scheduling parameters. Initialized with the values above,
* they are continously updated during task execution. Note that
* the remaining runtime could be < 0 in case we are in overrun.
*/
s64 runtime; /* remaining runtime for this instance */
u64 deadline; /* absolute deadline for this instance */
unsigned int flags; /* specifying the scheduler behaviour */
/*
* Some bool flags:
*
* @dl_throttled tells if we exhausted the runtime. If so, the
* task has to wait for a replenishment to be performed at the
* next firing of dl_timer.
*
* @dl_new tells if a new instance arrived. If so we must
* start executing it with full runtime and reset its absolute
* deadline;
*
* @dl_boosted tells if we are boosted due to DI. If so we are
* outside bandwidth enforcement mechanism (but only until we
* exit the critical section).
*/
int dl_throttled, dl_new, dl_boosted;
/*
* Bandwidth enforcement timer. Each -deadline task has its
* own bandwidth to be enforced, thus we need one timer per task.
*/
struct hrtimer dl_timer;
};
struct rcu_node;
@ -1065,6 +1174,7 @@ struct task_struct {
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
@ -1098,6 +1208,7 @@ struct task_struct {
struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif
struct mm_struct *mm, *active_mm;
@ -1249,9 +1360,12 @@ struct task_struct {
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct plist_head pi_waiters;
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
/* Top pi_waiters task */
struct task_struct *pi_top_task;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
@ -1880,7 +1994,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
* but then during bootup it turns out that sched_clock()
* is reliable after all:
*/
extern int sched_clock_stable;
extern int sched_clock_stable(void);
extern void set_sched_clock_stable(void);
extern void clear_sched_clock_stable(void);
extern void sched_clock_tick(void);
extern void sched_clock_idle_sleep_event(void);
@ -1959,6 +2075,8 @@ extern int sched_setscheduler(struct task_struct *, int,
const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int,
const struct sched_param *);
extern int sched_setattr(struct task_struct *,
const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);
/**
* is_idle_task - is the specified task an idle task?
@ -2038,7 +2156,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif
extern void sched_fork(unsigned long clone_flags, struct task_struct *p);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_dead(struct task_struct *p);
extern void proc_caches_init(void);
@ -2627,6 +2745,21 @@ static inline bool __must_check current_clr_polling_and_test(void)
}
#endif
static inline void current_clr_polling(void)
{
__current_clr_polling();
/*
* Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
* Once the bit is cleared, we'll get IPIs with every new
* TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
* fold.
*/
smp_mb(); /* paired with resched_task() */
preempt_fold_need_resched();
}
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());

View File

@ -0,0 +1,24 @@
#ifndef _SCHED_DEADLINE_H
#define _SCHED_DEADLINE_H
/*
* SCHED_DEADLINE tasks has negative priorities, reflecting
* the fact that any of them has higher prio than RT and
* NORMAL/BATCH tasks.
*/
#define MAX_DL_PRIO 0
static inline int dl_prio(int prio)
{
if (unlikely(prio < MAX_DL_PRIO))
return 1;
return 0;
}
static inline int dl_task(struct task_struct *p)
{
return dl_prio(p->prio);
}
#endif /* _SCHED_DEADLINE_H */

View File

@ -35,6 +35,7 @@ static inline int rt_task(struct task_struct *p)
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
@ -45,6 +46,10 @@ static inline int rt_mutex_getprio(struct task_struct *p)
{
return p->normal_prio;
}
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
}
# define rt_mutex_adjust_pi(p) do { } while (0)
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{

View File

@ -48,7 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_settle_count;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;

View File

@ -131,8 +131,7 @@ static inline void __raw_spin_lock_irq(raw_spinlock_t *lock)
static inline void __raw_spin_lock_bh(raw_spinlock_t *lock)
{
local_bh_disable();
preempt_disable();
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
}
@ -174,20 +173,17 @@ static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock)
{
spin_release(&lock->dep_map, 1, _RET_IP_);
do_raw_spin_unlock(lock);
preempt_enable_no_resched();
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
}
static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
{
local_bh_disable();
preempt_disable();
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
if (do_raw_spin_trylock(lock)) {
spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
}
preempt_enable_no_resched();
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
return 0;
}

View File

@ -24,11 +24,14 @@
* flags straight, to suppress compiler warnings of unused lock
* variables, and to add the proper checker annotations:
*/
#define ___LOCK(lock) \
do { __acquire(lock); (void)(lock); } while (0)
#define __LOCK(lock) \
do { preempt_disable(); __acquire(lock); (void)(lock); } while (0)
do { preempt_disable(); ___LOCK(lock); } while (0)
#define __LOCK_BH(lock) \
do { local_bh_disable(); __LOCK(lock); } while (0)
do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0)
#define __LOCK_IRQ(lock) \
do { local_irq_disable(); __LOCK(lock); } while (0)
@ -36,12 +39,15 @@
#define __LOCK_IRQSAVE(lock, flags) \
do { local_irq_save(flags); __LOCK(lock); } while (0)
#define ___UNLOCK(lock) \
do { __release(lock); (void)(lock); } while (0)
#define __UNLOCK(lock) \
do { preempt_enable(); __release(lock); (void)(lock); } while (0)
do { preempt_enable(); ___UNLOCK(lock); } while (0)
#define __UNLOCK_BH(lock) \
do { preempt_enable_no_resched(); local_bh_enable(); \
__release(lock); (void)(lock); } while (0)
do { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); \
___UNLOCK(lock); } while (0)
#define __UNLOCK_IRQ(lock) \
do { local_irq_enable(); __UNLOCK(lock); } while (0)

View File

@ -38,6 +38,7 @@ struct rlimit;
struct rlimit64;
struct rusage;
struct sched_param;
struct sched_attr;
struct sel_arg_struct;
struct semaphore;
struct sembuf;
@ -279,9 +280,14 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
struct sched_param __user *param);
asmlinkage long sys_sched_setparam(pid_t pid,
struct sched_param __user *param);
asmlinkage long sys_sched_setattr(pid_t pid,
struct sched_attr __user *attr);
asmlinkage long sys_sched_getscheduler(pid_t pid);
asmlinkage long sys_sched_getparam(pid_t pid,
struct sched_param __user *param);
asmlinkage long sys_sched_getattr(pid_t pid,
struct sched_attr __user *attr,
unsigned int size);
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
unsigned long __user *user_mask_ptr);
asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,

View File

@ -25,13 +25,16 @@ static inline void pagefault_disable(void)
static inline void pagefault_enable(void)
{
#ifndef CONFIG_PREEMPT
/*
* make sure to issue those last loads/stores before enabling
* the pagefault handler again.
*/
barrier();
preempt_count_dec();
preempt_check_resched();
#else
preempt_enable();
#endif
}
#ifndef ARCH_HAS_NOCACHE_UACCESS

View File

@ -42,27 +42,10 @@ static inline bool net_busy_loop_on(void)
return sysctl_net_busy_poll;
}
/* a wrapper to make debug_smp_processor_id() happy
* we can use sched_clock() because we don't care much about precision
* we only care that the average is bounded
*/
#ifdef CONFIG_DEBUG_PREEMPT
static inline u64 busy_loop_us_clock(void)
{
u64 rc;
preempt_disable_notrace();
rc = sched_clock();
preempt_enable_no_resched_notrace();
return rc >> 10;
return local_clock() >> 10;
}
#else /* CONFIG_DEBUG_PREEMPT */
static inline u64 busy_loop_us_clock(void)
{
return sched_clock() >> 10;
}
#endif /* CONFIG_DEBUG_PREEMPT */
static inline unsigned long sk_busy_loop_end_time(struct sock *sk)
{

View File

@ -39,8 +39,14 @@
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
/*
* For the sched_{set,get}attr() calls
*/
#define SCHED_FLAG_RESET_ON_FORK 0x01
#endif /* _UAPI_LINUX_SCHED_H */

View File

@ -105,14 +105,17 @@ static void cpu_idle_loop(void)
__current_set_polling();
}
arch_cpu_idle_exit();
/*
* We need to test and propagate the TIF_NEED_RESCHED
* bit here because we might not have send the
* reschedule IPI to idle tasks.
*/
if (tif_need_resched())
set_preempt_need_resched();
}
/*
* Since we fell out of the loop above, we know
* TIF_NEED_RESCHED must be set, propagate it into
* PREEMPT_NEED_RESCHED.
*
* This is required because for polling idle loops we will
* not have had an IPI to fold the state for us.
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
schedule_preempt_disabled();
}

View File

@ -1087,8 +1087,10 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
plist_head_init(&p->pi_waiters);
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
p->pi_top_task = NULL;
#endif
}
@ -1311,7 +1313,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(clone_flags, p);
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p);
if (retval)
@ -1403,13 +1407,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->tgid = p->pid;
}
p->pdeath_signal = 0;
p->exit_state = 0;
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;

View File

@ -2426,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* code while we sleep on uaddr.
*/
debug_rt_mutex_init_waiter(&rt_waiter);
RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
RB_CLEAR_NODE(&rt_waiter.tree_entry);
rt_waiter.task = NULL;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);

View File

@ -46,6 +46,7 @@
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/timer.h>
#include <linux/freezer.h>
@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
unsigned long slack;
slack = current->timer_slack_ns;
if (rt_task(current))
if (dl_task(current) || rt_task(current))
slack = 0;
hrtimer_init_on_stack(&t.timer, clockid, mode);

View File

@ -24,7 +24,7 @@
#include <linux/kallsyms.h>
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/plist.h>
#include <linux/rbtree.h>
#include <linux/fs.h>
#include <linux/debug_locks.h>
@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
void rt_mutex_debug_task_free(struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
{
memset(waiter, 0x11, sizeof(*waiter));
plist_node_init(&waiter->list_entry, MAX_PRIO);
plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
waiter->deadlock_task_pid = NULL;
}
void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
{
put_pid(waiter->deadlock_task_pid);
DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
memset(waiter, 0x22, sizeof(*waiter));
}

View File

@ -14,6 +14,7 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/timer.h>
#include "rtmutex_common.h"
@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
}
#endif
static inline int
rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
{
if (left->prio < right->prio)
return 1;
/*
* Calculate task priority from the waiter list priority
* If both waiters have dl_prio(), we check the deadlines of the
* associated tasks.
* If left waiter has a dl_prio(), and we didn't return 1 above,
* then right waiter has a dl_prio() too.
*/
if (dl_prio(left->prio))
return (left->task->dl.deadline < right->task->dl.deadline);
return 0;
}
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
struct rb_node **link = &lock->waiters.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
int leftmost = 1;
while (*link) {
parent = *link;
entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
if (rt_mutex_waiter_less(waiter, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
if (leftmost)
lock->waiters_leftmost = &waiter->tree_entry;
rb_link_node(&waiter->tree_entry, parent, link);
rb_insert_color(&waiter->tree_entry, &lock->waiters);
}
static void
rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
if (RB_EMPTY_NODE(&waiter->tree_entry))
return;
if (lock->waiters_leftmost == &waiter->tree_entry)
lock->waiters_leftmost = rb_next(&waiter->tree_entry);
rb_erase(&waiter->tree_entry, &lock->waiters);
RB_CLEAR_NODE(&waiter->tree_entry);
}
static void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
struct rb_node **link = &task->pi_waiters.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
int leftmost = 1;
while (*link) {
parent = *link;
entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
if (rt_mutex_waiter_less(waiter, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
if (leftmost)
task->pi_waiters_leftmost = &waiter->pi_tree_entry;
rb_link_node(&waiter->pi_tree_entry, parent, link);
rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
}
static void
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
return;
if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
/*
* Calculate task priority from the waiter tree priority
*
* Return task->normal_prio when the waiter list is empty or when
* Return task->normal_prio when the waiter tree is empty or when
* the waiter is not allowed to do priority boosting
*/
int rt_mutex_getprio(struct task_struct *task)
@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task)
if (likely(!task_has_pi_waiters(task)))
return task->normal_prio;
return min(task_top_pi_waiter(task)->pi_list_entry.prio,
return min(task_top_pi_waiter(task)->prio,
task->normal_prio);
}
struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
if (likely(!task_has_pi_waiters(task)))
return NULL;
return task_top_pi_waiter(task)->task;
}
/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
{
int prio = rt_mutex_getprio(task);
if (task->prio != prio)
if (task->prio != prio || dl_prio(prio))
rt_mutex_setprio(task, prio);
}
@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* When deadlock detection is off then we check, if further
* priority adjustment is necessary.
*/
if (!detect_deadlock && waiter->list_entry.prio == task->prio)
if (!detect_deadlock && waiter->prio == task->prio)
goto out_unlock_pi;
lock = waiter->lock;
@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* Requeue the waiter */
plist_del(&waiter->list_entry, &lock->wait_list);
waiter->list_entry.prio = task->prio;
plist_add(&waiter->list_entry, &lock->wait_list);
rt_mutex_dequeue(lock, waiter);
waiter->prio = task->prio;
rt_mutex_enqueue(lock, waiter);
/* Release the task */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
if (waiter == rt_mutex_top_waiter(lock)) {
/* Boost the owner */
plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
waiter->pi_list_entry.prio = waiter->list_entry.prio;
plist_add(&waiter->pi_list_entry, &task->pi_waiters);
rt_mutex_dequeue_pi(task, top_waiter);
rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
} else if (top_waiter == waiter) {
/* Deboost the owner */
plist_del(&waiter->pi_list_entry, &task->pi_waiters);
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
waiter->pi_list_entry.prio = waiter->list_entry.prio;
plist_add(&waiter->pi_list_entry, &task->pi_waiters);
rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
}
@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* 3) it is top waiter
*/
if (rt_mutex_has_waiters(lock)) {
if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
if (!waiter || waiter != rt_mutex_top_waiter(lock))
return 0;
}
@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/* remove the queued waiter. */
if (waiter) {
plist_del(&waiter->list_entry, &lock->wait_list);
rt_mutex_dequeue(lock, waiter);
task->pi_blocked_on = NULL;
}
@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock)) {
top = rt_mutex_top_waiter(lock);
top->pi_list_entry.prio = top->list_entry.prio;
plist_add(&top->pi_list_entry, &task->pi_waiters);
rt_mutex_enqueue_pi(task, top);
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}
@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
plist_node_init(&waiter->list_entry, task->prio);
plist_node_init(&waiter->pi_list_entry, task->prio);
waiter->prio = task->prio;
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
top_waiter = rt_mutex_top_waiter(lock);
plist_add(&waiter->list_entry, &lock->wait_list);
rt_mutex_enqueue(lock, waiter);
task->pi_blocked_on = waiter;
@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (waiter == rt_mutex_top_waiter(lock)) {
raw_spin_lock_irqsave(&owner->pi_lock, flags);
plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
* boosted mode and go back to normal after releasing
* lock->wait_lock.
*/
plist_del(&waiter->pi_list_entry, &current->pi_waiters);
rt_mutex_dequeue_pi(current, waiter);
rt_mutex_set_owner(lock, NULL);
@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock,
int chain_walk = 0;
raw_spin_lock_irqsave(&current->pi_lock, flags);
plist_del(&waiter->list_entry, &lock->wait_list);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_lock_irqsave(&owner->pi_lock, flags);
plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
rt_mutex_dequeue_pi(owner, waiter);
if (rt_mutex_has_waiters(lock)) {
struct rt_mutex_waiter *next;
next = rt_mutex_top_waiter(lock);
plist_add(&next->pi_list_entry, &owner->pi_waiters);
rt_mutex_enqueue_pi(owner, next);
}
__rt_mutex_adjust_prio(owner);
@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
}
WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
if (!chain_walk)
return;
@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
if (!waiter || waiter->list_entry.prio == task->prio) {
if (!waiter || (waiter->prio == task->prio &&
!dl_prio(task->prio))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
int ret = 0;
debug_rt_mutex_init_waiter(&waiter);
RB_CLEAR_NODE(&waiter.pi_tree_entry);
RB_CLEAR_NODE(&waiter.tree_entry);
raw_spin_lock(&lock->wait_lock);
@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
{
lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
plist_head_init(&lock->wait_list);
lock->waiters = RB_ROOT;
lock->waiters_leftmost = NULL;
debug_rt_mutex_init(lock, name);
}

View File

@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock);
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
* @list_entry: pi node to enqueue into the mutex waiters list
* @pi_list_entry: pi node to enqueue into the mutex owner waiters list
* @tree_entry: pi node to enqueue into the mutex waiters tree
* @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
*/
struct rt_mutex_waiter {
struct plist_node list_entry;
struct plist_node pi_list_entry;
struct rb_node tree_entry;
struct rb_node pi_tree_entry;
struct task_struct *task;
struct rt_mutex *lock;
#ifdef CONFIG_DEBUG_RT_MUTEXES
@ -54,14 +54,15 @@ struct rt_mutex_waiter {
struct pid *deadlock_task_pid;
struct rt_mutex *deadlock_lock;
#endif
int prio;
};
/*
* Various helpers to access the waiters-plist:
* Various helpers to access the waiters-tree:
*/
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
return !plist_head_empty(&lock->wait_list);
return !RB_EMPTY_ROOT(&lock->waiters);
}
static inline struct rt_mutex_waiter *
@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *w;
w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
list_entry);
w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
tree_entry);
BUG_ON(w->lock != lock);
return w;
@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
static inline int task_has_pi_waiters(struct task_struct *p)
{
return !plist_head_empty(&p->pi_waiters);
return !RB_EMPTY_ROOT(&p->pi_waiters);
}
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
pi_list_entry);
return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
pi_tree_entry);
}
/*

View File

@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
obj-y += core.o proc.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o

View File

@ -26,9 +26,10 @@
* at 0 on boot (but people really shouldn't rely on that).
*
* cpu_clock(i) -- can be used from any context, including NMI.
* sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
* local_clock() -- is cpu_clock() on the current cpu.
*
* sched_clock_cpu(i)
*
* How:
*
* The implementation either uses sched_clock() when
@ -50,15 +51,6 @@
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
* that is otherwise invisible (TSC gets stopped).
*
*
* Notes:
*
* The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
* like cpufreq interrupts that can change the base clock (TSC) multiplier
* and cause funny jumps in time -- although the filtering provided by
* sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
* in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
* sched_clock().
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
@ -66,6 +58,8 @@
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
/*
* Scheduler clock - returns current time in nanosec units.
@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock);
__read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
__read_mostly int sched_clock_stable;
static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
int sched_clock_stable(void)
{
if (static_key_false(&__sched_clock_stable))
return false;
return true;
}
void set_sched_clock_stable(void)
{
if (!sched_clock_stable())
static_key_slow_dec(&__sched_clock_stable);
}
static void __clear_sched_clock_stable(struct work_struct *work)
{
/* XXX worry about clock continuity */
if (sched_clock_stable())
static_key_slow_inc(&__sched_clock_stable);
}
static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
void clear_sched_clock_stable(void)
{
if (keventd_up())
schedule_work(&sched_clock_work);
else
__clear_sched_clock_stable(&sched_clock_work);
}
struct sched_clock_data {
u64 tick_raw;
@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu)
struct sched_clock_data *scd;
u64 clock;
WARN_ON_ONCE(!irqs_disabled());
if (sched_clock_stable)
if (sched_clock_stable())
return sched_clock();
if (unlikely(!sched_clock_running))
return 0ull;
preempt_disable();
scd = cpu_sdc(cpu);
if (cpu != smp_processor_id())
clock = sched_clock_remote(scd);
else
clock = sched_clock_local(scd);
preempt_enable();
return clock;
}
@ -265,7 +289,7 @@ void sched_clock_tick(void)
struct sched_clock_data *scd;
u64 now, now_gtod;
if (sched_clock_stable)
if (sched_clock_stable())
return;
if (unlikely(!sched_clock_running))
@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
*/
u64 cpu_clock(int cpu)
{
u64 clock;
unsigned long flags;
if (static_key_false(&__sched_clock_stable))
return sched_clock_cpu(cpu);
local_irq_save(flags);
clock = sched_clock_cpu(cpu);
local_irq_restore(flags);
return clock;
return sched_clock();
}
/*
@ -335,14 +355,10 @@ u64 cpu_clock(int cpu)
*/
u64 local_clock(void)
{
u64 clock;
unsigned long flags;
if (static_key_false(&__sched_clock_stable))
return sched_clock_cpu(raw_smp_processor_id());
local_irq_save(flags);
clock = sched_clock_cpu(smp_processor_id());
local_irq_restore(flags);
return clock;
return sched_clock();
}
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu)
u64 cpu_clock(int cpu)
{
return sched_clock_cpu(cpu);
return sched_clock();
}
u64 local_clock(void)
{
return sched_clock_cpu(0);
return sched_clock();
}
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

File diff suppressed because it is too large Load Diff

216
kernel/sched/cpudeadline.c Normal file
View File

@ -0,0 +1,216 @@
/*
* kernel/sched/cpudl.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
#include "cpudeadline.h"
static inline int parent(int i)
{
return (i - 1) >> 1;
}
static inline int left_child(int i)
{
return (i << 1) + 1;
}
static inline int right_child(int i)
{
return (i << 1) + 2;
}
static inline int dl_time_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}
static void cpudl_exchange(struct cpudl *cp, int a, int b)
{
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
swap(cp->elements[a], cp->elements[b]);
swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
}
static void cpudl_heapify(struct cpudl *cp, int idx)
{
int l, r, largest;
/* adapted from lib/prio_heap.c */
while(1) {
l = left_child(idx);
r = right_child(idx);
largest = idx;
if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
cp->elements[l].dl))
largest = l;
if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
cp->elements[r].dl))
largest = r;
if (largest == idx)
break;
/* Push idx down the heap one level and bump one up */
cpudl_exchange(cp, largest, idx);
idx = largest;
}
}
static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
{
WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
if (dl_time_before(new_dl, cp->elements[idx].dl)) {
cp->elements[idx].dl = new_dl;
cpudl_heapify(cp, idx);
} else {
cp->elements[idx].dl = new_dl;
while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
cp->elements[idx].dl)) {
cpudl_exchange(cp, idx, parent(idx));
idx = parent(idx);
}
}
}
static inline int cpudl_maximum(struct cpudl *cp)
{
return cp->elements[0].cpu;
}
/*
* cpudl_find - find the best (later-dl) CPU in the system
* @cp: the cpudl max-heap context
* @p: the task
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
*
* Returns: int - best CPU (heap maximum if suitable)
*/
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask)
{
int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask && cpumask_and(later_mask, cp->free_cpus,
&p->cpus_allowed) && cpumask_and(later_mask,
later_mask, cpu_active_mask)) {
best_cpu = cpumask_any(later_mask);
goto out;
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
best_cpu = cpudl_maximum(cp);
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
}
out:
WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
return best_cpu;
}
/*
* cpudl_set - update the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target cpu
* @dl: the new earliest deadline for this cpu
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
{
int old_idx, new_cpu;
unsigned long flags;
WARN_ON(cpu > num_present_cpus());
raw_spin_lock_irqsave(&cp->lock, flags);
old_idx = cp->cpu_to_idx[cpu];
if (!is_valid) {
/* remove item */
if (old_idx == IDX_INVALID) {
/*
* Nothing to remove if old_idx was invalid.
* This could happen if a rq_offline_dl is
* called for a CPU without -dl tasks running.
*/
goto out;
}
new_cpu = cp->elements[cp->size - 1].cpu;
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
cp->cpu_to_idx[new_cpu] = old_idx;
cp->cpu_to_idx[cpu] = IDX_INVALID;
while (old_idx > 0 && dl_time_before(
cp->elements[parent(old_idx)].dl,
cp->elements[old_idx].dl)) {
cpudl_exchange(cp, old_idx, parent(old_idx));
old_idx = parent(old_idx);
}
cpumask_set_cpu(cpu, cp->free_cpus);
cpudl_heapify(cp, old_idx);
goto out;
}
if (old_idx == IDX_INVALID) {
cp->size++;
cp->elements[cp->size - 1].dl = 0;
cp->elements[cp->size - 1].cpu = cpu;
cp->cpu_to_idx[cpu] = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
cpudl_change_key(cp, old_idx, dl);
}
out:
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context
*/
int cpudl_init(struct cpudl *cp)
{
int i;
memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
for (i = 0; i < NR_CPUS; i++)
cp->cpu_to_idx[i] = IDX_INVALID;
if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
return -ENOMEM;
cpumask_setall(cp->free_cpus);
return 0;
}
/*
* cpudl_cleanup - clean up the cpudl structure
* @cp: the cpudl max-heap context
*/
void cpudl_cleanup(struct cpudl *cp)
{
/*
* nothing to do for the moment
*/
}

View File

@ -0,0 +1,33 @@
#ifndef _LINUX_CPUDL_H
#define _LINUX_CPUDL_H
#include <linux/sched.h>
#define IDX_INVALID -1
struct array_item {
u64 dl;
int cpu;
};
struct cpudl {
raw_spinlock_t lock;
int size;
int cpu_to_idx[NR_CPUS];
struct array_item elements[NR_CPUS];
cpumask_var_t free_cpus;
};
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
#else
#define cpudl_set(cp, cpu, dl) do { } while (0)
#define cpudl_init() do { } while (0)
#endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */

1640
kernel/sched/deadline.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
SEQ_printf(m, " %d", task_node(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)
PN(cpu_clk);
P(jiffies);
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
P(sched_clock_stable);
P(sched_clock_stable());
#endif
#undef PN
#undef P

View File

@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
/*
* Once a preferred node is selected the scheduler balancer will prefer moving
* a task to that node for sysctl_numa_balancing_settle_count number of PTE
* scans. This will give the process the chance to accumulate more faults on
* the preferred node but still allow the scheduler to move the task again if
* the nodes CPUs are overloaded.
*/
unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != -1);
@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
if (!p->numa_group)
return 0;
return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
return p->numa_group->faults[task_faults_idx(nid, 0)] +
p->numa_group->faults[task_faults_idx(nid, 1)];
}
/*
@ -1023,7 +1015,7 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;
int imbalance_pct, idx;
int imbalance_pct;
struct task_struct *best_task;
long best_imp;
@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
* elsewhere, so there is no point in (re)trying.
*/
if (unlikely(!sd)) {
p->numa_preferred_nid = cpu_to_node(task_cpu(p));
p->numa_preferred_nid = task_node(p);
return -EINVAL;
}
@ -1278,7 +1270,7 @@ static void numa_migrate_preferred(struct task_struct *p)
p->numa_migrate_retry = jiffies + HZ;
/* Success if task is already running on preferred CPU */
if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
if (task_node(p) == p->numa_preferred_nid)
return;
/* Otherwise, try migrate to a CPU on the preferred node */
@ -1350,7 +1342,6 @@ static void update_task_scan_period(struct task_struct *p,
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int load_idx)
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
do {
unsigned long load, avg_load;
int local_group;
@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
while (sd) {
int load_idx = sd->forkexec_idx;
struct sched_group *group;
int weight;
@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
continue;
}
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
group = find_idlest_group(sd, p, cpu, load_idx);
group = find_idlest_group(sd, p, cpu, sd_flag);
if (!group) {
sd = sd->child;
continue;
@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs)
{
unsigned long nr_running;
unsigned long load;
int i;
@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
nr_running = rq->nr_running;
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
sgs->sum_nr_running += nr_running;
sgs->sum_nr_running += rq->nr_running;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@ -6521,7 +6509,7 @@ static struct {
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
static inline int find_new_ilb(int call_cpu)
static inline int find_new_ilb(void)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
static void nohz_balancer_kick(int cpu)
static void nohz_balancer_kick(void)
{
int ilb_cpu;
nohz.next_balance++;
ilb_cpu = find_new_ilb(cpu);
ilb_cpu = find_new_ilb();
if (ilb_cpu >= nr_cpu_ids)
return;
@ -6652,10 +6640,10 @@ void update_max_interval(void)
*
* Balancing parameters are set up in init_sched_domains.
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
struct rq *rq = cpu_rq(cpu);
int cpu = rq->cpu;
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@ -6752,9 +6740,9 @@ out:
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
struct rq *this_rq = cpu_rq(this_cpu);
int this_cpu = this_rq->cpu;
struct rq *rq;
int balance_cpu;
@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
update_idle_cpu_load(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(balance_cpu, CPU_IDLE);
rebalance_domains(rq, CPU_IDLE);
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
@ -6800,14 +6788,14 @@ end:
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
static inline int nohz_kick_needed(struct rq *rq, int cpu)
static inline int nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_power *sgp;
int nr_busy;
int nr_busy, cpu = rq->cpu;
if (unlikely(idle_cpu(cpu)))
if (unlikely(rq->idle_balance))
return 0;
/*
@ -6856,7 +6844,7 @@ need_kick:
return 1;
}
#else
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
#endif
/*
@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
*/
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
struct rq *this_rq = cpu_rq(this_cpu);
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
rebalance_domains(this_cpu, idle);
rebalance_domains(this_rq, idle);
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
nohz_idle_balance(this_cpu, idle);
nohz_idle_balance(this_rq, idle);
}
static inline int on_null_domain(int cpu)
static inline int on_null_domain(struct rq *rq)
{
return !rcu_dereference_sched(cpu_rq(cpu)->sd);
return !rcu_dereference_sched(rq->sd);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
void trigger_load_balance(struct rq *rq, int cpu)
void trigger_load_balance(struct rq *rq)
{
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
if (unlikely(on_null_domain(rq)))
return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
nohz_balancer_kick(cpu);
if (nohz_kick_needed(rq))
nohz_balancer_kick();
#endif
}

View File

@ -1738,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
rt_task(rq->curr) &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
(rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);

View File

@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
@ -9,6 +10,7 @@
#include <linux/slab.h>
#include "cpupri.h"
#include "cpudeadline.h"
#include "cpuacct.h"
struct rq;
@ -72,6 +74,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
/*
* Single value that decides SCHED_DEADLINE internal math precision.
* 10 -> just above 1us
* 9 -> just above 0.5us
*/
#define DL_SCALE (10)
/*
* These are the 'tuning knobs' of the scheduler:
*/
@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq);
*/
#define RUNTIME_INF ((u64)~0ULL)
static inline int fair_policy(int policy)
{
return policy == SCHED_NORMAL || policy == SCHED_BATCH;
}
static inline int rt_policy(int policy)
{
if (policy == SCHED_FIFO || policy == SCHED_RR)
return 1;
return 0;
return policy == SCHED_FIFO || policy == SCHED_RR;
}
static inline int dl_policy(int policy)
{
return policy == SCHED_DEADLINE;
}
static inline int task_has_rt_policy(struct task_struct *p)
@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
return rt_policy(p->policy);
}
static inline int task_has_dl_policy(struct task_struct *p)
{
return dl_policy(p->policy);
}
static inline bool dl_time_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}
/*
* Tells if entity @a should preempt entity @b.
*/
static inline bool
dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
{
return dl_time_before(a->deadline, b->deadline);
}
/*
* This is the priority-queue data structure of the RT scheduling class:
*/
@ -108,6 +144,47 @@ struct rt_bandwidth {
u64 rt_runtime;
struct hrtimer rt_period_timer;
};
/*
* To keep the bandwidth of -deadline tasks and groups under control
* we need some place where:
* - store the maximum -deadline bandwidth of the system (the group);
* - cache the fraction of that bandwidth that is currently allocated.
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
* that, since here we are only interested in admission control, we
* do not decrease any runtime while the group "executes", neither we
* need a timer to replenish it.
*
* With respect to SMP, the bandwidth is given on a per-CPU basis,
* meaning that:
* - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
* - dl_total_bw array contains, in the i-eth element, the currently
* allocated bandwidth on the i-eth CPU.
* Moreover, groups consume bandwidth on each CPU, while tasks only
* consume bandwidth on the CPU they're running on.
* Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
* that will be shown the next time the proc or cgroup controls will
* be red. It on its turn can be changed by writing on its own
* control.
*/
struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
u64 dl_period;
};
static inline int dl_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
}
extern struct dl_bw *dl_bw_of(int i);
struct dl_bw {
raw_spinlock_t lock;
u64 bw, total_bw;
};
extern struct mutex sched_domains_mutex;
@ -364,6 +441,42 @@ struct rt_rq {
#endif
};
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
struct rb_root rb_root;
struct rb_node *rb_leftmost;
unsigned long dl_nr_running;
#ifdef CONFIG_SMP
/*
* Deadline values of the currently executing and the
* earliest ready task on this rq. Caching these facilitates
* the decision wether or not a ready but not running task
* should migrate somewhere else.
*/
struct {
u64 curr;
u64 next;
} earliest_dl;
unsigned long dl_nr_migratory;
unsigned long dl_nr_total;
int overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
struct rb_root pushable_dl_tasks_root;
struct rb_node *pushable_dl_tasks_leftmost;
#else
struct dl_bw dl_bw;
#endif
};
#ifdef CONFIG_SMP
/*
@ -381,6 +494,15 @@ struct root_domain {
cpumask_var_t span;
cpumask_var_t online;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
*/
cpumask_var_t dlo_mask;
atomic_t dlo_count;
struct dl_bw dl_bw;
struct cpudl cpudl;
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@ -432,6 +554,7 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@ -827,8 +950,6 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
@ -988,6 +1109,7 @@ static const u32 prio_to_wmult[40] = {
#else
#define ENQUEUE_WAKING 0
#endif
#define ENQUEUE_REPLENISH 8
#define DEQUEUE_SLEEP 1
@ -1023,6 +1145,7 @@ struct sched_class {
void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
@ -1042,6 +1165,7 @@ struct sched_class {
for (class = sched_class_highest; class; class = class->next)
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
@ -1051,7 +1175,7 @@ extern const struct sched_class idle_sched_class;
extern void update_group_power(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq, int cpu);
extern void trigger_load_balance(struct rq *rq);
extern void idle_balance(int this_cpu, struct rq *this_rq);
extern void idle_enter_fair(struct rq *this_rq);
@ -1068,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq)
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void init_sched_dl_class(void);
extern void resched_task(struct task_struct *p);
extern void resched_cpu(int cpu);
@ -1077,6 +1204,12 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
extern void update_idle_cpu_load(struct rq *this_rq);
extern void init_task_runnable_average(struct task_struct *p);
@ -1353,6 +1486,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);

View File

@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
* Simple, special scheduling class for the per-CPU stop tasks:
*/
const struct sched_class stop_sched_class = {
.next = &rt_sched_class,
.next = &dl_sched_class,
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,

View File

@ -89,7 +89,7 @@ static void wakeup_softirqd(void)
* where hardirqs are disabled legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
static void __local_bh_disable(unsigned long ip, unsigned int cnt)
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@ -107,33 +107,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
/*
* Were softirqs turned off above:
*/
if (softirq_count() == cnt)
if (softirq_count() == (cnt & SOFTIRQ_MASK))
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
if (preempt_count() == cnt)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
#else /* !CONFIG_TRACE_IRQFLAGS */
static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
preempt_count_add(cnt);
barrier();
}
EXPORT_SYMBOL(__local_bh_disable_ip);
#endif /* CONFIG_TRACE_IRQFLAGS */
void local_bh_disable(void)
{
__local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(local_bh_disable);
static void __local_bh_enable(unsigned int cnt)
{
WARN_ON_ONCE(!irqs_disabled());
if (softirq_count() == cnt)
if (softirq_count() == (cnt & SOFTIRQ_MASK))
trace_softirqs_on(_RET_IP_);
preempt_count_sub(cnt);
}
@ -151,7 +139,7 @@ void _local_bh_enable(void)
EXPORT_SYMBOL(_local_bh_enable);
static inline void _local_bh_enable_ip(unsigned long ip)
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
{
WARN_ON_ONCE(in_irq() || irqs_disabled());
#ifdef CONFIG_TRACE_IRQFLAGS
@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
* Keep preemption disabled until we are done with
* softirq processing:
*/
preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
preempt_count_sub(cnt - 1);
if (unlikely(!in_interrupt() && local_softirq_pending())) {
/*
@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
#endif
preempt_check_resched();
}
void local_bh_enable(void)
{
_local_bh_enable_ip(_RET_IP_);
}
EXPORT_SYMBOL(local_bh_enable);
void local_bh_enable_ip(unsigned long ip)
{
_local_bh_enable_ip(ip);
}
EXPORT_SYMBOL(local_bh_enable_ip);
EXPORT_SYMBOL(__local_bh_enable_ip);
/*
* We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
@ -264,7 +241,7 @@ asmlinkage void __do_softirq(void)
pending = local_softirq_pending();
account_irq_enter_time(current);
__local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET);
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
cpu = smp_processor_id();

View File

@ -384,13 +384,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_settle_count",
.data = &sysctl_numa_balancing_settle_count,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_migrate_deferred",
.data = &sysctl_numa_balancing_migrate_deferred,

View File

@ -177,7 +177,7 @@ static bool can_stop_full_tick(void)
* TODO: kick full dynticks CPUs when
* sched_clock_stable is set.
*/
if (!sched_clock_stable) {
if (!sched_clock_stable()) {
trace_tick_stop(0, "unstable sched clock\n");
/*
* Don't allow the user to think they can get

View File

@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
if (unlikely(test_time_stamp(delta))) {
int local_clock_stable = 1;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
local_clock_stable = sched_clock_stable;
local_clock_stable = sched_clock_stable();
#endif
WARN_ONCE(delta > (1ULL << 59),
KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",

View File

@ -16,6 +16,7 @@
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <trace/events/sched.h>
#include "trace.h"
@ -27,6 +28,8 @@ static int wakeup_cpu;
static int wakeup_current_cpu;
static unsigned wakeup_prio = -1;
static int wakeup_rt;
static int wakeup_dl;
static int tracing_dl = 0;
static arch_spinlock_t wakeup_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr)
{
wakeup_cpu = -1;
wakeup_prio = -1;
tracing_dl = 0;
if (wakeup_task)
put_task_struct(wakeup_task);
@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
tracing_record_cmdline(p);
tracing_record_cmdline(current);
if ((wakeup_rt && !rt_task(p)) ||
p->prio >= wakeup_prio ||
p->prio >= current->prio)
/*
* Semantic is like this:
* - wakeup tracer handles all tasks in the system, independently
* from their scheduling class;
* - wakeup_rt tracer handles tasks belonging to sched_dl and
* sched_rt class;
* - wakeup_dl handles tasks belonging to sched_dl class only.
*/
if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
(wakeup_rt && !dl_task(p) && !rt_task(p)) ||
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
pc = preempt_count();
@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
arch_spin_lock(&wakeup_lock);
/* check for races. */
if (!tracer_enabled || p->prio >= wakeup_prio)
if (!tracer_enabled || tracing_dl ||
(!dl_task(p) && p->prio >= wakeup_prio))
goto out_locked;
/* reset the trace */
@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
wakeup_current_cpu = wakeup_cpu;
wakeup_prio = p->prio;
/*
* Once you start tracing a -deadline task, don't bother tracing
* another task until the first one wakes up.
*/
if (dl_task(p))
tracing_dl = 1;
else
tracing_dl = 0;
wakeup_task = p;
get_task_struct(wakeup_task);
@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr)
static int wakeup_tracer_init(struct trace_array *tr)
{
wakeup_dl = 0;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
}
static int wakeup_rt_tracer_init(struct trace_array *tr)
{
wakeup_dl = 0;
wakeup_rt = 1;
return __wakeup_tracer_init(tr);
}
static int wakeup_dl_tracer_init(struct trace_array *tr)
{
wakeup_dl = 1;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
}
static void wakeup_tracer_reset(struct trace_array *tr)
{
int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.use_max_tr = true,
};
static struct tracer wakeup_dl_tracer __read_mostly =
{
.name = "wakeup_dl",
.init = wakeup_dl_tracer_init,
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
.wait_pipe = poll_wait_pipe,
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
.use_max_tr = true,
};
__init static int init_wakeup_tracer(void)
{
int ret;
@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void)
if (ret)
return ret;
ret = register_tracer(&wakeup_dl_tracer);
if (ret)
return ret;
return 0;
}
core_initcall(init_wakeup_tracer);

View File

@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
#ifdef CONFIG_SCHED_TRACER
static int trace_wakeup_test_thread(void *data)
{
/* Make this a RT thread, doesn't need to be too high */
static const struct sched_param param = { .sched_priority = 5 };
/* Make this a -deadline thread */
static const struct sched_attr attr = {
.sched_policy = SCHED_DEADLINE,
.sched_runtime = 100000ULL,
.sched_deadline = 10000000ULL,
.sched_period = 10000000ULL
};
struct completion *x = data;
sched_setscheduler(current, SCHED_FIFO, &param);
sched_setattr(current, &attr);
/* Make it know we have a new prio */
complete(x);
@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data)
/* we are awake, now wait to disappear */
while (!kthread_should_stop()) {
/*
* This is an RT task, do short sleeps to let
* others run.
* This will likely be the system top priority
* task, do short sleeps to let others run.
*/
msleep(100);
}
@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tracing_max_latency;
struct task_struct *p;
struct completion isrt;
struct completion is_ready;
unsigned long count;
int ret;
init_completion(&isrt);
init_completion(&is_ready);
/* create a high prio thread */
p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
/* create a -deadline thread */
p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
if (IS_ERR(p)) {
printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
return -1;
}
/* make sure the thread is running at an RT prio */
wait_for_completion(&isrt);
/* make sure the thread is running at -deadline policy */
wait_for_completion(&is_ready);
/* start the tracing */
ret = tracer_init(trace, tr);
@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
while (p->on_rq) {
/*
* Sleep to make sure the RT thread is asleep too.
* Sleep to make sure the -deadline thread is asleep too.
* On virtual machines we can't rely on timings,
* but we want to make sure this test still works.
*/
msleep(100);
}
init_completion(&isrt);
init_completion(&is_ready);
wake_up_process(p);
/* Wait for the task to wake up */
wait_for_completion(&isrt);
wait_for_completion(&is_ready);
/* stop the tracing. */
tracing_stop();

View File

@ -1623,11 +1623,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
(len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
!sysctl_tcp_low_latency &&
net_dma_find_channel()) {
preempt_enable_no_resched();
preempt_enable();
tp->ucopy.pinned_list =
dma_pin_iovec_pages(msg->msg_iov, len);
} else {
preempt_enable_no_resched();
preempt_enable();
}
}
#endif