Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:

 - Add the initial implementation of SCHED_DEADLINE support: a
   real-time scheduling policy where tasks that meet their deadlines
   and periodically execute their instances in less than their runtime
   quota see real-time scheduling and won't miss any of their
   deadlines.  Tasks that go over their quota get delayed (Available
   to privileged users for now)

 - Clean up and fix preempt_enable_no_resched() abuse all around the tree

 - Do sched_clock() performance optimizations on x86 and elsewhere

 - Fix and improve auto-NUMA balancing

 - Fix and clean up the idle loop

 - Apply various cleanups and fixes

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  sched: Fix __sched_setscheduler() nice test
  sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
  sched: Fix up attr::sched_priority warning
  sched: Fix up scheduler syscall LTP fails
  sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
  sched/core: Fix htmldocs warnings
  sched/deadline: No need to check p if dl_se is valid
  sched/deadline: Remove unused variables
  sched/deadline: Fix sparse static warnings
  m68k: Fix build warning in mac_via.h
  sched, thermal: Clean up preempt_enable_no_resched() abuse
  sched, net: Fixup busy_loop_us_clock()
  sched, net: Clean up preempt_enable_no_resched() abuse
  sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
  sched/preempt, locking: Rework local_bh_{dis,en}able()
  sched/clock, x86: Avoid a runtime condition in native_sched_clock()
  sched/clock: Fix up clear_sched_clock_stable()
  sched/clock, x86: Use a static_key for sched_clock_stable
  sched/clock: Remove local_irq_disable() from the clocks
  sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
  ...
commit a0fa1dd3cd
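For context, a minimal user-space sketch of how the new sched_setattr() interface added by this series might be invoked. The syscall numbers (314 on x86-64, 380 on ARM) and the sched_attr field layout are taken from the hunks below; the wrapper name, the 10/30/100 ms parameters and the use of pid 0 for "the calling task" are illustrative assumptions, not part of the patch, and the caller needs privileges for now.

/* Illustrative only: raw syscall use of the new SCHED_DEADLINE interface. */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_example {          /* mirrors struct sched_attr from the diff */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;      /* all times in nanoseconds */
        uint64_t sched_deadline;
        uint64_t sched_period;
};

#define EXAMPLE_SCHED_DEADLINE 6     /* SCHED_DEADLINE from uapi/linux/sched.h below */

static int set_deadline_self(void)
{
        struct sched_attr_example attr;

        memset(&attr, 0, sizeof(attr));
        attr.size           = sizeof(attr);
        attr.sched_policy   = EXAMPLE_SCHED_DEADLINE;
        attr.sched_runtime  = 10 * 1000 * 1000;   /* 10 ms of CPU time ...      */
        attr.sched_deadline = 30 * 1000 * 1000;   /* ... within 30 ms ...       */
        attr.sched_period   = 100 * 1000 * 1000;  /* ... every 100 ms (example) */

        /* 314 == __NR_sched_setattr on x86-64 in this series; pid 0 == self */
        return (int)syscall(314, 0, &attr);
}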
@@ -428,11 +428,6 @@ rate for each task.
numa_balancing_scan_size_mb is how many megabytes worth of pages are
scanned for a given scan.

numa_balancing_settle_count is how many scan periods must complete before
the schedule balancer stops pushing the task towards a preferred node. This
gives the scheduler a chance to place the task on an alternative node if the
preferred node is overloaded.

numa_balancing_migrate_deferred is how many page migrations get skipped
unconditionally, after a page migration is skipped because a page is shared
with other tasks. This reduces page migration overhead, and determines

@@ -15,7 +15,7 @@
#include <uapi/asm/unistd.h>

#define __NR_syscalls (380)
#define __NR_syscalls (384)
#define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0)

#define __ARCH_WANT_STAT64

@@ -406,6 +406,8 @@
#define __NR_process_vm_writev (__NR_SYSCALL_BASE+377)
#define __NR_kcmp (__NR_SYSCALL_BASE+378)
#define __NR_finit_module (__NR_SYSCALL_BASE+379)
#define __NR_sched_setattr (__NR_SYSCALL_BASE+380)
#define __NR_sched_getattr (__NR_SYSCALL_BASE+381)

/*
 * This may need to be greater than __NR_last_syscall+1 in order to

@@ -389,6 +389,8 @@
        CALL(sys_process_vm_writev)
        CALL(sys_kcmp)
        CALL(sys_finit_module)
/* 380 */ CALL(sys_sched_setattr)
        CALL(sys_sched_getattr)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted

@@ -254,6 +254,8 @@
extern volatile __u8 *via1,*via2;
extern int rbv_present,via_alt_mapping;

struct irq_desc;

extern void via_register_interrupts(void);
extern void via_irq_enable(int);
extern void via_irq_disable(int);

@@ -1,6 +1,8 @@
#ifndef _ASM_X86_MWAIT_H
#define _ASM_X86_MWAIT_H

#include <linux/sched.h>

#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
#define MWAIT_SUBSTATE_SIZE 4

@@ -13,4 +15,45 @@

#define MWAIT_ECX_INTERRUPT_BREAK 0x1

static inline void __monitor(const void *eax, unsigned long ecx,
                             unsigned long edx)
{
        /* "monitor %eax, %ecx, %edx;" */
        asm volatile(".byte 0x0f, 0x01, 0xc8;"
                     :: "a" (eax), "c" (ecx), "d"(edx));
}

static inline void __mwait(unsigned long eax, unsigned long ecx)
{
        /* "mwait %eax, %ecx;" */
        asm volatile(".byte 0x0f, 0x01, 0xc9;"
                     :: "a" (eax), "c" (ecx));
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
        if (!current_set_polling_and_test()) {
                if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
                        mb();
                        clflush((void *)&current_thread_info()->flags);
                        mb();
                }

                __monitor((void *)&current_thread_info()->flags, 0, 0);
                if (!need_resched())
                        __mwait(eax, ecx);
        }
        current_clr_polling();
}

#endif /* _ASM_X86_MWAIT_H */

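Several drivers later in this merge (intel_idle, acpi_pad, intel_powerclamp) are converted from open-coded __monitor()/__mwait() sequences to this helper. A hedged sketch of what such a call site reduces to; the function name is invented and the eax hint is left symbolic since it is driver-specific, while MWAIT_ECX_INTERRUPT_BREAK comes from the header above.

/* Sketch, not code from the diff: a driver idle path after conversion. */
#include <asm/mwait.h>

static void example_driver_idle(unsigned long cstate_hint)
{
        /*
         * The helper performs the CLFLUSH errata workaround, arms
         * MONITOR on current_thread_info()->flags, re-checks
         * need_resched() and only then executes MWAIT, so the caller
         * no longer needs the polling / need_resched() boilerplate.
         */
        mwait_idle_with_hints(cstate_hint, MWAIT_ECX_INTERRUPT_BREAK);
}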
@ -700,29 +700,6 @@ static inline void sync_core(void)
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void __monitor(const void *eax, unsigned long ecx,
|
||||
unsigned long edx)
|
||||
{
|
||||
/* "monitor %eax, %ecx, %edx;" */
|
||||
asm volatile(".byte 0x0f, 0x01, 0xc8;"
|
||||
:: "a" (eax), "c" (ecx), "d"(edx));
|
||||
}
|
||||
|
||||
static inline void __mwait(unsigned long eax, unsigned long ecx)
|
||||
{
|
||||
/* "mwait %eax, %ecx;" */
|
||||
asm volatile(".byte 0x0f, 0x01, 0xc9;"
|
||||
:: "a" (eax), "c" (ecx));
|
||||
}
|
||||
|
||||
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
|
||||
{
|
||||
trace_hardirqs_on();
|
||||
/* "mwait %eax, %ecx;" */
|
||||
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
|
||||
:: "a" (eax), "c" (ecx));
|
||||
}
|
||||
|
||||
extern void select_idle_routine(const struct cpuinfo_x86 *c);
|
||||
extern void init_amd_e400_c1e_mask(void);
|
||||
|
||||
|
@@ -4,6 +4,7 @@
#include <linux/pm.h>
#include <linux/percpu.h>
#include <linux/interrupt.h>
#include <linux/math64.h>

#define TICK_SIZE (tick_nsec / 1000)

@@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void);

extern int no_timer_check;

/* Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 * basic equation:
 * ns = cycles / (freq / ns_per_sec)
 * ns = cycles * (ns_per_sec / freq)
 * ns = cycles * (10^9 / (cpu_khz * 10^3))
 * ns = cycles * (10^6 / cpu_khz)
/*
 * We use the full linear equation: f(x) = a + b*x, in order to allow
 * a continuous function in the face of dynamic freq changes.
 *
 * Then we use scaling math (suggested by george@mvista.com) to get:
 * ns = cycles * (10^6 * SC / cpu_khz) / SC
 * ns = cycles * cyc2ns_scale / SC
 * Continuity means that when our frequency changes our slope (b); we want to
 * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
 *
 * And since SC is a constant power of two, we can convert the div
 * into a shift.
 * Without an offset (a) the above would not be possible.
 *
 * We can use khz divisor instead of mhz to keep a better precision, since
 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
 * (mathieu.desnoyers@polymtl.ca)
 *
 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
 *
 * In:
 *
 * ns = cycles * cyc2ns_scale / SC
 *
 * Although we may still have enough bits to store the value of ns,
 * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
 * leading to an incorrect result.
 *
 * To avoid this, we can decompose 'cycles' into quotient and remainder
 * of division by SC. Then,
 *
 * ns = (quot * SC + rem) * cyc2ns_scale / SC
 *    = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
 *
 * - sqazi@google.com
 * See the comment near cycles_2_ns() for details on how we compute (b).
 */
struct cyc2ns_data {
        u32 cyc2ns_mul;
        u32 cyc2ns_shift;
        u64 cyc2ns_offset;
        u32 __count;
        /* u32 hole */
}; /* 24 bytes -- do not grow */

DECLARE_PER_CPU(unsigned long, cyc2ns);
DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);

#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */

static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
        int cpu = smp_processor_id();
        unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
        ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
                        (1UL << CYC2NS_SCALE_FACTOR));
        return ns;
}

static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
        unsigned long long ns;
        unsigned long flags;

        local_irq_save(flags);
        ns = __cycles_2_ns(cyc);
        local_irq_restore(flags);

        return ns;
}
extern struct cyc2ns_data *cyc2ns_read_begin(void);
extern void cyc2ns_read_end(struct cyc2ns_data *);

#endif /* _ASM_X86_TIMER_H */

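A small sketch (not part of the patch) of the continuity condition the comment above describes: when the slope b changes at TSC value tsc_now, the new offset a' is chosen so that a + b*tsc_now == a' + b'*tsc_now. The formula mirrors what set_cyc2ns_scale() in the tsc.c hunk further down computes with mul_u64_u32_shr(); the names below are local to this example, and __uint128_t stands in for the kernel's 64x32->96 bit helper.

/* Hedged illustration of the cyc2ns offset computation. */
#include <stdint.h>

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mul, uint32_t shift, uint64_t offset)
{
        return offset + (uint64_t)(((__uint128_t)cyc * mul) >> shift);
}

/* On a frequency change, keep f(tsc_now) continuous: */
static uint64_t new_offset(uint64_t tsc_now, uint64_t ns_now,
                           uint32_t new_mul, uint32_t shift)
{
        return ns_now - (uint64_t)(((__uint128_t)tsc_now * new_mul) >> shift);
}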
@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
|
||||
|
||||
/*
|
||||
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
|
||||
* which can obviate IPI to trigger checking of need_resched.
|
||||
* We execute MONITOR against need_resched and enter optimized wait state
|
||||
* through MWAIT. Whenever someone changes need_resched, we would be woken
|
||||
* up from MWAIT (without an IPI).
|
||||
*
|
||||
* New with Core Duo processors, MWAIT can take some hints based on CPU
|
||||
* capability.
|
||||
*/
|
||||
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
|
||||
{
|
||||
if (!need_resched()) {
|
||||
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
|
||||
clflush((void *)¤t_thread_info()->flags);
|
||||
|
||||
__monitor((void *)¤t_thread_info()->flags, 0, 0);
|
||||
smp_mb();
|
||||
if (!need_resched())
|
||||
__mwait(ax, cx);
|
||||
}
|
||||
}
|
||||
|
||||
void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
|
||||
{
|
||||
unsigned int cpu = smp_processor_id();
|
||||
|
@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
|
||||
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
|
||||
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
|
||||
if (!check_tsc_unstable())
|
||||
sched_clock_stable = 1;
|
||||
set_sched_clock_stable();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
|
@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
|
||||
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
|
||||
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
|
||||
if (!check_tsc_unstable())
|
||||
sched_clock_stable = 1;
|
||||
set_sched_clock_stable();
|
||||
}
|
||||
|
||||
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
|
||||
|
@ -1883,21 +1883,27 @@ static struct pmu pmu = {
|
||||
|
||||
void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
|
||||
{
|
||||
struct cyc2ns_data *data;
|
||||
|
||||
userpg->cap_user_time = 0;
|
||||
userpg->cap_user_time_zero = 0;
|
||||
userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
|
||||
userpg->pmc_width = x86_pmu.cntval_bits;
|
||||
|
||||
if (!sched_clock_stable)
|
||||
if (!sched_clock_stable())
|
||||
return;
|
||||
|
||||
data = cyc2ns_read_begin();
|
||||
|
||||
userpg->cap_user_time = 1;
|
||||
userpg->time_mult = this_cpu_read(cyc2ns);
|
||||
userpg->time_shift = CYC2NS_SCALE_FACTOR;
|
||||
userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
|
||||
userpg->time_mult = data->cyc2ns_mul;
|
||||
userpg->time_shift = data->cyc2ns_shift;
|
||||
userpg->time_offset = data->cyc2ns_offset - now;
|
||||
|
||||
userpg->cap_user_time_zero = 1;
|
||||
userpg->time_zero = this_cpu_read(cyc2ns_offset);
|
||||
userpg->time_zero = data->cyc2ns_offset;
|
||||
|
||||
cyc2ns_read_end(data);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void)
|
||||
* The WBINVD is insufficient due to the spurious-wakeup
|
||||
* case where we return around the loop.
|
||||
*/
|
||||
mb();
|
||||
clflush(mwait_ptr);
|
||||
mb();
|
||||
__monitor(mwait_ptr, 0, 0);
|
||||
mb();
|
||||
__mwait(eax, 0);
|
||||
|
@@ -11,6 +11,7 @@
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>

#include <asm/hpet.h>
#include <asm/timer.h>

@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;

static struct static_key __use_tsc = STATIC_KEY_INIT;

int tsc_clocksource_reliable;

/*
 * Use a ring-buffer like data structure, where a writer advances the head by
 * writing a new data entry and a reader advances the tail when it observes a
 * new entry.
 *
 * Writers are made to wait on readers until there's space to write a new
 * entry.
 *
 * This means that we can always use an {offset, mul} pair to compute a ns
 * value that is 'roughly' in the right direction, even if we're writing a new
 * {offset, mul} pair during the clock read.
 *
 * The down-side is that we can no longer guarantee strict monotonicity anymore
 * (assuming the TSC was that to begin with), because while we compute the
 * intersection point of the two clock slopes and make sure the time is
 * continuous at the point of switching; we can no longer guarantee a reader is
 * strictly before or after the switch point.
 *
 * It does mean a reader no longer needs to disable IRQs in order to avoid
 * CPU-Freq updates messing with his times, and similarly an NMI reader will
 * no longer run the risk of hitting half-written state.
 */

struct cyc2ns {
        struct cyc2ns_data data[2];     /*  0 + 2*24 = 48 */
        struct cyc2ns_data *head;       /* 48 + 8    = 56 */
        struct cyc2ns_data *tail;       /* 56 + 8    = 64 */
}; /* exactly fits one cacheline */

static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);

struct cyc2ns_data *cyc2ns_read_begin(void)
{
        struct cyc2ns_data *head;

        preempt_disable();

        head = this_cpu_read(cyc2ns.head);
        /*
         * Ensure we observe the entry when we observe the pointer to it.
         * matches the wmb from cyc2ns_write_end().
         */
        smp_read_barrier_depends();
        head->__count++;
        barrier();

        return head;
}

void cyc2ns_read_end(struct cyc2ns_data *head)
{
        barrier();
        /*
         * If we're the outer most nested read; update the tail pointer
         * when we're done. This notifies possible pending writers
         * that we've observed the head pointer and that the other
         * entry is now free.
         */
        if (!--head->__count) {
                /*
                 * x86-TSO does not reorder writes with older reads;
                 * therefore once this write becomes visible to another
                 * cpu, we must be finished reading the cyc2ns_data.
                 *
                 * matches with cyc2ns_write_begin().
                 */
                this_cpu_write(cyc2ns.tail, head);
        }
        preempt_enable();
}

/*
 * Begin writing a new @data entry for @cpu.
 *
 * Assumes some sort of write side lock; currently 'provided' by the assumption
 * that cpufreq will call its notifiers sequentially.
 */
static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
{
        struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
        struct cyc2ns_data *data = c2n->data;

        if (data == c2n->head)
                data++;

        /* XXX send an IPI to @cpu in order to guarantee a read? */

        /*
         * When we observe the tail write from cyc2ns_read_end(),
         * the cpu must be done with that entry and its safe
         * to start writing to it.
         */
        while (c2n->tail == data)
                cpu_relax();

        return data;
}

static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
{
        struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);

        /*
         * Ensure the @data writes are visible before we publish the
         * entry. Matches the data-depencency in cyc2ns_read_begin().
         */
        smp_wmb();

        ACCESS_ONCE(c2n->head) = data;
}

/*
 * Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 * basic equation:
 * ns = cycles / (freq / ns_per_sec)
 * ns = cycles * (ns_per_sec / freq)
 * ns = cycles * (10^9 / (cpu_khz * 10^3))
 * ns = cycles * (10^6 / cpu_khz)
 *
 * Then we use scaling math (suggested by george@mvista.com) to get:
 * ns = cycles * (10^6 * SC / cpu_khz) / SC
 * ns = cycles * cyc2ns_scale / SC
 *
 * And since SC is a constant power of two, we can convert the div
 * into a shift.
 *
 * We can use khz divisor instead of mhz to keep a better precision, since
 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
 * (mathieu.desnoyers@polymtl.ca)
 *
 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */

#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */

static void cyc2ns_data_init(struct cyc2ns_data *data)
|
||||
{
|
||||
data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
|
||||
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
|
||||
data->cyc2ns_offset = 0;
|
||||
data->__count = 0;
|
||||
}
|
||||
|
||||
static void cyc2ns_init(int cpu)
|
||||
{
|
||||
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
|
||||
|
||||
cyc2ns_data_init(&c2n->data[0]);
|
||||
cyc2ns_data_init(&c2n->data[1]);
|
||||
|
||||
c2n->head = c2n->data;
|
||||
c2n->tail = c2n->data;
|
||||
}
|
||||
|
||||
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
|
||||
{
|
||||
struct cyc2ns_data *data, *tail;
|
||||
unsigned long long ns;
|
||||
|
||||
/*
|
||||
* See cyc2ns_read_*() for details; replicated in order to avoid
|
||||
* an extra few instructions that came with the abstraction.
|
||||
* Notable, it allows us to only do the __count and tail update
|
||||
* dance when its actually needed.
|
||||
*/
|
||||
|
||||
preempt_disable();
|
||||
data = this_cpu_read(cyc2ns.head);
|
||||
tail = this_cpu_read(cyc2ns.tail);
|
||||
|
||||
if (likely(data == tail)) {
|
||||
ns = data->cyc2ns_offset;
|
||||
ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
|
||||
} else {
|
||||
data->__count++;
|
||||
|
||||
barrier();
|
||||
|
||||
ns = data->cyc2ns_offset;
|
||||
ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
|
||||
|
||||
barrier();
|
||||
|
||||
if (!--data->__count)
|
||||
this_cpu_write(cyc2ns.tail, data);
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
return ns;
|
||||
}
|
||||
|
||||
/* XXX surely we already have this someplace in the kernel?! */
|
||||
#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
|
||||
|
||||
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
|
||||
{
|
||||
unsigned long long tsc_now, ns_now;
|
||||
struct cyc2ns_data *data;
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
sched_clock_idle_sleep_event();
|
||||
|
||||
if (!cpu_khz)
|
||||
goto done;
|
||||
|
||||
data = cyc2ns_write_begin(cpu);
|
||||
|
||||
rdtscll(tsc_now);
|
||||
ns_now = cycles_2_ns(tsc_now);
|
||||
|
||||
/*
|
||||
* Compute a new multiplier as per the above comment and ensure our
|
||||
* time function is continuous; see the comment near struct
|
||||
* cyc2ns_data.
|
||||
*/
|
||||
data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
|
||||
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
|
||||
data->cyc2ns_offset = ns_now -
|
||||
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
|
||||
|
||||
cyc2ns_write_end(cpu, data);
|
||||
|
||||
done:
|
||||
sched_clock_idle_wakeup_event(0);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
/*
|
||||
* Scheduler clock - returns current time in nanosec units.
|
||||
*/
|
||||
u64 native_sched_clock(void)
|
||||
{
|
||||
u64 this_offset;
|
||||
u64 tsc_now;
|
||||
|
||||
/*
|
||||
* Fall back to jiffies if there's no TSC available:
|
||||
@ -53,16 +285,16 @@ u64 native_sched_clock(void)
|
||||
* very important for it to be as fast as the platform
|
||||
* can achieve it. )
|
||||
*/
|
||||
if (unlikely(tsc_disabled)) {
|
||||
if (!static_key_false(&__use_tsc)) {
|
||||
/* No locking but a rare wrong value is not a big deal: */
|
||||
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
|
||||
}
|
||||
|
||||
/* read the Time Stamp Counter: */
|
||||
rdtscll(this_offset);
|
||||
rdtscll(tsc_now);
|
||||
|
||||
/* return the value in ns */
|
||||
return __cycles_2_ns(this_offset);
|
||||
return cycles_2_ns(tsc_now);
|
||||
}
|
||||
|
||||
/* We need to define a real function for sched_clock, to override the
|
||||
@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void)
|
||||
EXPORT_SYMBOL(recalibrate_cpu_khz);
|
||||
|
||||
|
||||
/* Accelerators for sched_clock()
|
||||
* convert from cycles(64bits) => nanoseconds (64bits)
|
||||
* basic equation:
|
||||
* ns = cycles / (freq / ns_per_sec)
|
||||
* ns = cycles * (ns_per_sec / freq)
|
||||
* ns = cycles * (10^9 / (cpu_khz * 10^3))
|
||||
* ns = cycles * (10^6 / cpu_khz)
|
||||
*
|
||||
* Then we use scaling math (suggested by george@mvista.com) to get:
|
||||
* ns = cycles * (10^6 * SC / cpu_khz) / SC
|
||||
* ns = cycles * cyc2ns_scale / SC
|
||||
*
|
||||
* And since SC is a constant power of two, we can convert the div
|
||||
* into a shift.
|
||||
*
|
||||
* We can use khz divisor instead of mhz to keep a better precision, since
|
||||
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
|
||||
* (mathieu.desnoyers@polymtl.ca)
|
||||
*
|
||||
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
|
||||
*/
|
||||
|
||||
DEFINE_PER_CPU(unsigned long, cyc2ns);
|
||||
DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
|
||||
|
||||
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
|
||||
{
|
||||
unsigned long long tsc_now, ns_now, *offset;
|
||||
unsigned long flags, *scale;
|
||||
|
||||
local_irq_save(flags);
|
||||
sched_clock_idle_sleep_event();
|
||||
|
||||
scale = &per_cpu(cyc2ns, cpu);
|
||||
offset = &per_cpu(cyc2ns_offset, cpu);
|
||||
|
||||
rdtscll(tsc_now);
|
||||
ns_now = __cycles_2_ns(tsc_now);
|
||||
|
||||
if (cpu_khz) {
|
||||
*scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
|
||||
cpu_khz / 2) / cpu_khz;
|
||||
*offset = ns_now - mult_frac(tsc_now, *scale,
|
||||
(1UL << CYC2NS_SCALE_FACTOR));
|
||||
}
|
||||
|
||||
sched_clock_idle_wakeup_event(0);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
static unsigned long long cyc2ns_suspend;
|
||||
|
||||
void tsc_save_sched_clock_state(void)
|
||||
{
|
||||
if (!sched_clock_stable)
|
||||
if (!sched_clock_stable())
|
||||
return;
|
||||
|
||||
cyc2ns_suspend = sched_clock();
|
||||
@ -663,16 +845,26 @@ void tsc_restore_sched_clock_state(void)
|
||||
unsigned long flags;
|
||||
int cpu;
|
||||
|
||||
if (!sched_clock_stable)
|
||||
if (!sched_clock_stable())
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
__this_cpu_write(cyc2ns_offset, 0);
|
||||
/*
|
||||
* We're comming out of suspend, there's no concurrency yet; don't
|
||||
* bother being nice about the RCU stuff, just write to both
|
||||
* data fields.
|
||||
*/
|
||||
|
||||
this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
|
||||
this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
|
||||
|
||||
offset = cyc2ns_suspend - sched_clock();
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
per_cpu(cyc2ns_offset, cpu) = offset;
|
||||
for_each_possible_cpu(cpu) {
|
||||
per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
|
||||
per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
|
||||
}
|
||||
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
@ -795,7 +987,7 @@ void mark_tsc_unstable(char *reason)
|
||||
{
|
||||
if (!tsc_unstable) {
|
||||
tsc_unstable = 1;
|
||||
sched_clock_stable = 0;
|
||||
clear_sched_clock_stable();
|
||||
disable_sched_clock_irqtime();
|
||||
pr_info("Marking TSC unstable due to %s\n", reason);
|
||||
/* Change only the rating, when not registered */
|
||||
@ -995,14 +1187,18 @@ void __init tsc_init(void)
|
||||
* speed as the bootup CPU. (cpufreq notifiers will fix this
|
||||
* up if their speed diverges)
|
||||
*/
|
||||
for_each_possible_cpu(cpu)
|
||||
for_each_possible_cpu(cpu) {
|
||||
cyc2ns_init(cpu);
|
||||
set_cyc2ns_scale(cpu_khz, cpu);
|
||||
}
|
||||
|
||||
if (tsc_disabled > 0)
|
||||
return;
|
||||
|
||||
/* now allow native_sched_clock() to use rdtsc */
|
||||
|
||||
tsc_disabled = 0;
|
||||
static_key_slow_inc(&__use_tsc);
|
||||
|
||||
if (!no_sched_irq_time)
|
||||
enable_sched_clock_irqtime();
|
||||
|
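A quick worked check of the cyc2ns scaling described in the "Accelerators for sched_clock()" comment above, with illustrative numbers that are not from the patch: for a 2 GHz TSC (cpu_khz = 2,000,000) and SC = 2^10, the multiplier is DIV_ROUND(10^6 << 10, 2,000,000) = 512, so ns = cyc * 512 >> 10 = cyc / 2, i.e. 0.5 ns per cycle, comfortably inside the 32-bit multiplier.

/* Illustration only: verify the cyc2ns scaling for an example 2 GHz TSC. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t cpu_khz = 2000000;                                   /* 2 GHz, example */
        uint32_t mul = ((1000000ULL << 10) + cpu_khz / 2) / cpu_khz;  /* DIV_ROUND      */
        uint64_t cycles = 3000000000ULL;                              /* 1.5 s of cycles */

        printf("mul=%u ns=%llu\n", mul,
               (unsigned long long)(((__uint128_t)cycles * mul) >> 10));
        /* prints: mul=512 ns=1500000000 */
        return 0;
}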
@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
|
||||
* number, not an absolute. It converts a duration in cycles to a duration in
|
||||
* ns.
|
||||
*/
|
||||
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
|
||||
{
|
||||
struct cyc2ns_data *data = cyc2ns_read_begin();
|
||||
unsigned long long ns;
|
||||
|
||||
ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
|
||||
|
||||
cyc2ns_read_end(data);
|
||||
return ns;
|
||||
}
|
||||
|
||||
/*
|
||||
* The reverse of the above; converts a duration in ns to a duration in cycles.
|
||||
*/
|
||||
static inline unsigned long long ns_2_cycles(unsigned long long ns)
|
||||
{
|
||||
struct cyc2ns_data *data = cyc2ns_read_begin();
|
||||
unsigned long long cyc;
|
||||
|
||||
cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
|
||||
|
||||
cyc2ns_read_end(data);
|
||||
return cyc;
|
||||
}
|
||||
|
||||
static inline unsigned long cycles_2_us(unsigned long long cyc)
|
||||
{
|
||||
unsigned long long ns;
|
||||
unsigned long us;
|
||||
int cpu = smp_processor_id();
|
||||
return cycles_2_ns(cyc) / NSEC_PER_USEC;
|
||||
}
|
||||
|
||||
ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
|
||||
us = ns / 1000;
|
||||
return us;
|
||||
static inline cycles_t sec_2_cycles(unsigned long sec)
|
||||
{
|
||||
return ns_2_cycles(sec * NSEC_PER_SEC);
|
||||
}
|
||||
|
||||
static inline unsigned long long usec_2_cycles(unsigned long usec)
|
||||
{
|
||||
return ns_2_cycles(usec * NSEC_PER_USEC);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc,
|
||||
bcp, try);
|
||||
}
|
||||
|
||||
static inline cycles_t sec_2_cycles(unsigned long sec)
|
||||
{
|
||||
unsigned long ns;
|
||||
cycles_t cyc;
|
||||
|
||||
ns = sec * 1000000000;
|
||||
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
|
||||
return cyc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Our retries are blocked by all destination sw ack resources being
|
||||
* in use, and a timeout is pending. In that case hardware immediately
|
||||
@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data)
|
||||
{
|
||||
}
|
||||
|
||||
static inline unsigned long long usec_2_cycles(unsigned long microsec)
|
||||
{
|
||||
unsigned long ns;
|
||||
unsigned long long cyc;
|
||||
|
||||
ns = microsec * 1000;
|
||||
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
|
||||
return cyc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Display the statistics thru /proc/sgi_uv/ptc_statistics
|
||||
* 'data' points to the cpu number
|
||||
|
@@ -357,3 +357,5 @@
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
349 i386 kcmp sys_kcmp
350 i386 finit_module sys_finit_module
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr

@@ -320,6 +320,8 @@
311 64 process_vm_writev sys_process_vm_writev
312 common kcmp sys_kcmp
313 common finit_module sys_finit_module
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr

#
# x32-specific system call numbers start at 512 to avoid cache impact

@ -193,10 +193,7 @@ static int power_saving_thread(void *data)
|
||||
CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
|
||||
stop_critical_timings();
|
||||
|
||||
__monitor((void *)¤t_thread_info()->flags, 0, 0);
|
||||
smp_mb();
|
||||
if (!need_resched())
|
||||
__mwait(power_saving_mwait_eax, 1);
|
||||
mwait_idle_with_hints(power_saving_mwait_eax, 1);
|
||||
|
||||
start_critical_timings();
|
||||
if (lapic_marked_unstable)
|
||||
|
@ -727,11 +727,6 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
|
||||
if (unlikely(!pr))
|
||||
return -EINVAL;
|
||||
|
||||
if (cx->entry_method == ACPI_CSTATE_FFH) {
|
||||
if (current_set_polling_and_test())
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
lapic_timer_state_broadcast(pr, cx, 1);
|
||||
acpi_idle_do_entry(cx);
|
||||
|
||||
@ -785,11 +780,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
|
||||
if (unlikely(!pr))
|
||||
return -EINVAL;
|
||||
|
||||
if (cx->entry_method == ACPI_CSTATE_FFH) {
|
||||
if (current_set_polling_and_test())
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Must be done before busmaster disable as we might need to
|
||||
* access HPET !
|
||||
@ -841,11 +831,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
|
||||
}
|
||||
}
|
||||
|
||||
if (cx->entry_method == ACPI_CSTATE_FFH) {
|
||||
if (current_set_polling_and_test())
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
acpi_unlazy_tlb(smp_processor_id());
|
||||
|
||||
/* Tell the scheduler that we are going deep-idle: */
|
||||
|
@ -377,16 +377,7 @@ static int intel_idle(struct cpuidle_device *dev,
|
||||
if (!(lapic_timer_reliable_states & (1 << (cstate))))
|
||||
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
|
||||
|
||||
if (!current_set_polling_and_test()) {
|
||||
|
||||
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
|
||||
clflush((void *)¤t_thread_info()->flags);
|
||||
|
||||
__monitor((void *)¤t_thread_info()->flags, 0, 0);
|
||||
smp_mb();
|
||||
if (!need_resched())
|
||||
__mwait(eax, ecx);
|
||||
}
|
||||
mwait_idle_with_hints(eax, ecx);
|
||||
|
||||
if (!(lapic_timer_reliable_states & (1 << (cstate))))
|
||||
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
|
||||
|
@ -438,14 +438,12 @@ static int clamp_thread(void *arg)
|
||||
*/
|
||||
local_touch_nmi();
|
||||
stop_critical_timings();
|
||||
__monitor((void *)¤t_thread_info()->flags, 0, 0);
|
||||
cpu_relax(); /* allow HT sibling to run */
|
||||
__mwait(eax, ecx);
|
||||
mwait_idle_with_hints(eax, ecx);
|
||||
start_critical_timings();
|
||||
atomic_inc(&idle_wakeup_counter);
|
||||
}
|
||||
tick_nohz_idle_exit();
|
||||
preempt_enable_no_resched();
|
||||
preempt_enable();
|
||||
}
|
||||
del_timer_sync(&wakeup_timer);
|
||||
clear_bit(cpunr, cpu_clamping_mask);
|
||||
|
@@ -1,9 +1,35 @@
#ifndef _LINUX_BH_H
#define _LINUX_BH_H

extern void local_bh_disable(void);
#include <linux/preempt.h>
#include <linux/preempt_mask.h>

#ifdef CONFIG_TRACE_IRQFLAGS
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
        preempt_count_add(cnt);
        barrier();
}
#endif

static inline void local_bh_disable(void)
{
        __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

extern void _local_bh_enable(void);
extern void local_bh_enable(void);
extern void local_bh_enable_ip(unsigned long ip);
extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);

static inline void local_bh_enable_ip(unsigned long ip)
{
        __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable(void)
{
        __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

#endif /* _LINUX_BH_H */

@ -5,6 +5,7 @@
|
||||
#include <linux/lockdep.h>
|
||||
#include <linux/ftrace_irq.h>
|
||||
#include <linux/vtime.h>
|
||||
#include <asm/hardirq.h>
|
||||
|
||||
|
||||
extern void synchronize_irq(unsigned int irq);
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <linux/user_namespace.h>
|
||||
#include <linux/securebits.h>
|
||||
#include <linux/seqlock.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <linux/sched/rt.h>
|
||||
|
||||
@ -154,6 +155,14 @@ extern struct task_group root_task_group;
|
||||
|
||||
#define INIT_TASK_COMM "swapper"
|
||||
|
||||
#ifdef CONFIG_RT_MUTEXES
|
||||
# define INIT_RT_MUTEXES(tsk) \
|
||||
.pi_waiters = RB_ROOT, \
|
||||
.pi_waiters_leftmost = NULL,
|
||||
#else
|
||||
# define INIT_RT_MUTEXES(tsk)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* INIT_TASK is used to set up the first task table, touch at
|
||||
* your own risk!. Base=0, limit=0x1fffff (=2MB)
|
||||
@ -221,6 +230,7 @@ extern struct task_group root_task_group;
|
||||
INIT_TRACE_RECURSION \
|
||||
INIT_TASK_RCU_PREEMPT(tsk) \
|
||||
INIT_CPUSET_SEQ(tsk) \
|
||||
INIT_RT_MUTEXES(tsk) \
|
||||
INIT_VTIME(tsk) \
|
||||
}
|
||||
|
||||
|
@ -64,7 +64,11 @@ do { \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
#define preempt_enable() preempt_enable_no_resched()
|
||||
#define preempt_enable() \
|
||||
do { \
|
||||
barrier(); \
|
||||
preempt_count_dec(); \
|
||||
} while (0)
|
||||
#define preempt_check_resched() do { } while (0)
|
||||
#endif
|
||||
|
||||
@ -93,7 +97,11 @@ do { \
|
||||
__preempt_schedule_context(); \
|
||||
} while (0)
|
||||
#else
|
||||
#define preempt_enable_notrace() preempt_enable_no_resched_notrace()
|
||||
#define preempt_enable_notrace() \
|
||||
do { \
|
||||
barrier(); \
|
||||
__preempt_count_dec(); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#else /* !CONFIG_PREEMPT_COUNT */
|
||||
@ -116,6 +124,31 @@ do { \
|
||||
|
||||
#endif /* CONFIG_PREEMPT_COUNT */
|
||||
|
||||
#ifdef MODULE
|
||||
/*
|
||||
* Modules have no business playing preemption tricks.
|
||||
*/
|
||||
#undef sched_preempt_enable_no_resched
|
||||
#undef preempt_enable_no_resched
|
||||
#undef preempt_enable_no_resched_notrace
|
||||
#undef preempt_check_resched
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PREEMPT
|
||||
#define preempt_set_need_resched() \
|
||||
do { \
|
||||
set_preempt_need_resched(); \
|
||||
} while (0)
|
||||
#define preempt_fold_need_resched() \
|
||||
do { \
|
||||
if (tif_need_resched()) \
|
||||
set_preempt_need_resched(); \
|
||||
} while (0)
|
||||
#else
|
||||
#define preempt_set_need_resched() do { } while (0)
|
||||
#define preempt_fold_need_resched() do { } while (0)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
||||
|
||||
struct preempt_notifier;
|
||||
|
@@ -2,7 +2,6 @@
#define LINUX_PREEMPT_MASK_H

#include <linux/preempt.h>
#include <asm/hardirq.h>

/*
 * We put the hardirq and softirq counter into the preemption

@@ -78,6 +77,21 @@
# define PREEMPT_CHECK_OFFSET 0
#endif

/*
 * The preempt_count offset needed for things like:
 *
 *  spin_lock_bh()
 *
 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
 * softirqs, such that unlock sequences of:
 *
 *  spin_unlock();
 *  local_bh_enable();
 *
 * Work as expected.
 */
#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)

/*
 * Are we running in atomic context? WARNING: this macro cannot
 * always detect atomic context; in particular, it cannot know about

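To make the comment above concrete, a hedged sketch of the preempt_count bookkeeping that SOFTIRQ_LOCK_OFFSET enables (the *_bh lock helpers that actually take the combined offset appear in the rwlock/spinlock hunks of this series):

/*
 * Sketch of the accounting, not code from the patch:
 *
 *   spin_lock_bh(lock);
 *     -> __local_bh_disable_ip(..., SOFTIRQ_LOCK_OFFSET)
 *        preempt_count += SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET
 *
 *   spin_unlock(lock);
 *        preempt_count -= PREEMPT_CHECK_OFFSET      (the preemption part)
 *   local_bh_enable();
 *        preempt_count -= SOFTIRQ_DISABLE_OFFSET    (the softirq part)
 *
 * i.e. the combined offset taken in one step at lock time can be released
 * in two independent steps, which is why the "spin_unlock(); local_bh_enable();"
 * sequence named in the comment works as expected.
 */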
@ -13,7 +13,7 @@
|
||||
#define __LINUX_RT_MUTEX_H
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/plist.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/spinlock_types.h>
|
||||
|
||||
extern int max_lock_depth; /* for sysctl */
|
||||
@ -22,12 +22,14 @@ extern int max_lock_depth; /* for sysctl */
|
||||
* The rt_mutex structure
|
||||
*
|
||||
* @wait_lock: spinlock to protect the structure
|
||||
* @wait_list: pilist head to enqueue waiters in priority order
|
||||
* @waiters: rbtree root to enqueue waiters in priority order
|
||||
* @waiters_leftmost: top waiter
|
||||
* @owner: the mutex owner
|
||||
*/
|
||||
struct rt_mutex {
|
||||
raw_spinlock_t wait_lock;
|
||||
struct plist_head wait_list;
|
||||
struct rb_root waiters;
|
||||
struct rb_node *waiters_leftmost;
|
||||
struct task_struct *owner;
|
||||
#ifdef CONFIG_DEBUG_RT_MUTEXES
|
||||
int save_state;
|
||||
@ -66,7 +68,7 @@ struct hrtimer_sleeper;
|
||||
|
||||
#define __RT_MUTEX_INITIALIZER(mutexname) \
|
||||
{ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
|
||||
, .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \
|
||||
, .waiters = RB_ROOT \
|
||||
, .owner = NULL \
|
||||
__DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
|
||||
|
||||
@ -98,12 +100,4 @@ extern int rt_mutex_trylock(struct rt_mutex *lock);
|
||||
|
||||
extern void rt_mutex_unlock(struct rt_mutex *lock);
|
||||
|
||||
#ifdef CONFIG_RT_MUTEXES
|
||||
# define INIT_RT_MUTEXES(tsk) \
|
||||
.pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \
|
||||
INIT_RT_MUTEX_DEBUG(tsk)
|
||||
#else
|
||||
# define INIT_RT_MUTEXES(tsk)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -172,8 +172,7 @@ static inline void __raw_read_lock_irq(rwlock_t *lock)
|
||||
|
||||
static inline void __raw_read_lock_bh(rwlock_t *lock)
|
||||
{
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
|
||||
rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock);
|
||||
}
|
||||
@ -200,8 +199,7 @@ static inline void __raw_write_lock_irq(rwlock_t *lock)
|
||||
|
||||
static inline void __raw_write_lock_bh(rwlock_t *lock)
|
||||
{
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
|
||||
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
|
||||
}
|
||||
@ -250,8 +248,7 @@ static inline void __raw_read_unlock_bh(rwlock_t *lock)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
do_raw_read_unlock(lock);
|
||||
preempt_enable_no_resched();
|
||||
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
|
||||
__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
|
||||
}
|
||||
|
||||
static inline void __raw_write_unlock_irqrestore(rwlock_t *lock,
|
||||
@ -275,8 +272,7 @@ static inline void __raw_write_unlock_bh(rwlock_t *lock)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
do_raw_write_unlock(lock);
|
||||
preempt_enable_no_resched();
|
||||
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
|
||||
__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
|
||||
}
|
||||
|
||||
#endif /* __LINUX_RWLOCK_API_SMP_H */
|
||||
|
@@ -16,6 +16,7 @@ struct sched_param {
#include <linux/types.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/plist.h>
#include <linux/rbtree.h>
#include <linux/thread_info.h>
#include <linux/cpumask.h>

@@ -56,6 +57,70 @@ struct sched_param {

#include <asm/processor.h>

#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */

/*
 * Extended scheduling parameters data structure.
 *
 * This is needed because the original struct sched_param can not be
 * altered without introducing ABI issues with legacy applications
 * (e.g., in sched_getparam()).
 *
 * However, the possibility of specifying more than just a priority for
 * the tasks may be useful for a wide variety of application fields, e.g.,
 * multimedia, streaming, automation and control, and many others.
 *
 * This variant (sched_attr) is meant at describing a so-called
 * sporadic time-constrained task. In such model a task is specified by:
 *  - the activation period or minimum instance inter-arrival time;
 *  - the maximum (or average, depending on the actual scheduling
 *    discipline) computation time of all instances, a.k.a. runtime;
 *  - the deadline (relative to the actual activation time) of each
 *    instance.
 * Very briefly, a periodic (sporadic) task asks for the execution of
 * some specific computation --which is typically called an instance--
 * (at most) every period. Moreover, each instance typically lasts no more
 * than the runtime and must be completed by time instant t equal to
 * the instance activation time + the deadline.
 *
 * This is reflected by the actual fields of the sched_attr structure:
 *
 *  @size		size of the structure, for fwd/bwd compat.
 *
 *  @sched_policy	task's scheduling policy
 *  @sched_flags	for customizing the scheduler behaviour
 *  @sched_nice		task's nice value (SCHED_NORMAL/BATCH)
 *  @sched_priority	task's static priority (SCHED_FIFO/RR)
 *  @sched_deadline	representative of the task's deadline
 *  @sched_runtime	representative of the task's runtime
 *  @sched_period	representative of the task's period
 *
 * Given this task model, there are a multiplicity of scheduling algorithms
 * and policies, that can be used to ensure all the tasks will make their
 * timing constraints.
 *
 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
 * only user of this new interface. More information about the algorithm
 * available in the scheduling class file or in Documentation/.
 */
struct sched_attr {
        u32 size;

        u32 sched_policy;
        u64 sched_flags;

        /* SCHED_NORMAL, SCHED_BATCH */
        s32 sched_nice;

        /* SCHED_FIFO, SCHED_RR */
        u32 sched_priority;

        /* SCHED_DEADLINE */
        u64 sched_runtime;
        u64 sched_deadline;
        u64 sched_period;
};

struct exec_domain;
struct futex_pi_state;
struct robust_list_head;

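As a concrete (purely illustrative) reading of the field descriptions above: a video decoder that needs at most 5 ms of CPU per 33 ms frame, and must finish each frame within 25 ms of its activation, could be described roughly as below. The runtime <= deadline <= period ordering follows from the task model in the comment; the numbers are invented, and sched_attr / SCHED_DEADLINE come from the headers added in this series.

/* Illustrative values only -- not from the patch. */
struct sched_attr attr = {
        .size           = sizeof(struct sched_attr),
        .sched_policy   = SCHED_DEADLINE,
        .sched_flags    = 0,
        .sched_runtime  =  5 * 1000 * 1000,     /*  5 ms worst-case execution   */
        .sched_deadline = 25 * 1000 * 1000,     /* 25 ms relative deadline      */
        .sched_period   = 33 * 1000 * 1000,     /* one instance per 33 ms frame */
};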
@@ -168,7 +233,6 @@ extern char ___assert_task_state[1 - 2*!!(

#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
#define task_is_dead(task) ((task)->exit_state != 0)
#define task_is_stopped_or_traced(task) \
        ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
#define task_contributes_to_load(task) \

@@ -1029,6 +1093,51 @@ struct sched_rt_entity {
#endif
};

struct sched_dl_entity {
        struct rb_node rb_node;

        /*
         * Original scheduling parameters. Copied here from sched_attr
         * during sched_setscheduler2(), they will remain the same until
         * the next sched_setscheduler2().
         */
        u64 dl_runtime;         /* maximum runtime for each instance    */
        u64 dl_deadline;        /* relative deadline of each instance   */
        u64 dl_period;          /* separation of two instances (period) */
        u64 dl_bw;              /* dl_runtime / dl_deadline             */

        /*
         * Actual scheduling parameters. Initialized with the values above,
         * they are continously updated during task execution. Note that
         * the remaining runtime could be < 0 in case we are in overrun.
         */
        s64 runtime;            /* remaining runtime for this instance  */
        u64 deadline;           /* absolute deadline for this instance  */
        unsigned int flags;     /* specifying the scheduler behaviour   */

        /*
         * Some bool flags:
         *
         * @dl_throttled tells if we exhausted the runtime. If so, the
         * task has to wait for a replenishment to be performed at the
         * next firing of dl_timer.
         *
         * @dl_new tells if a new instance arrived. If so we must
         * start executing it with full runtime and reset its absolute
         * deadline;
         *
         * @dl_boosted tells if we are boosted due to DI. If so we are
         * outside bandwidth enforcement mechanism (but only until we
         * exit the critical section).
         */
        int dl_throttled, dl_new, dl_boosted;

        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         * own bandwidth to be enforced, thus we need one timer per task.
         */
        struct hrtimer dl_timer;
};

struct rcu_node;

@ -1065,6 +1174,7 @@ struct task_struct {
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
struct task_group *sched_task_group;
|
||||
#endif
|
||||
struct sched_dl_entity dl;
|
||||
|
||||
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
||||
/* list of struct preempt_notifier: */
|
||||
@ -1098,6 +1208,7 @@ struct task_struct {
|
||||
struct list_head tasks;
|
||||
#ifdef CONFIG_SMP
|
||||
struct plist_node pushable_tasks;
|
||||
struct rb_node pushable_dl_tasks;
|
||||
#endif
|
||||
|
||||
struct mm_struct *mm, *active_mm;
|
||||
@ -1249,9 +1360,12 @@ struct task_struct {
|
||||
|
||||
#ifdef CONFIG_RT_MUTEXES
|
||||
/* PI waiters blocked on a rt_mutex held by this task */
|
||||
struct plist_head pi_waiters;
|
||||
struct rb_root pi_waiters;
|
||||
struct rb_node *pi_waiters_leftmost;
|
||||
/* Deadlock detection and priority inheritance handling */
|
||||
struct rt_mutex_waiter *pi_blocked_on;
|
||||
/* Top pi_waiters task */
|
||||
struct task_struct *pi_top_task;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_DEBUG_MUTEXES
|
||||
@ -1880,7 +1994,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
|
||||
* but then during bootup it turns out that sched_clock()
|
||||
* is reliable after all:
|
||||
*/
|
||||
extern int sched_clock_stable;
|
||||
extern int sched_clock_stable(void);
|
||||
extern void set_sched_clock_stable(void);
|
||||
extern void clear_sched_clock_stable(void);
|
||||
|
||||
extern void sched_clock_tick(void);
|
||||
extern void sched_clock_idle_sleep_event(void);
|
||||
@ -1959,6 +2075,8 @@ extern int sched_setscheduler(struct task_struct *, int,
|
||||
const struct sched_param *);
|
||||
extern int sched_setscheduler_nocheck(struct task_struct *, int,
|
||||
const struct sched_param *);
|
||||
extern int sched_setattr(struct task_struct *,
|
||||
const struct sched_attr *);
|
||||
extern struct task_struct *idle_task(int cpu);
|
||||
/**
|
||||
* is_idle_task - is the specified task an idle task?
|
||||
@ -2038,7 +2156,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
|
||||
#else
|
||||
static inline void kick_process(struct task_struct *tsk) { }
|
||||
#endif
|
||||
extern void sched_fork(unsigned long clone_flags, struct task_struct *p);
|
||||
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
|
||||
extern void sched_dead(struct task_struct *p);
|
||||
|
||||
extern void proc_caches_init(void);
|
||||
@ -2627,6 +2745,21 @@ static inline bool __must_check current_clr_polling_and_test(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void current_clr_polling(void)
|
||||
{
|
||||
__current_clr_polling();
|
||||
|
||||
/*
|
||||
* Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
|
||||
* Once the bit is cleared, we'll get IPIs with every new
|
||||
* TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
|
||||
* fold.
|
||||
*/
|
||||
smp_mb(); /* paired with resched_task() */
|
||||
|
||||
preempt_fold_need_resched();
|
||||
}
|
||||
|
||||
static __always_inline bool need_resched(void)
|
||||
{
|
||||
return unlikely(tif_need_resched());
|
||||
|
include/linux/sched/deadline.h (new file, 24 lines)
@@ -0,0 +1,24 @@
#ifndef _SCHED_DEADLINE_H
#define _SCHED_DEADLINE_H

/*
 * SCHED_DEADLINE tasks has negative priorities, reflecting
 * the fact that any of them has higher prio than RT and
 * NORMAL/BATCH tasks.
 */

#define MAX_DL_PRIO 0

static inline int dl_prio(int prio)
{
        if (unlikely(prio < MAX_DL_PRIO))
                return 1;
        return 0;
}

static inline int dl_task(struct task_struct *p)
{
        return dl_prio(p->prio);
}

#endif /* _SCHED_DEADLINE_H */

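A small hedged illustration of the ordering the comment above implies: deadline tasks sit below MAX_DL_PRIO (0), real-time priorities occupy the conventional 0..99 range, and everything above that is nice-based, so dl_prio() simply tests for a negative prio. The concrete values (a -1 prio for deadline tasks, 100 as the RT boundary) are assumptions for illustration and are not shown in this hunk.

/* Sketch of how the priority ranges compose; boundary values assumed. */
static inline const char *prio_class(int prio)
{
        if (prio < 0)           /* dl_prio(): below MAX_DL_PRIO == 0 */
                return "deadline";
        if (prio < 100)         /* conventional RT range 0..99       */
                return "realtime";
        return "normal";        /* nice-based priorities             */
}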
@ -35,6 +35,7 @@ static inline int rt_task(struct task_struct *p)
|
||||
#ifdef CONFIG_RT_MUTEXES
|
||||
extern int rt_mutex_getprio(struct task_struct *p);
|
||||
extern void rt_mutex_setprio(struct task_struct *p, int prio);
|
||||
extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
|
||||
extern void rt_mutex_adjust_pi(struct task_struct *p);
|
||||
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
|
||||
{
|
||||
@ -45,6 +46,10 @@ static inline int rt_mutex_getprio(struct task_struct *p)
|
||||
{
|
||||
return p->normal_prio;
|
||||
}
|
||||
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
# define rt_mutex_adjust_pi(p) do { } while (0)
|
||||
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
|
||||
{
|
||||
|
@ -48,7 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
|
||||
extern unsigned int sysctl_numa_balancing_scan_period_min;
|
||||
extern unsigned int sysctl_numa_balancing_scan_period_max;
|
||||
extern unsigned int sysctl_numa_balancing_scan_size;
|
||||
extern unsigned int sysctl_numa_balancing_settle_count;
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
extern unsigned int sysctl_sched_migration_cost;
|
||||
|
@ -131,8 +131,7 @@ static inline void __raw_spin_lock_irq(raw_spinlock_t *lock)
|
||||
|
||||
static inline void __raw_spin_lock_bh(raw_spinlock_t *lock)
|
||||
{
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
|
||||
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
|
||||
}
|
||||
@ -174,20 +173,17 @@ static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock)
|
||||
{
|
||||
spin_release(&lock->dep_map, 1, _RET_IP_);
|
||||
do_raw_spin_unlock(lock);
|
||||
preempt_enable_no_resched();
|
||||
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
|
||||
__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
|
||||
}
|
||||
|
||||
static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
|
||||
{
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
|
||||
if (do_raw_spin_trylock(lock)) {
|
||||
spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
|
||||
return 1;
|
||||
}
|
||||
preempt_enable_no_resched();
|
||||
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
|
||||
__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -24,11 +24,14 @@
|
||||
* flags straight, to suppress compiler warnings of unused lock
|
||||
* variables, and to add the proper checker annotations:
|
||||
*/
|
||||
#define ___LOCK(lock) \
|
||||
do { __acquire(lock); (void)(lock); } while (0)
|
||||
|
||||
#define __LOCK(lock) \
|
||||
do { preempt_disable(); __acquire(lock); (void)(lock); } while (0)
|
||||
do { preempt_disable(); ___LOCK(lock); } while (0)
|
||||
|
||||
#define __LOCK_BH(lock) \
|
||||
do { local_bh_disable(); __LOCK(lock); } while (0)
|
||||
do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0)
|
||||
|
||||
#define __LOCK_IRQ(lock) \
|
||||
do { local_irq_disable(); __LOCK(lock); } while (0)
|
||||
@ -36,12 +39,15 @@
|
||||
#define __LOCK_IRQSAVE(lock, flags) \
|
||||
do { local_irq_save(flags); __LOCK(lock); } while (0)
|
||||
|
||||
#define ___UNLOCK(lock) \
|
||||
do { __release(lock); (void)(lock); } while (0)
|
||||
|
||||
#define __UNLOCK(lock) \
|
||||
do { preempt_enable(); __release(lock); (void)(lock); } while (0)
|
||||
do { preempt_enable(); ___UNLOCK(lock); } while (0)
|
||||
|
||||
#define __UNLOCK_BH(lock) \
|
||||
do { preempt_enable_no_resched(); local_bh_enable(); \
|
||||
__release(lock); (void)(lock); } while (0)
|
||||
do { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); \
|
||||
___UNLOCK(lock); } while (0)
|
||||
|
||||
#define __UNLOCK_IRQ(lock) \
|
||||
do { local_irq_enable(); __UNLOCK(lock); } while (0)
|
||||
|
@ -38,6 +38,7 @@ struct rlimit;
|
||||
struct rlimit64;
|
||||
struct rusage;
|
||||
struct sched_param;
|
||||
struct sched_attr;
|
||||
struct sel_arg_struct;
|
||||
struct semaphore;
|
||||
struct sembuf;
|
||||
@ -279,9 +280,14 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
|
||||
struct sched_param __user *param);
|
||||
asmlinkage long sys_sched_setparam(pid_t pid,
|
||||
struct sched_param __user *param);
|
||||
asmlinkage long sys_sched_setattr(pid_t pid,
|
||||
struct sched_attr __user *attr);
|
||||
asmlinkage long sys_sched_getscheduler(pid_t pid);
|
||||
asmlinkage long sys_sched_getparam(pid_t pid,
|
||||
struct sched_param __user *param);
|
||||
asmlinkage long sys_sched_getattr(pid_t pid,
|
||||
struct sched_attr __user *attr,
|
||||
unsigned int size);
|
||||
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
|
||||
unsigned long __user *user_mask_ptr);
|
||||
asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
|
||||
|
@ -25,13 +25,16 @@ static inline void pagefault_disable(void)
|
||||
|
||||
static inline void pagefault_enable(void)
|
||||
{
|
||||
#ifndef CONFIG_PREEMPT
|
||||
/*
|
||||
* make sure to issue those last loads/stores before enabling
|
||||
* the pagefault handler again.
|
||||
*/
|
||||
barrier();
|
||||
preempt_count_dec();
|
||||
preempt_check_resched();
|
||||
#else
|
||||
preempt_enable();
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef ARCH_HAS_NOCACHE_UACCESS
|
||||
|
@@ -42,27 +42,10 @@ static inline bool net_busy_loop_on(void)
        return sysctl_net_busy_poll;
}

/* a wrapper to make debug_smp_processor_id() happy
 * we can use sched_clock() because we don't care much about precision
 * we only care that the average is bounded
 */
#ifdef CONFIG_DEBUG_PREEMPT
static inline u64 busy_loop_us_clock(void)
{
        u64 rc;

        preempt_disable_notrace();
        rc = sched_clock();
        preempt_enable_no_resched_notrace();

        return rc >> 10;
        return local_clock() >> 10;
}
#else /* CONFIG_DEBUG_PREEMPT */
static inline u64 busy_loop_us_clock(void)
{
        return sched_clock() >> 10;
}
#endif /* CONFIG_DEBUG_PREEMPT */

static inline unsigned long sk_busy_loop_end_time(struct sock *sk)
{

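The ">> 10" in the hunk above approximates a nanoseconds-to-microseconds conversion by dividing by 1024 instead of 1000 (about 2.3% low), which is fine here because, as the comment says, only the boundedness of the average matters, not precision. A tiny illustrative check with an invented sample value:

/* Illustration only: compare ns/1000 with ns>>10. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t ns = 5000000;  /* 5 ms in nanoseconds */

        printf("exact us = %llu, shifted us = %llu\n",
               (unsigned long long)(ns / 1000),        /* 5000 */
               (unsigned long long)(ns >> 10));        /* 4882 */
        return 0;
}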
@@ -39,8 +39,14 @@
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6

/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000

/*
 * For the sched_{set,get}attr() calls
 */
#define SCHED_FLAG_RESET_ON_FORK 0x01

#endif /* _UAPI_LINUX_SCHED_H */

@ -105,14 +105,17 @@ static void cpu_idle_loop(void)
__current_set_polling();
}
arch_cpu_idle_exit();
/*
* We need to test and propagate the TIF_NEED_RESCHED
* bit here because we might not have send the
* reschedule IPI to idle tasks.
*/
if (tif_need_resched())
set_preempt_need_resched();
}

/*
* Since we fell out of the loop above, we know
* TIF_NEED_RESCHED must be set, propagate it into
* PREEMPT_NEED_RESCHED.
*
* This is required because for polling idle loops we will
* not have had an IPI to fold the state for us.
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
schedule_preempt_disabled();
}
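
What gets "folded" here is the TIF_NEED_RESCHED thread flag being mirrored into the PREEMPT_NEED_RESCHED bit of the preempt counter, so the preemption check becomes a single compare against zero. The sketch below is a hypothetical user-space model of that bit trick (on x86 the bit is kept inverted in a per-cpu counter; names and details here are illustrative, not the kernel implementation):

#include <stdio.h>
#include <stdint.h>

#define PREEMPT_NEED_RESCHED 0x80000000u

/* Bit stored inverted: set means "no reschedule needed", so a raw value of
 * zero means "count is zero AND a reschedule is pending". */
static uint32_t preempt_count_raw = PREEMPT_NEED_RESCHED;

static void set_preempt_need_resched(void)
{
	preempt_count_raw &= ~PREEMPT_NEED_RESCHED;   /* fold: request a resched */
}

static int may_preempt_now(void)
{
	return preempt_count_raw == 0;                /* one compare covers both conditions */
}

int main(void)
{
	preempt_count_raw += 1;                       /* preempt_disable() */
	set_preempt_need_resched();                   /* TIF_NEED_RESCHED folded in */
	printf("while disabled: %d\n", may_preempt_now());   /* 0 */

	preempt_count_raw -= 1;                       /* preempt_enable() */
	printf("after enable:   %d\n", may_preempt_now());   /* 1 */
	return 0;
}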
@ -1087,8 +1087,10 @@ static void rt_mutex_init_task(struct task_struct *p)
|
||||
{
|
||||
raw_spin_lock_init(&p->pi_lock);
|
||||
#ifdef CONFIG_RT_MUTEXES
|
||||
plist_head_init(&p->pi_waiters);
|
||||
p->pi_waiters = RB_ROOT;
|
||||
p->pi_waiters_leftmost = NULL;
|
||||
p->pi_blocked_on = NULL;
|
||||
p->pi_top_task = NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1311,7 +1313,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
||||
#endif
|
||||
|
||||
/* Perform scheduler related setup. Assign this task to a CPU. */
|
||||
sched_fork(clone_flags, p);
|
||||
retval = sched_fork(clone_flags, p);
|
||||
if (retval)
|
||||
goto bad_fork_cleanup_policy;
|
||||
|
||||
retval = perf_event_init_task(p);
|
||||
if (retval)
|
||||
@ -1403,13 +1407,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
||||
p->tgid = p->pid;
|
||||
}
|
||||
|
||||
p->pdeath_signal = 0;
|
||||
p->exit_state = 0;
|
||||
|
||||
p->nr_dirtied = 0;
|
||||
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
|
||||
p->dirty_paused_when = 0;
|
||||
|
||||
p->pdeath_signal = 0;
|
||||
INIT_LIST_HEAD(&p->thread_group);
|
||||
p->task_works = NULL;
|
||||
|
||||
|
@ -2426,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
|
||||
* code while we sleep on uaddr.
|
||||
*/
|
||||
debug_rt_mutex_init_waiter(&rt_waiter);
|
||||
RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
|
||||
RB_CLEAR_NODE(&rt_waiter.tree_entry);
|
||||
rt_waiter.task = NULL;
|
||||
|
||||
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
|
||||
|
@ -46,6 +46,7 @@
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/sysctl.h>
|
||||
#include <linux/sched/rt.h>
|
||||
#include <linux/sched/deadline.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/freezer.h>
|
||||
|
||||
@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
unsigned long slack;

slack = current->timer_slack_ns;
if (rt_task(current))
if (dl_task(current) || rt_task(current))
slack = 0;

hrtimer_init_on_stack(&t.timer, clockid, mode);
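
dl_task() comes from the new <linux/sched/deadline.h> pulled in above; deadline tasks occupy their own priority band below the RT range, so both predicates are cheap range checks on p->prio. A small illustrative model of that layout and of the slack decision (boundaries and names here are for illustration only, not the kernel constants):

#include <stdio.h>

#define MAX_DL_PRIO 0      /* illustrative: deadline tasks sit below this */
#define MAX_RT_PRIO 100    /* illustrative: RT priorities occupy [0, 100) */

static int dl_prio(int prio) { return prio < MAX_DL_PRIO; }
static int rt_prio(int prio) { return prio < MAX_RT_PRIO; }

int main(void)
{
	int prios[] = { -1 /* SCHED_DEADLINE */, 10 /* SCHED_FIFO */, 120 /* SCHED_NORMAL */ };

	for (int i = 0; i < 3; i++) {
		int p = prios[i];

		printf("prio %4d -> timer slack %s\n", p,
		       (dl_prio(p) || rt_prio(p)) ? "forced to 0" : "left as configured");
	}
	return 0;
}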
@ -24,7 +24,7 @@
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/plist.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/debug_locks.h>
|
||||
|
||||
@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
|
||||
|
||||
void rt_mutex_debug_task_free(struct task_struct *task)
|
||||
{
|
||||
DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
|
||||
DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
|
||||
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
|
||||
}
|
||||
|
||||
@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
|
||||
void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
|
||||
{
|
||||
memset(waiter, 0x11, sizeof(*waiter));
|
||||
plist_node_init(&waiter->list_entry, MAX_PRIO);
|
||||
plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
|
||||
waiter->deadlock_task_pid = NULL;
|
||||
}
|
||||
|
||||
void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
|
||||
{
|
||||
put_pid(waiter->deadlock_task_pid);
|
||||
DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
|
||||
DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
|
||||
memset(waiter, 0x22, sizeof(*waiter));
|
||||
}
|
||||
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <linux/export.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/rt.h>
|
||||
#include <linux/sched/deadline.h>
|
||||
#include <linux/timer.h>
|
||||
|
||||
#include "rtmutex_common.h"
|
||||
@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int
|
||||
rt_mutex_waiter_less(struct rt_mutex_waiter *left,
|
||||
struct rt_mutex_waiter *right)
|
||||
{
|
||||
if (left->prio < right->prio)
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* Calculate task priority from the waiter list priority
|
||||
* If both waiters have dl_prio(), we check the deadlines of the
|
||||
* associated tasks.
|
||||
* If left waiter has a dl_prio(), and we didn't return 1 above,
|
||||
* then right waiter has a dl_prio() too.
|
||||
*/
|
||||
if (dl_prio(left->prio))
|
||||
return (left->task->dl.deadline < right->task->dl.deadline);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
|
||||
{
|
||||
struct rb_node **link = &lock->waiters.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct rt_mutex_waiter *entry;
|
||||
int leftmost = 1;
|
||||
|
||||
while (*link) {
|
||||
parent = *link;
|
||||
entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
|
||||
if (rt_mutex_waiter_less(waiter, entry)) {
|
||||
link = &parent->rb_left;
|
||||
} else {
|
||||
link = &parent->rb_right;
|
||||
leftmost = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (leftmost)
|
||||
lock->waiters_leftmost = &waiter->tree_entry;
|
||||
|
||||
rb_link_node(&waiter->tree_entry, parent, link);
|
||||
rb_insert_color(&waiter->tree_entry, &lock->waiters);
|
||||
}
|
||||
|
||||
static void
|
||||
rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
|
||||
{
|
||||
if (RB_EMPTY_NODE(&waiter->tree_entry))
|
||||
return;
|
||||
|
||||
if (lock->waiters_leftmost == &waiter->tree_entry)
|
||||
lock->waiters_leftmost = rb_next(&waiter->tree_entry);
|
||||
|
||||
rb_erase(&waiter->tree_entry, &lock->waiters);
|
||||
RB_CLEAR_NODE(&waiter->tree_entry);
|
||||
}
|
||||
|
||||
static void
|
||||
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
|
||||
{
|
||||
struct rb_node **link = &task->pi_waiters.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct rt_mutex_waiter *entry;
|
||||
int leftmost = 1;
|
||||
|
||||
while (*link) {
|
||||
parent = *link;
|
||||
entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
|
||||
if (rt_mutex_waiter_less(waiter, entry)) {
|
||||
link = &parent->rb_left;
|
||||
} else {
|
||||
link = &parent->rb_right;
|
||||
leftmost = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (leftmost)
|
||||
task->pi_waiters_leftmost = &waiter->pi_tree_entry;
|
||||
|
||||
rb_link_node(&waiter->pi_tree_entry, parent, link);
|
||||
rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
|
||||
}
|
||||
|
||||
static void
|
||||
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
|
||||
{
|
||||
if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
|
||||
return;
|
||||
|
||||
if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
|
||||
task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
|
||||
|
||||
rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
|
||||
RB_CLEAR_NODE(&waiter->pi_tree_entry);
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate task priority from the waiter tree priority
|
||||
*
|
||||
* Return task->normal_prio when the waiter list is empty or when
|
||||
* Return task->normal_prio when the waiter tree is empty or when
|
||||
* the waiter is not allowed to do priority boosting
|
||||
*/
|
||||
int rt_mutex_getprio(struct task_struct *task)
|
||||
@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task)
|
||||
if (likely(!task_has_pi_waiters(task)))
|
||||
return task->normal_prio;
|
||||
|
||||
return min(task_top_pi_waiter(task)->pi_list_entry.prio,
|
||||
return min(task_top_pi_waiter(task)->prio,
|
||||
task->normal_prio);
|
||||
}
|
||||
|
||||
struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
|
||||
{
|
||||
if (likely(!task_has_pi_waiters(task)))
|
||||
return NULL;
|
||||
|
||||
return task_top_pi_waiter(task)->task;
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust the priority of a task, after its pi_waiters got modified.
|
||||
*
|
||||
@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
|
||||
{
|
||||
int prio = rt_mutex_getprio(task);
|
||||
|
||||
if (task->prio != prio)
|
||||
if (task->prio != prio || dl_prio(prio))
|
||||
rt_mutex_setprio(task, prio);
|
||||
}
|
||||
|
||||
@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
|
||||
* When deadlock detection is off then we check, if further
|
||||
* priority adjustment is necessary.
|
||||
*/
|
||||
if (!detect_deadlock && waiter->list_entry.prio == task->prio)
|
||||
if (!detect_deadlock && waiter->prio == task->prio)
|
||||
goto out_unlock_pi;
|
||||
|
||||
lock = waiter->lock;
|
||||
@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
|
||||
top_waiter = rt_mutex_top_waiter(lock);
|
||||
|
||||
/* Requeue the waiter */
|
||||
plist_del(&waiter->list_entry, &lock->wait_list);
|
||||
waiter->list_entry.prio = task->prio;
|
||||
plist_add(&waiter->list_entry, &lock->wait_list);
|
||||
rt_mutex_dequeue(lock, waiter);
|
||||
waiter->prio = task->prio;
|
||||
rt_mutex_enqueue(lock, waiter);
|
||||
|
||||
/* Release the task */
|
||||
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
|
||||
@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
|
||||
|
||||
if (waiter == rt_mutex_top_waiter(lock)) {
|
||||
/* Boost the owner */
|
||||
plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
|
||||
waiter->pi_list_entry.prio = waiter->list_entry.prio;
|
||||
plist_add(&waiter->pi_list_entry, &task->pi_waiters);
|
||||
rt_mutex_dequeue_pi(task, top_waiter);
|
||||
rt_mutex_enqueue_pi(task, waiter);
|
||||
__rt_mutex_adjust_prio(task);
|
||||
|
||||
} else if (top_waiter == waiter) {
|
||||
/* Deboost the owner */
|
||||
plist_del(&waiter->pi_list_entry, &task->pi_waiters);
|
||||
rt_mutex_dequeue_pi(task, waiter);
|
||||
waiter = rt_mutex_top_waiter(lock);
|
||||
waiter->pi_list_entry.prio = waiter->list_entry.prio;
|
||||
plist_add(&waiter->pi_list_entry, &task->pi_waiters);
|
||||
rt_mutex_enqueue_pi(task, waiter);
|
||||
__rt_mutex_adjust_prio(task);
|
||||
}
|
||||
|
||||
@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
|
||||
* 3) it is top waiter
|
||||
*/
|
||||
if (rt_mutex_has_waiters(lock)) {
|
||||
if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
|
||||
if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
|
||||
if (!waiter || waiter != rt_mutex_top_waiter(lock))
|
||||
return 0;
|
||||
}
|
||||
@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
|
||||
|
||||
/* remove the queued waiter. */
|
||||
if (waiter) {
|
||||
plist_del(&waiter->list_entry, &lock->wait_list);
|
||||
rt_mutex_dequeue(lock, waiter);
|
||||
task->pi_blocked_on = NULL;
|
||||
}
|
||||
|
||||
@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
|
||||
*/
|
||||
if (rt_mutex_has_waiters(lock)) {
|
||||
top = rt_mutex_top_waiter(lock);
|
||||
top->pi_list_entry.prio = top->list_entry.prio;
|
||||
plist_add(&top->pi_list_entry, &task->pi_waiters);
|
||||
rt_mutex_enqueue_pi(task, top);
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
|
||||
}
|
||||
@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
|
||||
__rt_mutex_adjust_prio(task);
|
||||
waiter->task = task;
|
||||
waiter->lock = lock;
|
||||
plist_node_init(&waiter->list_entry, task->prio);
|
||||
plist_node_init(&waiter->pi_list_entry, task->prio);
|
||||
waiter->prio = task->prio;
|
||||
|
||||
/* Get the top priority waiter on the lock */
|
||||
if (rt_mutex_has_waiters(lock))
|
||||
top_waiter = rt_mutex_top_waiter(lock);
|
||||
plist_add(&waiter->list_entry, &lock->wait_list);
|
||||
rt_mutex_enqueue(lock, waiter);
|
||||
|
||||
task->pi_blocked_on = waiter;
|
||||
|
||||
@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
|
||||
|
||||
if (waiter == rt_mutex_top_waiter(lock)) {
|
||||
raw_spin_lock_irqsave(&owner->pi_lock, flags);
|
||||
plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
|
||||
plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
|
||||
rt_mutex_dequeue_pi(owner, top_waiter);
|
||||
rt_mutex_enqueue_pi(owner, waiter);
|
||||
|
||||
__rt_mutex_adjust_prio(owner);
|
||||
if (owner->pi_blocked_on)
|
||||
@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
|
||||
* boosted mode and go back to normal after releasing
|
||||
* lock->wait_lock.
|
||||
*/
|
||||
plist_del(&waiter->pi_list_entry, ¤t->pi_waiters);
|
||||
rt_mutex_dequeue_pi(current, waiter);
|
||||
|
||||
rt_mutex_set_owner(lock, NULL);
|
||||
|
||||
@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock,
|
||||
int chain_walk = 0;
|
||||
|
||||
raw_spin_lock_irqsave(¤t->pi_lock, flags);
|
||||
plist_del(&waiter->list_entry, &lock->wait_list);
|
||||
rt_mutex_dequeue(lock, waiter);
|
||||
current->pi_blocked_on = NULL;
|
||||
raw_spin_unlock_irqrestore(¤t->pi_lock, flags);
|
||||
|
||||
@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock,
|
||||
|
||||
raw_spin_lock_irqsave(&owner->pi_lock, flags);
|
||||
|
||||
plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
|
||||
rt_mutex_dequeue_pi(owner, waiter);
|
||||
|
||||
if (rt_mutex_has_waiters(lock)) {
|
||||
struct rt_mutex_waiter *next;
|
||||
|
||||
next = rt_mutex_top_waiter(lock);
|
||||
plist_add(&next->pi_list_entry, &owner->pi_waiters);
|
||||
rt_mutex_enqueue_pi(owner, next);
|
||||
}
|
||||
__rt_mutex_adjust_prio(owner);
|
||||
|
||||
@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock,
|
||||
raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
|
||||
}
|
||||
|
||||
WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
|
||||
|
||||
if (!chain_walk)
|
||||
return;
|
||||
|
||||
@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
|
||||
raw_spin_lock_irqsave(&task->pi_lock, flags);
|
||||
|
||||
waiter = task->pi_blocked_on;
|
||||
if (!waiter || waiter->list_entry.prio == task->prio) {
|
||||
if (!waiter || (waiter->prio == task->prio &&
|
||||
!dl_prio(task->prio))) {
|
||||
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
|
||||
return;
|
||||
}
|
||||
@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
|
||||
int ret = 0;
|
||||
|
||||
debug_rt_mutex_init_waiter(&waiter);
|
||||
RB_CLEAR_NODE(&waiter.pi_tree_entry);
|
||||
RB_CLEAR_NODE(&waiter.tree_entry);
|
||||
|
||||
raw_spin_lock(&lock->wait_lock);
|
||||
|
||||
@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
|
||||
{
|
||||
lock->owner = NULL;
|
||||
raw_spin_lock_init(&lock->wait_lock);
|
||||
plist_head_init(&lock->wait_list);
|
||||
lock->waiters = RB_ROOT;
|
||||
lock->waiters_leftmost = NULL;
|
||||
|
||||
debug_rt_mutex_init(lock, name);
|
||||
}
|
||||
|
@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock);
|
||||
* This is the control structure for tasks blocked on a rt_mutex,
|
||||
* which is allocated on the kernel stack on of the blocked task.
|
||||
*
|
||||
* @list_entry: pi node to enqueue into the mutex waiters list
|
||||
* @pi_list_entry: pi node to enqueue into the mutex owner waiters list
|
||||
* @tree_entry: pi node to enqueue into the mutex waiters tree
|
||||
* @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
|
||||
* @task: task reference to the blocked task
|
||||
*/
|
||||
struct rt_mutex_waiter {
|
||||
struct plist_node list_entry;
|
||||
struct plist_node pi_list_entry;
|
||||
struct rb_node tree_entry;
|
||||
struct rb_node pi_tree_entry;
|
||||
struct task_struct *task;
|
||||
struct rt_mutex *lock;
|
||||
#ifdef CONFIG_DEBUG_RT_MUTEXES
|
||||
@ -54,14 +54,15 @@ struct rt_mutex_waiter {
|
||||
struct pid *deadlock_task_pid;
|
||||
struct rt_mutex *deadlock_lock;
|
||||
#endif
|
||||
int prio;
|
||||
};
|
||||
|
||||
/*
|
||||
* Various helpers to access the waiters-plist:
|
||||
* Various helpers to access the waiters-tree:
|
||||
*/
|
||||
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
|
||||
{
|
||||
return !plist_head_empty(&lock->wait_list);
|
||||
return !RB_EMPTY_ROOT(&lock->waiters);
|
||||
}
|
||||
|
||||
static inline struct rt_mutex_waiter *
|
||||
@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
|
||||
{
|
||||
struct rt_mutex_waiter *w;
|
||||
|
||||
w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
|
||||
list_entry);
|
||||
w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
|
||||
tree_entry);
|
||||
BUG_ON(w->lock != lock);
|
||||
|
||||
return w;
|
||||
@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
|
||||
|
||||
static inline int task_has_pi_waiters(struct task_struct *p)
|
||||
{
|
||||
return !plist_head_empty(&p->pi_waiters);
|
||||
return !RB_EMPTY_ROOT(&p->pi_waiters);
|
||||
}
|
||||
|
||||
static inline struct rt_mutex_waiter *
|
||||
task_top_pi_waiter(struct task_struct *p)
|
||||
{
|
||||
return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
|
||||
pi_list_entry);
|
||||
return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
|
||||
pi_tree_entry);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
|
||||
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
|
||||
endif
|
||||
|
||||
obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
|
||||
obj-y += core.o proc.o clock.o cputime.o
|
||||
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
|
||||
obj-y += wait.o completion.o
|
||||
obj-$(CONFIG_SMP) += cpupri.o
|
||||
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
|
||||
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
|
||||
obj-$(CONFIG_SCHEDSTATS) += stats.o
|
||||
obj-$(CONFIG_SCHED_DEBUG) += debug.o
|
||||
|
@ -26,9 +26,10 @@
|
||||
* at 0 on boot (but people really shouldn't rely on that).
|
||||
*
|
||||
* cpu_clock(i) -- can be used from any context, including NMI.
|
||||
* sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
|
||||
* local_clock() -- is cpu_clock() on the current cpu.
|
||||
*
|
||||
* sched_clock_cpu(i)
|
||||
*
|
||||
* How:
|
||||
*
|
||||
* The implementation either uses sched_clock() when
|
||||
@ -50,15 +51,6 @@
|
||||
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
|
||||
* that is otherwise invisible (TSC gets stopped).
|
||||
*
|
||||
*
|
||||
* Notes:
|
||||
*
|
||||
* The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
|
||||
* like cpufreq interrupts that can change the base clock (TSC) multiplier
|
||||
* and cause funny jumps in time -- although the filtering provided by
|
||||
* sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
|
||||
* in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
|
||||
* sched_clock().
|
||||
*/
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/hardirq.h>
|
||||
@ -66,6 +58,8 @@
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/static_key.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
/*
|
||||
* Scheduler clock - returns current time in nanosec units.
|
||||
@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock);
|
||||
__read_mostly int sched_clock_running;
|
||||
|
||||
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
||||
__read_mostly int sched_clock_stable;
|
||||
static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
|
||||
|
||||
int sched_clock_stable(void)
|
||||
{
|
||||
if (static_key_false(&__sched_clock_stable))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
void set_sched_clock_stable(void)
|
||||
{
|
||||
if (!sched_clock_stable())
|
||||
static_key_slow_dec(&__sched_clock_stable);
|
||||
}
|
||||
|
||||
static void __clear_sched_clock_stable(struct work_struct *work)
|
||||
{
|
||||
/* XXX worry about clock continuity */
|
||||
if (sched_clock_stable())
|
||||
static_key_slow_inc(&__sched_clock_stable);
|
||||
}
|
||||
|
||||
static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
|
||||
|
||||
void clear_sched_clock_stable(void)
|
||||
{
|
||||
if (keventd_up())
|
||||
schedule_work(&sched_clock_work);
|
||||
else
|
||||
__clear_sched_clock_stable(&sched_clock_work);
|
||||
}
|
||||
|
||||
struct sched_clock_data {
|
||||
u64 tick_raw;
|
||||
@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu)
|
||||
struct sched_clock_data *scd;
|
||||
u64 clock;
|
||||
|
||||
WARN_ON_ONCE(!irqs_disabled());
|
||||
|
||||
if (sched_clock_stable)
|
||||
if (sched_clock_stable())
|
||||
return sched_clock();
|
||||
|
||||
if (unlikely(!sched_clock_running))
|
||||
return 0ull;
|
||||
|
||||
preempt_disable();
|
||||
scd = cpu_sdc(cpu);
|
||||
|
||||
if (cpu != smp_processor_id())
|
||||
clock = sched_clock_remote(scd);
|
||||
else
|
||||
clock = sched_clock_local(scd);
|
||||
preempt_enable();
|
||||
|
||||
return clock;
|
||||
}
|
||||
@ -265,7 +289,7 @@ void sched_clock_tick(void)
|
||||
struct sched_clock_data *scd;
|
||||
u64 now, now_gtod;
|
||||
|
||||
if (sched_clock_stable)
|
||||
if (sched_clock_stable())
|
||||
return;
|
||||
|
||||
if (unlikely(!sched_clock_running))
|
||||
@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
|
||||
*/
|
||||
u64 cpu_clock(int cpu)
|
||||
{
|
||||
u64 clock;
|
||||
unsigned long flags;
|
||||
if (static_key_false(&__sched_clock_stable))
|
||||
return sched_clock_cpu(cpu);
|
||||
|
||||
local_irq_save(flags);
|
||||
clock = sched_clock_cpu(cpu);
|
||||
local_irq_restore(flags);
|
||||
|
||||
return clock;
|
||||
return sched_clock();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -335,14 +355,10 @@ u64 cpu_clock(int cpu)
|
||||
*/
|
||||
u64 local_clock(void)
|
||||
{
|
||||
u64 clock;
|
||||
unsigned long flags;
|
||||
if (static_key_false(&__sched_clock_stable))
|
||||
return sched_clock_cpu(raw_smp_processor_id());
|
||||
|
||||
local_irq_save(flags);
|
||||
clock = sched_clock_cpu(smp_processor_id());
|
||||
local_irq_restore(flags);
|
||||
|
||||
return clock;
|
||||
return sched_clock();
|
||||
}
|
||||
|
||||
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
|
||||
@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu)
|
||||
|
||||
u64 cpu_clock(int cpu)
|
||||
{
|
||||
return sched_clock_cpu(cpu);
|
||||
return sched_clock();
|
||||
}
|
||||
|
||||
u64 local_clock(void)
|
||||
{
|
||||
return sched_clock_cpu(0);
|
||||
return sched_clock();
|
||||
}
|
||||
|
||||
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
File diff suppressed because it is too large

kernel/sched/cpudeadline.c (new file, 216 lines)
@ -0,0 +1,216 @@
|
||||
/*
|
||||
* kernel/sched/cpudl.c
|
||||
*
|
||||
* Global CPU deadline management
|
||||
*
|
||||
* Author: Juri Lelli <j.lelli@sssup.it>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; version 2
|
||||
* of the License.
|
||||
*/
|
||||
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/kernel.h>
|
||||
#include "cpudeadline.h"
|
||||
|
||||
static inline int parent(int i)
|
||||
{
|
||||
return (i - 1) >> 1;
|
||||
}
|
||||
|
||||
static inline int left_child(int i)
|
||||
{
|
||||
return (i << 1) + 1;
|
||||
}
|
||||
|
||||
static inline int right_child(int i)
|
||||
{
|
||||
return (i << 1) + 2;
|
||||
}
|
||||
|
||||
static inline int dl_time_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}
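
Comparing deadlines through a signed difference keeps the ordering correct even across a wrap of the u64 clock, the same trick as the jiffies time_before() helpers. A quick stand-alone check of that property:

#include <stdio.h>
#include <stdint.h>

static int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;   /* deadline just before the clock wraps */
	uint64_t wrapped   = 10;               /* deadline just after the wrap */

	printf("%d\n", dl_time_before(100, 200));             /* 1: plain ordering */
	printf("%d\n", dl_time_before(near_wrap, wrapped));   /* 1: still ordered across the wrap */
	return 0;
}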
static void cpudl_exchange(struct cpudl *cp, int a, int b)
|
||||
{
|
||||
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
|
||||
|
||||
swap(cp->elements[a], cp->elements[b]);
|
||||
swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
|
||||
}
|
||||
|
||||
static void cpudl_heapify(struct cpudl *cp, int idx)
|
||||
{
|
||||
int l, r, largest;
|
||||
|
||||
/* adapted from lib/prio_heap.c */
|
||||
while(1) {
|
||||
l = left_child(idx);
|
||||
r = right_child(idx);
|
||||
largest = idx;
|
||||
|
||||
if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
|
||||
cp->elements[l].dl))
|
||||
largest = l;
|
||||
if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
|
||||
cp->elements[r].dl))
|
||||
largest = r;
|
||||
if (largest == idx)
|
||||
break;
|
||||
|
||||
/* Push idx down the heap one level and bump one up */
|
||||
cpudl_exchange(cp, largest, idx);
|
||||
idx = largest;
|
||||
}
|
||||
}
|
||||
|
||||
static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
|
||||
{
|
||||
WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
|
||||
|
||||
if (dl_time_before(new_dl, cp->elements[idx].dl)) {
|
||||
cp->elements[idx].dl = new_dl;
|
||||
cpudl_heapify(cp, idx);
|
||||
} else {
|
||||
cp->elements[idx].dl = new_dl;
|
||||
while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
|
||||
cp->elements[idx].dl)) {
|
||||
cpudl_exchange(cp, idx, parent(idx));
|
||||
idx = parent(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline int cpudl_maximum(struct cpudl *cp)
|
||||
{
|
||||
return cp->elements[0].cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
* cpudl_find - find the best (later-dl) CPU in the system
|
||||
* @cp: the cpudl max-heap context
|
||||
* @p: the task
|
||||
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
|
||||
*
|
||||
* Returns: int - best CPU (heap maximum if suitable)
|
||||
*/
|
||||
int cpudl_find(struct cpudl *cp, struct task_struct *p,
|
||||
struct cpumask *later_mask)
|
||||
{
|
||||
int best_cpu = -1;
|
||||
const struct sched_dl_entity *dl_se = &p->dl;
|
||||
|
||||
if (later_mask && cpumask_and(later_mask, cp->free_cpus,
|
||||
&p->cpus_allowed) && cpumask_and(later_mask,
|
||||
later_mask, cpu_active_mask)) {
|
||||
best_cpu = cpumask_any(later_mask);
|
||||
goto out;
|
||||
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
|
||||
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
|
||||
best_cpu = cpudl_maximum(cp);
|
||||
if (later_mask)
|
||||
cpumask_set_cpu(best_cpu, later_mask);
|
||||
}
|
||||
|
||||
out:
|
||||
WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
|
||||
|
||||
return best_cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
* cpudl_set - update the cpudl max-heap
|
||||
* @cp: the cpudl max-heap context
|
||||
* @cpu: the target cpu
|
||||
* @dl: the new earliest deadline for this cpu
|
||||
*
|
||||
* Notes: assumes cpu_rq(cpu)->lock is locked
|
||||
*
|
||||
* Returns: (void)
|
||||
*/
|
||||
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
|
||||
{
|
||||
int old_idx, new_cpu;
|
||||
unsigned long flags;
|
||||
|
||||
WARN_ON(cpu > num_present_cpus());
|
||||
|
||||
raw_spin_lock_irqsave(&cp->lock, flags);
|
||||
old_idx = cp->cpu_to_idx[cpu];
|
||||
if (!is_valid) {
|
||||
/* remove item */
|
||||
if (old_idx == IDX_INVALID) {
|
||||
/*
|
||||
* Nothing to remove if old_idx was invalid.
|
||||
* This could happen if a rq_offline_dl is
|
||||
* called for a CPU without -dl tasks running.
|
||||
*/
|
||||
goto out;
|
||||
}
|
||||
new_cpu = cp->elements[cp->size - 1].cpu;
|
||||
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
|
||||
cp->elements[old_idx].cpu = new_cpu;
|
||||
cp->size--;
|
||||
cp->cpu_to_idx[new_cpu] = old_idx;
|
||||
cp->cpu_to_idx[cpu] = IDX_INVALID;
|
||||
while (old_idx > 0 && dl_time_before(
|
||||
cp->elements[parent(old_idx)].dl,
|
||||
cp->elements[old_idx].dl)) {
|
||||
cpudl_exchange(cp, old_idx, parent(old_idx));
|
||||
old_idx = parent(old_idx);
|
||||
}
|
||||
cpumask_set_cpu(cpu, cp->free_cpus);
|
||||
cpudl_heapify(cp, old_idx);
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (old_idx == IDX_INVALID) {
|
||||
cp->size++;
|
||||
cp->elements[cp->size - 1].dl = 0;
|
||||
cp->elements[cp->size - 1].cpu = cpu;
|
||||
cp->cpu_to_idx[cpu] = cp->size - 1;
|
||||
cpudl_change_key(cp, cp->size - 1, dl);
|
||||
cpumask_clear_cpu(cpu, cp->free_cpus);
|
||||
} else {
|
||||
cpudl_change_key(cp, old_idx, dl);
|
||||
}
|
||||
|
||||
out:
|
||||
raw_spin_unlock_irqrestore(&cp->lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* cpudl_init - initialize the cpudl structure
|
||||
* @cp: the cpudl max-heap context
|
||||
*/
|
||||
int cpudl_init(struct cpudl *cp)
|
||||
{
|
||||
int i;
|
||||
|
||||
memset(cp, 0, sizeof(*cp));
|
||||
raw_spin_lock_init(&cp->lock);
|
||||
cp->size = 0;
|
||||
for (i = 0; i < NR_CPUS; i++)
|
||||
cp->cpu_to_idx[i] = IDX_INVALID;
|
||||
if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
|
||||
return -ENOMEM;
|
||||
cpumask_setall(cp->free_cpus);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
* cpudl_cleanup - clean up the cpudl structure
* @cp: the cpudl max-heap context
*/
void cpudl_cleanup(struct cpudl *cp)
{
/*
* nothing to do for the moment
*/
}
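
Taken together, cpudeadline.c keeps a max-heap of per-CPU earliest deadlines plus the cpu_to_idx[] reverse map, so one CPU's key can be located and re-sifted in O(log n). For readers who want to poke at the sift-up behaviour outside the kernel, here is a compact user-space rendition of the same bookkeeping (hypothetical names, fixed CPU count, no locking, and only the sift-up path; shrinking a key needs the sift-down pass that cpudl_change_key()/cpudl_heapify() provide above):

#include <stdio.h>
#include <stdint.h>

#define NCPU        4
#define IDX_INVALID (-1)

struct item { uint64_t dl; int cpu; };

static struct item heap[NCPU];
static int cpu_to_idx[NCPU] = { IDX_INVALID, IDX_INVALID, IDX_INVALID, IDX_INVALID };
static int size;

static int parent(int i) { return (i - 1) >> 1; }
static int dl_before(uint64_t a, uint64_t b) { return (int64_t)(a - b) < 0; }

static void exchange(int a, int b)
{
	struct item t = heap[a];

	heap[a] = heap[b];
	heap[b] = t;
	cpu_to_idx[heap[a].cpu] = a;
	cpu_to_idx[heap[b].cpu] = b;
}

/* Insert or grow one CPU's key, then sift up towards the root. */
static void set_dl(int cpu, uint64_t dl)
{
	int idx = cpu_to_idx[cpu];

	if (idx == IDX_INVALID) {
		idx = size++;
		heap[idx].cpu = cpu;
		cpu_to_idx[cpu] = idx;
	}
	heap[idx].dl = dl;
	while (idx > 0 && dl_before(heap[parent(idx)].dl, heap[idx].dl)) {
		exchange(idx, parent(idx));
		idx = parent(idx);
	}
}

int main(void)
{
	set_dl(0, 300); set_dl(1, 100); set_dl(2, 900); set_dl(3, 500);
	printf("latest-deadline cpu: %d (dl=%llu)\n",
	       heap[0].cpu, (unsigned long long)heap[0].dl);   /* cpu 2, dl 900 */
	return 0;
}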
kernel/sched/cpudeadline.h (new file, 33 lines)
@ -0,0 +1,33 @@
|
||||
#ifndef _LINUX_CPUDL_H
|
||||
#define _LINUX_CPUDL_H
|
||||
|
||||
#include <linux/sched.h>
|
||||
|
||||
#define IDX_INVALID -1
|
||||
|
||||
struct array_item {
|
||||
u64 dl;
|
||||
int cpu;
|
||||
};
|
||||
|
||||
struct cpudl {
|
||||
raw_spinlock_t lock;
|
||||
int size;
|
||||
int cpu_to_idx[NR_CPUS];
|
||||
struct array_item elements[NR_CPUS];
|
||||
cpumask_var_t free_cpus;
|
||||
};
|
||||
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
int cpudl_find(struct cpudl *cp, struct task_struct *p,
|
||||
struct cpumask *later_mask);
|
||||
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
|
||||
int cpudl_init(struct cpudl *cp);
|
||||
void cpudl_cleanup(struct cpudl *cp);
|
||||
#else
|
||||
#define cpudl_set(cp, cpu, dl) do { } while (0)
|
||||
#define cpudl_init() do { } while (0)
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#endif /* _LINUX_CPUDL_H */
|
kernel/sched/deadline.c (new file, 1640 lines)
File diff suppressed because it is too large
@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
|
||||
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
|
||||
#endif
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
|
||||
SEQ_printf(m, " %d", task_node(p));
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
SEQ_printf(m, " %s", task_group_path(task_group(p)));
|
||||
@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)
|
||||
PN(cpu_clk);
|
||||
P(jiffies);
|
||||
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
||||
P(sched_clock_stable);
|
||||
P(sched_clock_stable());
|
||||
#endif
|
||||
#undef PN
|
||||
#undef P
|
||||
|
@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
|
||||
return max(smin, smax);
|
||||
}
|
||||
|
||||
/*
|
||||
* Once a preferred node is selected the scheduler balancer will prefer moving
|
||||
* a task to that node for sysctl_numa_balancing_settle_count number of PTE
|
||||
* scans. This will give the process the chance to accumulate more faults on
|
||||
* the preferred node but still allow the scheduler to move the task again if
|
||||
* the nodes CPUs are overloaded.
|
||||
*/
|
||||
unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
|
||||
|
||||
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
rq->nr_numa_running += (p->numa_preferred_nid != -1);
|
||||
@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
|
||||
if (!p->numa_group)
|
||||
return 0;
|
||||
|
||||
return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
|
||||
return p->numa_group->faults[task_faults_idx(nid, 0)] +
|
||||
p->numa_group->faults[task_faults_idx(nid, 1)];
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1023,7 +1015,7 @@ struct task_numa_env {
|
||||
|
||||
struct numa_stats src_stats, dst_stats;
|
||||
|
||||
int imbalance_pct, idx;
|
||||
int imbalance_pct;
|
||||
|
||||
struct task_struct *best_task;
|
||||
long best_imp;
|
||||
@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
|
||||
* elsewhere, so there is no point in (re)trying.
|
||||
*/
|
||||
if (unlikely(!sd)) {
|
||||
p->numa_preferred_nid = cpu_to_node(task_cpu(p));
|
||||
p->numa_preferred_nid = task_node(p);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -1278,7 +1270,7 @@ static void numa_migrate_preferred(struct task_struct *p)
|
||||
p->numa_migrate_retry = jiffies + HZ;
|
||||
|
||||
/* Success if task is already running on preferred CPU */
|
||||
if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
|
||||
if (task_node(p) == p->numa_preferred_nid)
|
||||
return;
|
||||
|
||||
/* Otherwise, try migrate to a CPU on the preferred node */
|
||||
@ -1350,7 +1342,6 @@ static void update_task_scan_period(struct task_struct *p,
|
||||
* scanning faster if shared accesses dominate as it may
|
||||
* simply bounce migrations uselessly
|
||||
*/
|
||||
period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
|
||||
ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
|
||||
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
|
||||
}
|
||||
@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
|
||||
*/
|
||||
static struct sched_group *
|
||||
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
||||
int this_cpu, int load_idx)
|
||||
int this_cpu, int sd_flag)
|
||||
{
|
||||
struct sched_group *idlest = NULL, *group = sd->groups;
|
||||
unsigned long min_load = ULONG_MAX, this_load = 0;
|
||||
int load_idx = sd->forkexec_idx;
|
||||
int imbalance = 100 + (sd->imbalance_pct-100)/2;
|
||||
|
||||
if (sd_flag & SD_BALANCE_WAKE)
|
||||
load_idx = sd->wake_idx;
|
||||
|
||||
do {
|
||||
unsigned long load, avg_load;
|
||||
int local_group;
|
||||
@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
||||
}
|
||||
|
||||
while (sd) {
|
||||
int load_idx = sd->forkexec_idx;
|
||||
struct sched_group *group;
|
||||
int weight;
|
||||
|
||||
@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
||||
continue;
|
||||
}
|
||||
|
||||
if (sd_flag & SD_BALANCE_WAKE)
|
||||
load_idx = sd->wake_idx;
|
||||
|
||||
group = find_idlest_group(sd, p, cpu, load_idx);
|
||||
group = find_idlest_group(sd, p, cpu, sd_flag);
|
||||
if (!group) {
|
||||
sd = sd->child;
|
||||
continue;
|
||||
@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
||||
struct sched_group *group, int load_idx,
|
||||
int local_group, struct sg_lb_stats *sgs)
|
||||
{
|
||||
unsigned long nr_running;
|
||||
unsigned long load;
|
||||
int i;
|
||||
|
||||
@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
||||
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
|
||||
struct rq *rq = cpu_rq(i);
|
||||
|
||||
nr_running = rq->nr_running;
|
||||
|
||||
/* Bias balancing toward cpus of our domain */
|
||||
if (local_group)
|
||||
load = target_load(i, load_idx);
|
||||
@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
||||
load = source_load(i, load_idx);
|
||||
|
||||
sgs->group_load += load;
|
||||
sgs->sum_nr_running += nr_running;
|
||||
sgs->sum_nr_running += rq->nr_running;
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
sgs->nr_numa_running += rq->nr_numa_running;
|
||||
sgs->nr_preferred_running += rq->nr_preferred_running;
|
||||
@ -6521,7 +6509,7 @@ static struct {
|
||||
unsigned long next_balance; /* in jiffy units */
|
||||
} nohz ____cacheline_aligned;
|
||||
|
||||
static inline int find_new_ilb(int call_cpu)
|
||||
static inline int find_new_ilb(void)
|
||||
{
|
||||
int ilb = cpumask_first(nohz.idle_cpus_mask);
|
||||
|
||||
@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
|
||||
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
|
||||
* CPU (if there is one).
|
||||
*/
|
||||
static void nohz_balancer_kick(int cpu)
|
||||
static void nohz_balancer_kick(void)
|
||||
{
|
||||
int ilb_cpu;
|
||||
|
||||
nohz.next_balance++;
|
||||
|
||||
ilb_cpu = find_new_ilb(cpu);
|
||||
ilb_cpu = find_new_ilb();
|
||||
|
||||
if (ilb_cpu >= nr_cpu_ids)
|
||||
return;
|
||||
@ -6652,10 +6640,10 @@ void update_max_interval(void)
|
||||
*
|
||||
* Balancing parameters are set up in init_sched_domains.
|
||||
*/
|
||||
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
|
||||
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
{
|
||||
int continue_balancing = 1;
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
int cpu = rq->cpu;
|
||||
unsigned long interval;
|
||||
struct sched_domain *sd;
|
||||
/* Earliest time when we have to do rebalance again */
|
||||
@ -6752,9 +6740,9 @@ out:
|
||||
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
|
||||
* rebalancing for all the cpus for whom scheduler ticks are stopped.
|
||||
*/
|
||||
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
|
||||
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
|
||||
{
|
||||
struct rq *this_rq = cpu_rq(this_cpu);
|
||||
int this_cpu = this_rq->cpu;
|
||||
struct rq *rq;
|
||||
int balance_cpu;
|
||||
|
||||
@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
|
||||
update_idle_cpu_load(rq);
|
||||
raw_spin_unlock_irq(&rq->lock);
|
||||
|
||||
rebalance_domains(balance_cpu, CPU_IDLE);
|
||||
rebalance_domains(rq, CPU_IDLE);
|
||||
|
||||
if (time_after(this_rq->next_balance, rq->next_balance))
|
||||
this_rq->next_balance = rq->next_balance;
|
||||
@ -6800,14 +6788,14 @@ end:
|
||||
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
|
||||
* domain span are idle.
|
||||
*/
|
||||
static inline int nohz_kick_needed(struct rq *rq, int cpu)
|
||||
static inline int nohz_kick_needed(struct rq *rq)
|
||||
{
|
||||
unsigned long now = jiffies;
|
||||
struct sched_domain *sd;
|
||||
struct sched_group_power *sgp;
|
||||
int nr_busy;
|
||||
int nr_busy, cpu = rq->cpu;
|
||||
|
||||
if (unlikely(idle_cpu(cpu)))
|
||||
if (unlikely(rq->idle_balance))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
@ -6856,7 +6844,7 @@ need_kick:
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
|
||||
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
|
||||
*/
|
||||
static void run_rebalance_domains(struct softirq_action *h)
|
||||
{
|
||||
int this_cpu = smp_processor_id();
|
||||
struct rq *this_rq = cpu_rq(this_cpu);
|
||||
struct rq *this_rq = this_rq();
|
||||
enum cpu_idle_type idle = this_rq->idle_balance ?
|
||||
CPU_IDLE : CPU_NOT_IDLE;
|
||||
|
||||
rebalance_domains(this_cpu, idle);
|
||||
rebalance_domains(this_rq, idle);
|
||||
|
||||
/*
|
||||
* If this cpu has a pending nohz_balance_kick, then do the
|
||||
* balancing on behalf of the other idle cpus whose ticks are
|
||||
* stopped.
|
||||
*/
|
||||
nohz_idle_balance(this_cpu, idle);
|
||||
nohz_idle_balance(this_rq, idle);
|
||||
}
|
||||
|
||||
static inline int on_null_domain(int cpu)
|
||||
static inline int on_null_domain(struct rq *rq)
|
||||
{
|
||||
return !rcu_dereference_sched(cpu_rq(cpu)->sd);
|
||||
return !rcu_dereference_sched(rq->sd);
|
||||
}
|
||||
|
||||
/*
|
||||
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
|
||||
*/
|
||||
void trigger_load_balance(struct rq *rq, int cpu)
|
||||
void trigger_load_balance(struct rq *rq)
|
||||
{
|
||||
/* Don't need to rebalance while attached to NULL domain */
|
||||
if (time_after_eq(jiffies, rq->next_balance) &&
|
||||
likely(!on_null_domain(cpu)))
|
||||
if (unlikely(on_null_domain(rq)))
|
||||
return;
|
||||
|
||||
if (time_after_eq(jiffies, rq->next_balance))
|
||||
raise_softirq(SCHED_SOFTIRQ);
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
|
||||
nohz_balancer_kick(cpu);
|
||||
if (nohz_kick_needed(rq))
|
||||
nohz_balancer_kick();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1738,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
|
||||
!test_tsk_need_resched(rq->curr) &&
|
||||
has_pushable_tasks(rq) &&
|
||||
p->nr_cpus_allowed > 1 &&
|
||||
rt_task(rq->curr) &&
|
||||
(dl_task(rq->curr) || rt_task(rq->curr)) &&
|
||||
(rq->curr->nr_cpus_allowed < 2 ||
|
||||
rq->curr->prio <= p->prio))
|
||||
push_rt_tasks(rq);
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/sysctl.h>
|
||||
#include <linux/sched/rt.h>
|
||||
#include <linux/sched/deadline.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/stop_machine.h>
|
||||
@ -9,6 +10,7 @@
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "cpupri.h"
|
||||
#include "cpudeadline.h"
|
||||
#include "cpuacct.h"
|
||||
|
||||
struct rq;
|
||||
@ -72,6 +74,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
|
||||
#define NICE_0_LOAD SCHED_LOAD_SCALE
|
||||
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
|
||||
|
||||
/*
|
||||
* Single value that decides SCHED_DEADLINE internal math precision.
|
||||
* 10 -> just above 1us
|
||||
* 9 -> just above 0.5us
|
||||
*/
|
||||
#define DL_SCALE (10)
|
||||
|
||||
/*
|
||||
* These are the 'tuning knobs' of the scheduler:
|
||||
*/
|
||||
@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq);
|
||||
*/
|
||||
#define RUNTIME_INF ((u64)~0ULL)
|
||||
|
||||
static inline int fair_policy(int policy)
|
||||
{
|
||||
return policy == SCHED_NORMAL || policy == SCHED_BATCH;
|
||||
}
|
||||
|
||||
static inline int rt_policy(int policy)
|
||||
{
|
||||
if (policy == SCHED_FIFO || policy == SCHED_RR)
|
||||
return 1;
|
||||
return 0;
|
||||
return policy == SCHED_FIFO || policy == SCHED_RR;
|
||||
}
|
||||
|
||||
static inline int dl_policy(int policy)
|
||||
{
|
||||
return policy == SCHED_DEADLINE;
|
||||
}
|
||||
|
||||
static inline int task_has_rt_policy(struct task_struct *p)
|
||||
@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
|
||||
return rt_policy(p->policy);
|
||||
}
|
||||
|
||||
static inline int task_has_dl_policy(struct task_struct *p)
|
||||
{
|
||||
return dl_policy(p->policy);
|
||||
}
|
||||
|
||||
static inline bool dl_time_before(u64 a, u64 b)
|
||||
{
|
||||
return (s64)(a - b) < 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Tells if entity @a should preempt entity @b.
|
||||
*/
|
||||
static inline bool
|
||||
dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
|
||||
{
|
||||
return dl_time_before(a->deadline, b->deadline);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the priority-queue data structure of the RT scheduling class:
|
||||
*/
|
||||
@ -108,6 +144,47 @@ struct rt_bandwidth {
|
||||
u64 rt_runtime;
|
||||
struct hrtimer rt_period_timer;
|
||||
};
|
||||
/*
|
||||
* To keep the bandwidth of -deadline tasks and groups under control
|
||||
* we need some place where:
|
||||
* - store the maximum -deadline bandwidth of the system (the group);
|
||||
* - cache the fraction of that bandwidth that is currently allocated.
|
||||
*
|
||||
* This is all done in the data structure below. It is similar to the
|
||||
* one used for RT-throttling (rt_bandwidth), with the main difference
|
||||
* that, since here we are only interested in admission control, we
|
||||
* do not decrease any runtime while the group "executes", neither we
|
||||
* need a timer to replenish it.
|
||||
*
|
||||
* With respect to SMP, the bandwidth is given on a per-CPU basis,
|
||||
* meaning that:
|
||||
* - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
|
||||
* - dl_total_bw array contains, in the i-eth element, the currently
|
||||
* allocated bandwidth on the i-eth CPU.
|
||||
* Moreover, groups consume bandwidth on each CPU, while tasks only
|
||||
* consume bandwidth on the CPU they're running on.
|
||||
* Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
|
||||
* that will be shown the next time the proc or cgroup controls will
* be read. It in turn can be changed by writing on its own
* control.
|
||||
*/
|
||||
struct dl_bandwidth {
|
||||
raw_spinlock_t dl_runtime_lock;
|
||||
u64 dl_runtime;
|
||||
u64 dl_period;
|
||||
};
|
||||
|
||||
static inline int dl_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
}

extern struct dl_bw *dl_bw_of(int i);

struct dl_bw {
raw_spinlock_t lock;
u64 bw, total_bw;
};
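
dl_bw.bw and dl_bw.total_bw hold CPU utilizations as fixed-point fractions produced by to_ratio(), declared later in this header; the admission test then reduces to "does the candidate's runtime/period still fit under the cap". The sketch below works that arithmetic through in plain C; the 1 << 20 scale and the 95% default cap (from the rt_runtime/rt_period sysctls referenced by dl_bandwidth_enabled()) are assumptions about the implementation, not taken from this hunk:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20   /* assumed fixed-point scale used by to_ratio() */

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* Assumed cap: 95% of a CPU, mirroring the rt_runtime/rt_period defaults. */
	uint64_t cap      = to_ratio(1000000, 950000);
	uint64_t total_bw = to_ratio(100000000ULL, 30000000ULL); /* 30ms every 100ms already admitted */
	uint64_t new_bw   = to_ratio(10000000ULL, 5000000ULL);   /* candidate: 5ms every 10ms */

	if (total_bw + new_bw <= cap)
		printf("admit: %.2f + %.2f <= %.2f\n",
		       total_bw / (double)(1 << BW_SHIFT),
		       new_bw   / (double)(1 << BW_SHIFT),
		       cap      / (double)(1 << BW_SHIFT));
	else
		printf("reject\n");
	return 0;
}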
|
||||
|
||||
extern struct mutex sched_domains_mutex;
|
||||
|
||||
@ -364,6 +441,42 @@ struct rt_rq {
|
||||
#endif
|
||||
};
|
||||
|
||||
/* Deadline class' related fields in a runqueue */
|
||||
struct dl_rq {
|
||||
/* runqueue is an rbtree, ordered by deadline */
|
||||
struct rb_root rb_root;
|
||||
struct rb_node *rb_leftmost;
|
||||
|
||||
unsigned long dl_nr_running;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* Deadline values of the currently executing and the
|
||||
* earliest ready task on this rq. Caching these facilitates
|
||||
* the decision wether or not a ready but not running task
|
||||
* should migrate somewhere else.
|
||||
*/
|
||||
struct {
|
||||
u64 curr;
|
||||
u64 next;
|
||||
} earliest_dl;
|
||||
|
||||
unsigned long dl_nr_migratory;
|
||||
unsigned long dl_nr_total;
|
||||
int overloaded;
|
||||
|
||||
/*
|
||||
* Tasks on this rq that can be pushed away. They are kept in
|
||||
* an rb-tree, ordered by tasks' deadlines, with caching
|
||||
* of the leftmost (earliest deadline) element.
|
||||
*/
|
||||
struct rb_root pushable_dl_tasks_root;
|
||||
struct rb_node *pushable_dl_tasks_leftmost;
|
||||
#else
|
||||
struct dl_bw dl_bw;
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
/*
|
||||
@ -381,6 +494,15 @@ struct root_domain {
|
||||
cpumask_var_t span;
|
||||
cpumask_var_t online;
|
||||
|
||||
/*
|
||||
* The bit corresponding to a CPU gets set here if such CPU has more
|
||||
* than one runnable -deadline task (as it is below for RT tasks).
|
||||
*/
|
||||
cpumask_var_t dlo_mask;
|
||||
atomic_t dlo_count;
|
||||
struct dl_bw dl_bw;
|
||||
struct cpudl cpudl;
|
||||
|
||||
/*
|
||||
* The "RT overload" flag: it gets set if a CPU has more than
|
||||
* one runnable RT task.
|
||||
@ -432,6 +554,7 @@ struct rq {
|
||||
|
||||
struct cfs_rq cfs;
|
||||
struct rt_rq rt;
|
||||
struct dl_rq dl;
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
/* list of leaf cfs_rq on this cpu: */
|
||||
@ -827,8 +950,6 @@ static inline u64 global_rt_runtime(void)
|
||||
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static inline int task_current(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
return rq->curr == p;
|
||||
@ -988,6 +1109,7 @@ static const u32 prio_to_wmult[40] = {
|
||||
#else
|
||||
#define ENQUEUE_WAKING 0
|
||||
#endif
|
||||
#define ENQUEUE_REPLENISH 8
|
||||
|
||||
#define DEQUEUE_SLEEP 1
|
||||
|
||||
@ -1023,6 +1145,7 @@ struct sched_class {
|
||||
void (*set_curr_task) (struct rq *rq);
|
||||
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
|
||||
void (*task_fork) (struct task_struct *p);
|
||||
void (*task_dead) (struct task_struct *p);
|
||||
|
||||
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
|
||||
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
|
||||
@ -1042,6 +1165,7 @@ struct sched_class {
|
||||
for (class = sched_class_highest; class; class = class->next)
|
||||
|
||||
extern const struct sched_class stop_sched_class;
|
||||
extern const struct sched_class dl_sched_class;
|
||||
extern const struct sched_class rt_sched_class;
|
||||
extern const struct sched_class fair_sched_class;
|
||||
extern const struct sched_class idle_sched_class;
|
||||
@ -1051,7 +1175,7 @@ extern const struct sched_class idle_sched_class;
|
||||
|
||||
extern void update_group_power(struct sched_domain *sd, int cpu);
|
||||
|
||||
extern void trigger_load_balance(struct rq *rq, int cpu);
|
||||
extern void trigger_load_balance(struct rq *rq);
|
||||
extern void idle_balance(int this_cpu, struct rq *this_rq);
|
||||
|
||||
extern void idle_enter_fair(struct rq *this_rq);
|
||||
@ -1068,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq)
|
||||
extern void sysrq_sched_debug_show(void);
|
||||
extern void sched_init_granularity(void);
|
||||
extern void update_max_interval(void);
|
||||
|
||||
extern void init_sched_dl_class(void);
|
||||
extern void init_sched_rt_class(void);
|
||||
extern void init_sched_fair_class(void);
|
||||
extern void init_sched_dl_class(void);
|
||||
|
||||
extern void resched_task(struct task_struct *p);
|
||||
extern void resched_cpu(int cpu);
|
||||
@ -1077,6 +1204,12 @@ extern void resched_cpu(int cpu);
|
||||
extern struct rt_bandwidth def_rt_bandwidth;
|
||||
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
|
||||
|
||||
extern struct dl_bandwidth def_dl_bandwidth;
|
||||
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
|
||||
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
|
||||
|
||||
unsigned long to_ratio(u64 period, u64 runtime);
|
||||
|
||||
extern void update_idle_cpu_load(struct rq *this_rq);
|
||||
|
||||
extern void init_task_runnable_average(struct task_struct *p);
|
||||
@ -1353,6 +1486,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
|
||||
|
||||
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
|
||||
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
|
||||
extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
|
||||
|
||||
extern void cfs_bandwidth_usage_inc(void);
|
||||
extern void cfs_bandwidth_usage_dec(void);
|
||||
|
@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
|
||||
* Simple, special scheduling class for the per-CPU stop tasks:
|
||||
*/
|
||||
const struct sched_class stop_sched_class = {
|
||||
.next = &rt_sched_class,
|
||||
.next = &dl_sched_class,
|
||||
|
||||
.enqueue_task = enqueue_task_stop,
|
||||
.dequeue_task = dequeue_task_stop,
|
||||
|
@ -89,7 +89,7 @@ static void wakeup_softirqd(void)
|
||||
* where hardirqs are disabled legitimately:
|
||||
*/
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
static void __local_bh_disable(unsigned long ip, unsigned int cnt)
|
||||
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
@ -107,33 +107,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
|
||||
/*
|
||||
* Were softirqs turned off above:
|
||||
*/
|
||||
if (softirq_count() == cnt)
|
||||
if (softirq_count() == (cnt & SOFTIRQ_MASK))
|
||||
trace_softirqs_off(ip);
|
||||
raw_local_irq_restore(flags);
|
||||
|
||||
if (preempt_count() == cnt)
|
||||
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
|
||||
}
|
||||
#else /* !CONFIG_TRACE_IRQFLAGS */
|
||||
static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
|
||||
{
|
||||
preempt_count_add(cnt);
|
||||
barrier();
|
||||
}
|
||||
EXPORT_SYMBOL(__local_bh_disable_ip);
|
||||
#endif /* CONFIG_TRACE_IRQFLAGS */
|
||||
|
||||
void local_bh_disable(void)
|
||||
{
|
||||
__local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(local_bh_disable);
|
||||
|
||||
static void __local_bh_enable(unsigned int cnt)
|
||||
{
|
||||
WARN_ON_ONCE(!irqs_disabled());
|
||||
|
||||
if (softirq_count() == cnt)
|
||||
if (softirq_count() == (cnt & SOFTIRQ_MASK))
|
||||
trace_softirqs_on(_RET_IP_);
|
||||
preempt_count_sub(cnt);
|
||||
}
|
||||
@ -151,7 +139,7 @@ void _local_bh_enable(void)
|
||||
|
||||
EXPORT_SYMBOL(_local_bh_enable);
|
||||
|
||||
static inline void _local_bh_enable_ip(unsigned long ip)
|
||||
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
|
||||
{
|
||||
WARN_ON_ONCE(in_irq() || irqs_disabled());
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
|
||||
* Keep preemption disabled until we are done with
|
||||
* softirq processing:
|
||||
*/
|
||||
preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
|
||||
preempt_count_sub(cnt - 1);
|
||||
|
||||
if (unlikely(!in_interrupt() && local_softirq_pending())) {
|
||||
/*
|
||||
@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
|
||||
#endif
|
||||
preempt_check_resched();
|
||||
}
|
||||
|
||||
void local_bh_enable(void)
{
_local_bh_enable_ip(_RET_IP_);
}
EXPORT_SYMBOL(local_bh_enable);

void local_bh_enable_ip(unsigned long ip)
{
_local_bh_enable_ip(ip);
}
EXPORT_SYMBOL(local_bh_enable_ip);
EXPORT_SYMBOL(__local_bh_enable_ip);
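
With __local_bh_disable_ip()/__local_bh_enable_ip() exported, the out-of-line local_bh_enable() and local_bh_enable_ip() removed above move into the header as trivial inlines. The corresponding include/linux/bottom_half.h change is not part of this section; presumably it looks along these lines:

/* Assumed shape of the header-side wrappers (include/linux/bottom_half.h);
 * only the __ip variants stay out of line in softirq.c. */
static inline void local_bh_disable(void)
{
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable_ip(unsigned long ip)
{
	__local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable(void)
{
	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}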
/*
|
||||
* We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
|
||||
@ -264,7 +241,7 @@ asmlinkage void __do_softirq(void)
|
||||
pending = local_softirq_pending();
|
||||
account_irq_enter_time(current);
|
||||
|
||||
__local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET);
|
||||
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
|
||||
in_hardirq = lockdep_softirq_start();
|
||||
|
||||
cpu = smp_processor_id();
|
||||
|
@ -384,13 +384,6 @@ static struct ctl_table kern_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "numa_balancing_settle_count",
|
||||
.data = &sysctl_numa_balancing_settle_count,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "numa_balancing_migrate_deferred",
|
||||
.data = &sysctl_numa_balancing_migrate_deferred,
|
||||
|
@ -177,7 +177,7 @@ static bool can_stop_full_tick(void)
|
||||
* TODO: kick full dynticks CPUs when
|
||||
* sched_clock_stable is set.
|
||||
*/
|
||||
if (!sched_clock_stable) {
|
||||
if (!sched_clock_stable()) {
|
||||
trace_tick_stop(0, "unstable sched clock\n");
|
||||
/*
|
||||
* Don't allow the user to think they can get
|
||||
|
@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
|
||||
if (unlikely(test_time_stamp(delta))) {
|
||||
int local_clock_stable = 1;
|
||||
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
||||
local_clock_stable = sched_clock_stable;
|
||||
local_clock_stable = sched_clock_stable();
|
||||
#endif
|
||||
WARN_ONCE(delta > (1ULL << 59),
|
||||
KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
|
||||
|

@ -16,6 +16,7 @@
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <trace/events/sched.h>
#include "trace.h"

@ -27,6 +28,8 @@ static int wakeup_cpu;
static int wakeup_current_cpu;
static unsigned wakeup_prio = -1;
static int wakeup_rt;
static int wakeup_dl;
static int tracing_dl = 0;

static arch_spinlock_t wakeup_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr)
{
wakeup_cpu = -1;
wakeup_prio = -1;
tracing_dl = 0;

if (wakeup_task)
put_task_struct(wakeup_task);
@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
tracing_record_cmdline(p);
tracing_record_cmdline(current);

if ((wakeup_rt && !rt_task(p)) ||
p->prio >= wakeup_prio ||
p->prio >= current->prio)
/*
* Semantic is like this:
* - wakeup tracer handles all tasks in the system, independently
* from their scheduling class;
* - wakeup_rt tracer handles tasks belonging to sched_dl and
* sched_rt class;
* - wakeup_dl handles tasks belonging to sched_dl class only.
*/
if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
(wakeup_rt && !dl_task(p) && !rt_task(p)) ||
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;

pc = preempt_count();
@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
arch_spin_lock(&wakeup_lock);

/* check for races. */
if (!tracer_enabled || p->prio >= wakeup_prio)
if (!tracer_enabled || tracing_dl ||
(!dl_task(p) && p->prio >= wakeup_prio))
goto out_locked;

/* reset the trace */
@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
wakeup_current_cpu = wakeup_cpu;
wakeup_prio = p->prio;

/*
* Once you start tracing a -deadline task, don't bother tracing
* another task until the first one wakes up.
*/
if (dl_task(p))
tracing_dl = 1;
else
tracing_dl = 0;

wakeup_task = p;
get_task_struct(wakeup_task);
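The reworked filter in probe_wakeup() is easier to read as a predicate. The helper below simply restates the early-return test from the hunk above, reusing the statics declared earlier in the file; the function name is made up, and the kernel keeps the test inline.

/* Sketch: "should this wakeup become the traced task?" -- a direct
 * restatement of the condition in probe_wakeup() above. */
static bool wakeup_candidate(struct task_struct *p, struct task_struct *curr)
{
	if (tracing_dl)				/* already latched onto a -deadline task */
		return false;
	if (wakeup_dl && !dl_task(p))		/* wakeup_dl: -deadline tasks only */
		return false;
	if (wakeup_rt && !dl_task(p) && !rt_task(p))
		return false;			/* wakeup_rt: -deadline or RT tasks */
	if (!dl_task(p) &&
	    (p->prio >= wakeup_prio || p->prio >= curr->prio))
		return false;			/* not a higher-priority candidate */
	return true;				/* -deadline tasks always qualify here */
}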

@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr)

static int wakeup_tracer_init(struct trace_array *tr)
{
wakeup_dl = 0;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
}

static int wakeup_rt_tracer_init(struct trace_array *tr)
{
wakeup_dl = 0;
wakeup_rt = 1;
return __wakeup_tracer_init(tr);
}

static int wakeup_dl_tracer_init(struct trace_array *tr)
{
wakeup_dl = 1;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
}

static void wakeup_tracer_reset(struct trace_array *tr)
{
int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.use_max_tr = true,
};

static struct tracer wakeup_dl_tracer __read_mostly =
{
.name = "wakeup_dl",
.init = wakeup_dl_tracer_init,
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
.wait_pipe = poll_wait_pipe,
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
.use_max_tr = true,
};

__init static int init_wakeup_tracer(void)
{
int ret;
@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void)
if (ret)
return ret;

ret = register_tracer(&wakeup_dl_tracer);
if (ret)
return ret;

return 0;
}
core_initcall(init_wakeup_tracer);
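With wakeup_dl registered by init_wakeup_tracer(), the new tracer is selected the same way as the other latency tracers, through the tracing directory. A userspace sketch follows; the paths assume debugfs is mounted at /sys/kernel/debug.

/* Userspace sketch: select the new wakeup_dl tracer and read back the
 * recorded maximum wakeup latency (microseconds). */
#include <stdio.h>

#define TRACING "/sys/kernel/debug/tracing/"

int main(void)
{
	FILE *f;
	char lat[64];

	f = fopen(TRACING "current_tracer", "w");
	if (!f) {
		perror("current_tracer");
		return 1;
	}
	fputs("wakeup_dl\n", f);
	fclose(f);

	/* ... let some -deadline activity happen ... */

	f = fopen(TRACING "tracing_max_latency", "r");
	if (f && fgets(lat, sizeof(lat), f))
		printf("max latency (us): %s", lat);
	if (f)
		fclose(f);
	return 0;
}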

@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
#ifdef CONFIG_SCHED_TRACER
static int trace_wakeup_test_thread(void *data)
{
/* Make this a RT thread, doesn't need to be too high */
static const struct sched_param param = { .sched_priority = 5 };
/* Make this a -deadline thread */
static const struct sched_attr attr = {
.sched_policy = SCHED_DEADLINE,
.sched_runtime = 100000ULL,
.sched_deadline = 10000000ULL,
.sched_period = 10000000ULL
};
struct completion *x = data;

sched_setscheduler(current, SCHED_FIFO, &param);
sched_setattr(current, &attr);

/* Make it know we have a new prio */
complete(x);
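The selftest switches its in-kernel test thread to SCHED_DEADLINE with sched_setattr(). From userspace the equivalent goes through the new syscall directly, since there is no libc wrapper at this point; the sketch below hand-defines struct sched_attr and falls back to the x86_64 syscall number, both of which are assumptions to check against the installed uapi headers, and the call needs root/CAP_SYS_NICE.

/* Userspace sketch: put the calling thread into SCHED_DEADLINE via the
 * new sched_setattr() syscall (pid 0 = current thread, flags = 0). */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314		/* x86_64 (assumed) */
#endif
#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/* 10 ms of budget ... */
	attr.sched_deadline = 100 * 1000 * 1000;	/* ... every 100 ms */
	attr.sched_period   = 100 * 1000 * 1000;

	if (syscall(__NR_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}
	/* periodic work would run here under the deadline guarantee */
	return 0;
}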

@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data)
/* we are awake, now wait to disappear */
while (!kthread_should_stop()) {
/*
* This is an RT task, do short sleeps to let
* others run.
* This will likely be the system top priority
* task, do short sleeps to let others run.
*/
msleep(100);
}
@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tracing_max_latency;
struct task_struct *p;
struct completion isrt;
struct completion is_ready;
unsigned long count;
int ret;

init_completion(&isrt);
init_completion(&is_ready);

/* create a high prio thread */
p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
/* create a -deadline thread */
p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
if (IS_ERR(p)) {
printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
return -1;
}

/* make sure the thread is running at an RT prio */
wait_for_completion(&isrt);
/* make sure the thread is running at -deadline policy */
wait_for_completion(&is_ready);

/* start the tracing */
ret = tracer_init(trace, tr);
@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)

while (p->on_rq) {
/*
* Sleep to make sure the RT thread is asleep too.
* Sleep to make sure the -deadline thread is asleep too.
* On virtual machines we can't rely on timings,
* but we want to make sure this test still works.
*/
msleep(100);
}

init_completion(&isrt);
init_completion(&is_ready);

wake_up_process(p);

/* Wait for the task to wake up */
wait_for_completion(&isrt);
wait_for_completion(&is_ready);

/* stop the tracing. */
tracing_stop();

@ -1623,11 +1623,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
(len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
!sysctl_tcp_low_latency &&
net_dma_find_channel()) {
preempt_enable_no_resched();
preempt_enable();
tp->ucopy.pinned_list =
dma_pin_iovec_pages(msg->msg_iov, len);
} else {
preempt_enable_no_resched();
preempt_enable();
}
}
#endif
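The tcp_recvmsg() hunk replaces preempt_enable_no_resched() with preempt_enable(): the former re-enables preemption without checking for a pending reschedule, so using it on a path that goes on to block or return to userspace silently drops a preemption point. The fragment below is a generic illustration of the pattern, not tcp_recvmsg() itself.

/* Illustration only: ending a preempt-disabled section. */
#include <linux/preempt.h>

static void finish_percpu_section(void)
{
	preempt_disable();
	/* ... touch per-CPU state, e.g. look up a DMA channel ... */
	preempt_enable();	/* folds in the need_resched check */

	/*
	 * preempt_enable_no_resched() here would skip that check and is
	 * only legitimate when a reschedule is known to follow anyway,
	 * e.g. immediately before calling schedule().
	 */
}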