- Avoid touching ~100 config files in order to be able to select the
 preemption model
 
 - clear cluster CPU masks too, on the CPU unplug path
 
 - prevent use-after-free in cfs
 
 - Prevent a race condition when updating CPU cache domains
 
 - Factor out the shared part of smp_prepare_cpus() into a common
 helper which can be called by both baremetal and Xen, in order to fix
 the booting of Xen PV guests
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmGQ8HcACgkQEsHwGGHe
 VUouoA//WAZ/dZu7IiM06JhZWswa2yNsdU8qQHys81lEqstaBqiWuZdg1qJTVIir
 2d0aN0keiPcsLyAsp1UJ2g/K/7D5vSJWDzsHKfEAToiAm8Tntai2LlSocWWfeSQm
 10grDHWpEHbj0hTHTA6HYOr2WbY4/LnR4cdL0WobIzivIrRTx49d0XUOUfWLP5KX
 60uM6dSjwpJrQUnvzk+bhGiHVmutFrEJy+UU/0o+nxkdhwraNiSbLi0007BGRCof
 6dokRRvLLR09dl1LMG51gVjQch4j/lCx6EWWUhYOFeV3I3gibSCNkmu7dpmMCBTR
 QWO01cR9gyFN4xQ2is4I36M5L0/8T+sbGvvXIXNDT/XWr0/p+g6p2mx0cd2XiYIr
 ZthGRcxxV/KGmxfPaygKS9tpQseMEIrdd6VjAnGfZ3OS6CtUvYt8d0B2Soj8FALQ
 N9fMXDIEP3uUZim8UvCT6HBKlj9LR5uI5n+dAQ6uzsenO9WqeGeldc/N26/+osdN
 vo4lNYTqiXJPhJvunYW5t4j5JnUa3grDHioAPWaQRJlWtEZBGKs9SXTcweg/KURb
 mNfe1RfSlGJt28RD3E18gXeSS7xWdKgpcVX1rmW/9tUjX04NNDWjq4sAzOj7c+Ir
 4sr78XgCY0pUxFaFYxvQWFUy7wcm0zAczo1RGUhcDTf1edDEvjo=
 =s2MX
 -----END PGP SIGNATURE-----

Merge tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

 - Avoid touching ~100 config files in order to be able to select the
   preemption model

 - clear cluster CPU masks too, on the CPU unplug path

 - prevent use-after-free in cfs

 - Prevent a race condition when updating CPU cache domains

 - Factor out the shared part of smp_prepare_cpus() into a common
   helper which can be called by both baremetal and Xen, in order to fix
   the booting of Xen PV guests

* tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  preempt: Restore preemption model selection configs
  arch_topology: Fix missing clear cluster_cpumask in remove_cpu_topology()
  sched/fair: Prevent dead task groups from regaining cfs_rq's
  sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain()
  x86/smp: Factor out parts of native_smp_prepare_cpus()
This commit is contained in:
Linus Torvalds 2021-11-14 09:39:03 -08:00
commit fc661f2dcb
13 changed files with 96 additions and 59 deletions

View File

@ -126,6 +126,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
void cpu_disable_common(void); void cpu_disable_common(void);
void native_smp_prepare_boot_cpu(void); void native_smp_prepare_boot_cpu(void);
void smp_prepare_cpus_common(void);
void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_prepare_cpus(unsigned int max_cpus);
void calculate_max_logical_packages(void); void calculate_max_logical_packages(void);
void native_smp_cpus_done(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus);

View File

@ -1350,12 +1350,7 @@ static void __init smp_get_logical_apicid(void)
cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
} }
/* void __init smp_prepare_cpus_common(void)
* Prepare for SMP bootup.
* @max_cpus: configured maximum number of CPUs, It is a legacy parameter
* for common interface support.
*/
void __init native_smp_prepare_cpus(unsigned int max_cpus)
{ {
unsigned int i; unsigned int i;
@ -1386,6 +1381,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology); set_sched_topology(x86_topology);
set_cpu_sibling_map(0); set_cpu_sibling_map(0);
}
/*
* Prepare for SMP bootup.
* @max_cpus: configured maximum number of CPUs, It is a legacy parameter
* for common interface support.
*/
void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
smp_prepare_cpus_common();
init_freq_invariance(false, false); init_freq_invariance(false, false);
smp_sanity_check(); smp_sanity_check();

View File

@ -225,7 +225,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void)
static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{ {
unsigned cpu; unsigned cpu;
unsigned int i;
if (skip_ioapic_setup) { if (skip_ioapic_setup) {
char *m = (max_cpus == 0) ? char *m = (max_cpus == 0) ?
@ -238,16 +237,9 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
} }
xen_init_lock_cpu(0); xen_init_lock_cpu(0);
smp_store_boot_cpu_info(); smp_prepare_cpus_common();
cpu_data(0).x86_max_cores = 1;
for_each_possible_cpu(i) { cpu_data(0).x86_max_cores = 1;
zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
set_cpu_sibling_map(0);
speculative_store_bypass_ht_init(); speculative_store_bypass_ht_init();

View File

@ -677,6 +677,8 @@ void remove_cpu_topology(unsigned int cpu)
cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
for_each_cpu(sibling, topology_sibling_cpumask(cpu)) for_each_cpu(sibling, topology_sibling_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
for_each_cpu(sibling, topology_cluster_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_cluster_cpumask(sibling));
for_each_cpu(sibling, topology_llc_cpumask(cpu)) for_each_cpu(sibling, topology_llc_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling)); cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));

View File

@ -85,7 +85,7 @@
struct completion; struct completion;
struct user; struct user;
#ifdef CONFIG_PREEMPT_VOLUNTARY #ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD
extern int __cond_resched(void); extern int __cond_resched(void);
# define might_resched() __cond_resched() # define might_resched() __cond_resched()

View File

@ -15,7 +15,7 @@
#else #else
#define MODULE_VERMAGIC_SMP "" #define MODULE_VERMAGIC_SMP ""
#endif #endif
#ifdef CONFIG_PREEMPT #ifdef CONFIG_PREEMPT_BUILD
#define MODULE_VERMAGIC_PREEMPT "preempt " #define MODULE_VERMAGIC_PREEMPT "preempt "
#elif defined(CONFIG_PREEMPT_RT) #elif defined(CONFIG_PREEMPT_RT)
#define MODULE_VERMAGIC_PREEMPT "preempt_rt " #define MODULE_VERMAGIC_PREEMPT "preempt_rt "

View File

@ -30,7 +30,7 @@ $(obj)/version.o: include/generated/compile.h
quiet_cmd_compile.h = CHK $@ quiet_cmd_compile.h = CHK $@
cmd_compile.h = \ cmd_compile.h = \
$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ $(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT_BUILD)" \
"$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)" "$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)"
include/generated/compile.h: FORCE include/generated/compile.h: FORCE

View File

@ -1,12 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config PREEMPT_NONE_BUILD
bool
config PREEMPT_VOLUNTARY_BUILD
bool
config PREEMPT_BUILD
bool
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
choice choice
prompt "Preemption Model" prompt "Preemption Model"
default PREEMPT_NONE_BEHAVIOUR default PREEMPT_NONE
config PREEMPT_NONE_BEHAVIOUR config PREEMPT_NONE
bool "No Forced Preemption (Server)" bool "No Forced Preemption (Server)"
select PREEMPT_NONE if !PREEMPT_DYNAMIC select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help help
This is the traditional Linux preemption model, geared towards This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the throughput. It will still provide good latencies most of the
@ -18,10 +29,10 @@ config PREEMPT_NONE_BEHAVIOUR
raw processing power of the kernel, irrespective of scheduling raw processing power of the kernel, irrespective of scheduling
latencies. latencies.
config PREEMPT_VOLUNTARY_BEHAVIOUR config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)" bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT depends on !ARCH_NO_PREEMPT
select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help help
This option reduces the latency of the kernel by adding more This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new "explicit preemption points" to the kernel code. These new
@ -37,10 +48,10 @@ config PREEMPT_VOLUNTARY_BEHAVIOUR
Select this if you are building a kernel for a desktop system. Select this if you are building a kernel for a desktop system.
config PREEMPT_BEHAVIOUR config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)" bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT depends on !ARCH_NO_PREEMPT
select PREEMPT select PREEMPT_BUILD
help help
This option reduces the latency of the kernel by making This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section) all kernel code (that is not executing in a critical section)
@ -58,7 +69,7 @@ config PREEMPT_BEHAVIOUR
config PREEMPT_RT config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)" bool "Fully Preemptible Kernel (Real-Time)"
depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC depends on EXPERT && ARCH_SUPPORTS_RT
select PREEMPTION select PREEMPTION
help help
This option turns the kernel into a real-time kernel by replacing This option turns the kernel into a real-time kernel by replacing
@ -75,17 +86,6 @@ config PREEMPT_RT
endchoice endchoice
config PREEMPT_NONE
bool
config PREEMPT_VOLUNTARY
bool
config PREEMPT
bool
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
config PREEMPT_COUNT config PREEMPT_COUNT
bool bool
@ -95,8 +95,8 @@ config PREEMPTION
config PREEMPT_DYNAMIC config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot" bool "Preemption behaviour defined on boot"
depends on HAVE_PREEMPT_DYNAMIC depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
select PREEMPT select PREEMPT_BUILD
default y default y
help help
This option allows to define the preemption model on the kernel This option allows to define the preemption model on the kernel

View File

@ -31,7 +31,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL; ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL; ag->tg->rt_rq = NULL;
#endif #endif
sched_offline_group(ag->tg); sched_release_group(ag->tg);
sched_destroy_group(ag->tg); sched_destroy_group(ag->tg);
} }

View File

@ -3726,6 +3726,9 @@ out:
bool cpus_share_cache(int this_cpu, int that_cpu) bool cpus_share_cache(int this_cpu, int that_cpu)
{ {
if (this_cpu == that_cpu)
return true;
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
} }
@ -6625,13 +6628,13 @@ __setup("preempt=", setup_preempt_mode);
static void __init preempt_dynamic_init(void) static void __init preempt_dynamic_init(void)
{ {
if (preempt_dynamic_mode == preempt_dynamic_undefined) { if (preempt_dynamic_mode == preempt_dynamic_undefined) {
if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) { if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
sched_dynamic_update(preempt_dynamic_none); sched_dynamic_update(preempt_dynamic_none);
} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) { } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
sched_dynamic_update(preempt_dynamic_voluntary); sched_dynamic_update(preempt_dynamic_voluntary);
} else { } else {
/* Default static call setting, nothing to do */ /* Default static call setting, nothing to do */
WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR)); WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
preempt_dynamic_mode = preempt_dynamic_full; preempt_dynamic_mode = preempt_dynamic_full;
pr_info("Dynamic Preempt: full\n"); pr_info("Dynamic Preempt: full\n");
} }
@ -9716,6 +9719,22 @@ static void sched_free_group(struct task_group *tg)
kmem_cache_free(task_group_cache, tg); kmem_cache_free(task_group_cache, tg);
} }
static void sched_free_group_rcu(struct rcu_head *rcu)
{
sched_free_group(container_of(rcu, struct task_group, rcu));
}
static void sched_unregister_group(struct task_group *tg)
{
unregister_fair_sched_group(tg);
unregister_rt_sched_group(tg);
/*
* We have to wait for yet another RCU grace period to expire, as
* print_cfs_stats() might run concurrently.
*/
call_rcu(&tg->rcu, sched_free_group_rcu);
}
/* allocate runqueue etc for a new task group */ /* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent) struct task_group *sched_create_group(struct task_group *parent)
{ {
@ -9759,25 +9778,35 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
} }
/* rcu callback to free various structures associated with a task group */ /* rcu callback to free various structures associated with a task group */
static void sched_free_group_rcu(struct rcu_head *rhp) static void sched_unregister_group_rcu(struct rcu_head *rhp)
{ {
/* Now it should be safe to free those cfs_rqs: */ /* Now it should be safe to free those cfs_rqs: */
sched_free_group(container_of(rhp, struct task_group, rcu)); sched_unregister_group(container_of(rhp, struct task_group, rcu));
} }
void sched_destroy_group(struct task_group *tg) void sched_destroy_group(struct task_group *tg)
{ {
/* Wait for possible concurrent references to cfs_rqs complete: */ /* Wait for possible concurrent references to cfs_rqs complete: */
call_rcu(&tg->rcu, sched_free_group_rcu); call_rcu(&tg->rcu, sched_unregister_group_rcu);
} }
void sched_offline_group(struct task_group *tg) void sched_release_group(struct task_group *tg)
{ {
unsigned long flags; unsigned long flags;
/* End participation in shares distribution: */ /*
unregister_fair_sched_group(tg); * Unlink first, to avoid walk_tg_tree_from() from finding us (via
* sched_cfs_period_timer()).
*
* For this to be effective, we have to wait for all pending users of
* this task group to leave their RCU critical section to ensure no new
* user will see our dying task group any more. Specifically ensure
* that tg_unthrottle_up() won't add decayed cfs_rq's to it.
*
* We therefore defer calling unregister_fair_sched_group() to
* sched_unregister_group() which is guaranteed to get called only after the
* current RCU grace period has expired.
*/
spin_lock_irqsave(&task_group_lock, flags); spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list); list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings); list_del_rcu(&tg->siblings);
@ -9896,7 +9925,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{ {
struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css);
sched_offline_group(tg); sched_release_group(tg);
} }
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@ -9906,7 +9935,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
/* /*
* Relies on the RCU grace period between css_released() and this. * Relies on the RCU grace period between css_released() and this.
*/ */
sched_free_group(tg); sched_unregister_group(tg);
} }
/* /*

View File

@ -11456,8 +11456,6 @@ void free_fair_sched_group(struct task_group *tg)
{ {
int i; int i;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
if (tg->cfs_rq) if (tg->cfs_rq)
kfree(tg->cfs_rq[i]); kfree(tg->cfs_rq[i]);
@ -11534,6 +11532,8 @@ void unregister_fair_sched_group(struct task_group *tg)
struct rq *rq; struct rq *rq;
int cpu; int cpu;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
if (tg->se[cpu]) if (tg->se[cpu])
remove_entity_load_avg(tg->se[cpu]); remove_entity_load_avg(tg->se[cpu]);

View File

@ -137,13 +137,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
return rt_rq->rq; return rt_rq->rq;
} }
void unregister_rt_sched_group(struct task_group *tg)
{
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
}
void free_rt_sched_group(struct task_group *tg) void free_rt_sched_group(struct task_group *tg)
{ {
int i; int i;
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
if (tg->rt_rq) if (tg->rt_rq)
kfree(tg->rt_rq[i]); kfree(tg->rt_rq[i]);
@ -250,6 +254,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return &rq->rt; return &rq->rt;
} }
void unregister_rt_sched_group(struct task_group *tg) { }
void free_rt_sched_group(struct task_group *tg) { } void free_rt_sched_group(struct task_group *tg) { }
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

View File

@ -488,6 +488,7 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
extern void unregister_rt_sched_group(struct task_group *tg);
extern void free_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg);
extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
@ -503,7 +504,7 @@ extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg, extern void sched_online_group(struct task_group *tg,
struct task_group *parent); struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg); extern void sched_destroy_group(struct task_group *tg);
extern void sched_offline_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk); extern void sched_move_task(struct task_struct *tsk);