cgroup/cpuset: Take isolated CPUs out of workqueue unbound cpumask

To make CPUs in isolated cpuset partition closer in isolation to
the boot time isolated CPUs specified in the "isolcpus" boot command
line option, we need to take those CPUs out of the workqueue unbound
cpumask so that work functions from the unbound workqueues won't run
on those CPUs.  Otherwise, they will interfere the user tasks running
on those isolated CPUs.

With the introduction of the workqueue_unbound_exclude_cpumask() helper
function in an earlier commit, those isolated CPUs can now be taken
out from the workqueue unbound cpumask.

This patch also updates cgroup-v2.rst to mention that isolated
CPUs will be excluded from unbound workqueue cpumask as well as
updating test_cpuset_prs.sh to verify the correctness of the new
*cpuset.cpus.isolated file, if available via cgroup_debug option.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
Waiman Long 2023-10-25 14:25:55 -04:00 committed by Tejun Heo
parent 11e5f407b6
commit 72c6303acf
3 changed files with 166 additions and 34 deletions

View File

@ -2358,11 +2358,11 @@ Cpuset Interface Files
partition or scheduling domain. The set of exclusive CPUs is
determined by the value of its "cpuset.cpus.exclusive.effective".
When set to "isolated", the CPUs in that partition will
be in an isolated state without any load balancing from the
scheduler. Tasks placed in such a partition with multiple
CPUs should be carefully distributed and bound to each of the
individual CPUs for optimal performance.
When set to "isolated", the CPUs in that partition will be in
an isolated state without any load balancing from the scheduler
and excluded from the unbound workqueues. Tasks placed in such
a partition with multiple CPUs should be carefully distributed
and bound to each of the individual CPUs for optimal performance.
A partition root ("root" or "isolated") can be in one of the
two possible states - valid or invalid. An invalid partition

View File

@ -25,6 +25,7 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
@ -43,6 +44,7 @@
#include <linux/sched/isolation.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@ -1444,25 +1446,31 @@ static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *x
* @new_prs: new partition_root_state
* @parent: parent cpuset
* @xcpus: exclusive CPUs to be added
* Return: true if isolated_cpus modified, false otherwise
*
* Remote partition if parent == NULL
*/
static void partition_xcpus_add(int new_prs, struct cpuset *parent,
static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
struct cpumask *xcpus)
{
bool isolcpus_updated;
WARN_ON_ONCE(new_prs < 0);
lockdep_assert_held(&callback_lock);
if (!parent)
parent = &top_cpuset;
if (parent == &top_cpuset)
cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
if (new_prs != parent->partition_root_state)
isolcpus_updated = (new_prs != parent->partition_root_state);
if (isolcpus_updated)
partition_xcpus_newstate(parent->partition_root_state, new_prs,
xcpus);
cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
return isolcpus_updated;
}
/*
@ -1470,12 +1478,15 @@ static void partition_xcpus_add(int new_prs, struct cpuset *parent,
* @old_prs: old partition_root_state
* @parent: parent cpuset
* @xcpus: exclusive CPUs to be removed
* Return: true if isolated_cpus modified, false otherwise
*
* Remote partition if parent == NULL
*/
static void partition_xcpus_del(int old_prs, struct cpuset *parent,
static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
struct cpumask *xcpus)
{
bool isolcpus_updated;
WARN_ON_ONCE(old_prs < 0);
lockdep_assert_held(&callback_lock);
if (!parent)
@ -1484,12 +1495,27 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent,
if (parent == &top_cpuset)
cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
if (old_prs != parent->partition_root_state)
isolcpus_updated = (old_prs != parent->partition_root_state);
if (isolcpus_updated)
partition_xcpus_newstate(old_prs, parent->partition_root_state,
xcpus);
cpumask_and(xcpus, xcpus, cpu_active_mask);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
return isolcpus_updated;
}
static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
{
int ret;
lockdep_assert_cpus_held();
if (!isolcpus_updated)
return;
ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
WARN_ON_ONCE(ret < 0);
}
/*
@ -1540,6 +1566,8 @@ static inline bool is_local_partition(struct cpuset *cs)
static int remote_partition_enable(struct cpuset *cs, int new_prs,
struct tmpmasks *tmp)
{
bool isolcpus_updated;
/*
* The user must have sysadmin privilege.
*/
@ -1561,7 +1589,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
return 0;
spin_lock_irq(&callback_lock);
partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
list_add(&cs->remote_sibling, &remote_children);
if (cs->use_parent_ecpus) {
struct cpuset *parent = parent_cs(cs);
@ -1570,13 +1598,13 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
parent->child_ecpus_count--;
}
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
*/
update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
return 1;
}
@ -1591,18 +1619,22 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
*/
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
bool isolcpus_updated;
compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
WARN_ON_ONCE(!is_remote_partition(cs));
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
spin_lock_irq(&callback_lock);
list_del_init(&cs->remote_sibling);
partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus);
isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
NULL, tmp->new_cpus);
cs->partition_root_state = -cs->partition_root_state;
if (!cs->prs_err)
cs->prs_err = PERR_INVCPUS;
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
@ -1625,6 +1657,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
{
bool adding, deleting;
int prs = cs->partition_root_state;
int isolcpus_updated = 0;
if (WARN_ON_ONCE(!is_remote_partition(cs)))
return;
@ -1649,10 +1682,11 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
spin_lock_irq(&callback_lock);
if (adding)
partition_xcpus_add(prs, NULL, tmp->addmask);
isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
if (deleting)
partition_xcpus_del(prs, NULL, tmp->delmask);
isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
@ -1774,6 +1808,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int part_error = PERR_NONE; /* Partition error? */
int subparts_delta = 0;
struct cpumask *xcpus; /* cs effective_xcpus */
int isolcpus_updated = 0;
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
@ -2010,15 +2045,18 @@ write_error:
* and vice versa.
*/
if (adding)
partition_xcpus_del(old_prs, parent, tmp->addmask);
isolcpus_updated += partition_xcpus_del(old_prs, parent,
tmp->addmask);
if (deleting)
partition_xcpus_add(new_prs, parent, tmp->delmask);
isolcpus_updated += partition_xcpus_add(new_prs, parent,
tmp->delmask);
if (is_partition_valid(parent)) {
parent->nr_subparts += subparts_delta;
WARN_ON_ONCE(parent->nr_subparts < 0);
}
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive(cs, new_prs);
@ -3082,6 +3120,7 @@ out:
else if (new_xcpus_state)
partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(new_xcpus_state);
/* Force update if switching back to member */
update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
@ -4370,6 +4409,30 @@ void cpuset_force_rebuild(void)
force_rebuild = true;
}
/*
* Attempt to acquire a cpus_read_lock while a hotplug operation may be in
* progress.
* Return: true if successful, false otherwise
*
* To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
* cpus_read_trylock() is used here to acquire the lock.
*/
static bool cpuset_hotplug_cpus_read_trylock(void)
{
int retries = 0;
while (!cpus_read_trylock()) {
/*
* CPU hotplug still in progress. Retry 5 times
* with a 10ms wait before bailing out.
*/
if (++retries > 5)
return false;
msleep(10);
}
return true;
}
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@ -4386,6 +4449,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
bool cpus_updated;
bool mems_updated;
bool remote;
int partcmd = -1;
struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@ -4417,11 +4481,13 @@ retry:
compute_partition_effective_cpumask(cs, &new_cpus);
if (remote && cpumask_empty(&new_cpus) &&
partition_is_populated(cs, NULL)) {
partition_is_populated(cs, NULL) &&
cpuset_hotplug_cpus_read_trylock()) {
remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
remote = false;
cpuset_force_rebuild();
cpus_read_unlock();
}
/*
@ -4432,18 +4498,28 @@ retry:
* partitions.
*/
if (is_local_partition(cs) && (!is_partition_valid(parent) ||
tasks_nocpu_error(parent, cs, &new_cpus))) {
update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
cpuset_force_rebuild();
}
tasks_nocpu_error(parent, cs, &new_cpus)))
partcmd = partcmd_invalidate;
/*
* On the other hand, an invalid partition root may be transitioned
* back to a regular one.
*/
else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp);
if (is_partition_valid(cs)) {
else if (is_partition_valid(parent) && is_partition_invalid(cs))
partcmd = partcmd_update;
/*
* cpus_read_lock needs to be held before calling
* update_parent_effective_cpumask(). To avoid circular lock
* dependency between cpuset_mutex and cpus_read_lock,
* cpus_read_trylock() is used here to acquire the lock.
*/
if (partcmd >= 0) {
if (!cpuset_hotplug_cpus_read_trylock())
goto update_tasks;
update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
cpus_read_unlock();
if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
compute_partition_effective_cpumask(cs, &new_cpus);
cpuset_force_rebuild();
}

View File

@ -232,11 +232,11 @@ TEST_MATRIX=(
" C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4-5 \
A1:P0,A2:P1,A3:P2,B1:P1 2-3"
" C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4 \
A1:P0,A2:P1,A3:P2,B1:P1 2-4"
A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3"
" C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:2,A3:3,B1:4 \
A1:P0,A2:P1,A3:P2,B1:P1 2-4"
A1:P0,A2:P1,A3:P2,B1:P1 2-4,3"
" C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1,A2:2-3,A3:4 \
A1:P0,A2:P2,A3:P1 2-4"
A1:P0,A2:P2,A3:P1 2-4,2-3"
" C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \
. . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \
A1:P0,A2:P-2,A3:P-1"
@ -248,7 +248,7 @@ TEST_MATRIX=(
" C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 . 0 A1:0-1,A2:1,A3:3 A1:P0,A3:P2 2-3"
" C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3"
" C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 P2:O3=0 . 0 A1:0-2,A2:1-2,A3: A1:P0,A3:P2 3"
" C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3"
" C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3,"
# An invalidated remote partition cannot self-recover from hotplug
" C0-3:S+ C1-3:S+ C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3,A2:1-3,A3:2 A1:P0,A3:P-2"
@ -376,7 +376,7 @@ write_cpu_online()
}
fi
echo $VAL > $CPUFILE
pause 0.01
pause 0.05
}
#
@ -508,12 +508,14 @@ dump_states()
XECPUS=$DIR/cpuset.cpus.exclusive.effective
PRS=$DIR/cpuset.cpus.partition
PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions
ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated
[[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)"
[[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)"
[[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)"
[[ -e $XECPUS ]] && echo "$XECPUS: $(cat $XECPUS)"
[[ -e $PRS ]] && echo "$PRS: $(cat $PRS)"
[[ -e $PCPUS ]] && echo "$PCPUS: $(cat $PCPUS)"
[[ -e $ISCPUS ]] && echo "$ISCPUS: $(cat $ISCPUS)"
done
}
@ -591,11 +593,17 @@ check_cgroup_states()
#
# Get isolated (including offline) CPUs by looking at
# /sys/kernel/debug/sched/domains and compare that with the expected value.
# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file,
# if available, and compare that with the expected value.
#
# Note that a sched domain of just 1 CPU will be considered isolated.
# Note that isolated CPUs from the sched/domains context include offline
# CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may
# not be included in the *cpuset.cpus.isolated control file which contains
# only CPUs in isolated partitions.
#
# $1 - expected isolated cpu list
# $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>}
# <isolcpus1> - expected sched/domains value
# <isolcpus2> - *cpuset.cpus.isolated value = <isolcpus1> if not defined
#
check_isolcpus()
{
@ -603,8 +611,38 @@ check_isolcpus()
ISOLCPUS=
LASTISOLCPU=
SCHED_DOMAINS=/sys/kernel/debug/sched/domains
ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated
if [[ $EXPECT_VAL = . ]]
then
EXPECT_VAL=
EXPECT_VAL2=
elif [[ $(expr $EXPECT_VAL : ".*,.*") > 0 ]]
then
set -- $(echo $EXPECT_VAL | sed -e "s/,/ /g")
EXPECT_VAL=$1
EXPECT_VAL2=$2
else
EXPECT_VAL2=$EXPECT_VAL
fi
#
# Check the debug isolated cpumask, if present
#
[[ -f $ISCPUS ]] && {
ISOLCPUS=$(cat $ISCPUS)
[[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && {
# Take a 50ms pause and try again
pause 0.05
ISOLCPUS=$(cat $ISCPUS)
}
[[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1
ISOLCPUS=
}
#
# Use the sched domain in debugfs to check isolated CPUs, if available
#
[[ -d $SCHED_DOMAINS ]] || return 0
[[ $EXPECT_VAL = . ]] && EXPECT_VAL=
for ((CPU=0; CPU < $NR_CPUS; CPU++))
do
@ -648,6 +686,22 @@ test_fail()
exit 1
}
#
# Check to see if there are unexpected isolated CPUs left
#
null_isolcpus_check()
{
[[ $VERBOSE -gt 0 ]] || return 0
pause 0.02
check_isolcpus "."
if [[ $? -ne 0 ]]
then
echo "Unexpected isolated CPUs: $ISOLCPUS"
dump_states
exit 1
fi
}
#
# Run cpuset state transition test
# $1 - test matrix name
@ -733,6 +787,7 @@ run_state_test()
echo "Effective cpus changed to $NEWLIST after test $I!"
exit 1
}
null_isolcpus_check
[[ $VERBOSE -gt 0 ]] && echo "Test $I done."
((I++))
done
@ -802,6 +857,7 @@ test_isolated()
console_msg "Cleaning up"
echo $$ > $CGROUP2/cgroup.procs
[[ -d A1 ]] && rmdir A1
null_isolcpus_check
}
#