cgroup/cpuset: Keep track of CPUs in isolated partitions

Add a new internal isolated_cpus mask to keep track of the CPUs that are in
isolated partitions. Expose that new cpumask as a new root-only control file
".cpuset.cpus.isolated".

tj: Updated patch description to reflect dropping __DEBUG__ prefix.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
Waiman Long 2023-10-25 14:25:54 -04:00 committed by Tejun Heo
parent 14060dfc48
commit 11e5f407b6

View File

@ -204,6 +204,11 @@ struct cpuset {
*/
static cpumask_var_t subpartitions_cpus;
/*
* Exclusive CPUs in isolated partitions
*/
static cpumask_var_t isolated_cpus;
/* List of remote partition root children */
static struct list_head remote_children;
@ -1317,6 +1322,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
*/
enum partition_cmd {
partcmd_enable, /* Enable partition root */
partcmd_enablei, /* Enable isolated partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's effective_cpus */
partcmd_invalidate, /* Make partition invalid */
@ -1418,6 +1424,74 @@ static void reset_partition_data(struct cpuset *cs)
}
}
/*
* partition_xcpus_newstate - Exclusive CPUs state change
* @old_prs: old partition_root_state
* @new_prs: new partition_root_state
* @xcpus: exclusive CPUs with state change
*/
static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs == new_prs);
if (new_prs == PRS_ISOLATED)
cpumask_or(isolated_cpus, isolated_cpus, xcpus);
else
cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
}
/*
* partition_xcpus_add - Add new exclusive CPUs to partition
* @new_prs: new partition_root_state
* @parent: parent cpuset
* @xcpus: exclusive CPUs to be added
*
* Remote partition if parent == NULL
*/
static void partition_xcpus_add(int new_prs, struct cpuset *parent,
struct cpumask *xcpus)
{
WARN_ON_ONCE(new_prs < 0);
lockdep_assert_held(&callback_lock);
if (!parent)
parent = &top_cpuset;
if (parent == &top_cpuset)
cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
if (new_prs != parent->partition_root_state)
partition_xcpus_newstate(parent->partition_root_state, new_prs,
xcpus);
cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
}
/*
* partition_xcpus_del - Remove exclusive CPUs from partition
* @old_prs: old partition_root_state
* @parent: parent cpuset
* @xcpus: exclusive CPUs to be removed
*
* Remote partition if parent == NULL
*/
static void partition_xcpus_del(int old_prs, struct cpuset *parent,
struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs < 0);
lockdep_assert_held(&callback_lock);
if (!parent)
parent = &top_cpuset;
if (parent == &top_cpuset)
cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
if (old_prs != parent->partition_root_state)
partition_xcpus_newstate(old_prs, parent->partition_root_state,
xcpus);
cpumask_and(xcpus, xcpus, cpu_active_mask);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
}
/*
* compute_effective_exclusive_cpumask - compute effective exclusive CPUs
* @cs: cpuset
@ -1456,13 +1530,15 @@ static inline bool is_local_partition(struct cpuset *cs)
/*
* remote_partition_enable - Enable current cpuset as a remote partition root
* @cs: the cpuset to update
* @new_prs: new partition_root_state
* @tmp: temparary masks
* Return: 1 if successful, 0 if error
*
* Enable the current cpuset to become a remote partition root taking CPUs
* directly from the top cpuset. cpuset_mutex must be held by the caller.
*/
static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
static int remote_partition_enable(struct cpuset *cs, int new_prs,
struct tmpmasks *tmp)
{
/*
* The user must have sysadmin privilege.
@ -1485,18 +1561,14 @@ static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
return 0;
spin_lock_irq(&callback_lock);
cpumask_andnot(top_cpuset.effective_cpus,
top_cpuset.effective_cpus, tmp->new_cpus);
cpumask_or(subpartitions_cpus,
subpartitions_cpus, tmp->new_cpus);
partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
list_add(&cs->remote_sibling, &remote_children);
if (cs->use_parent_ecpus) {
struct cpuset *parent = parent_cs(cs);
cs->use_parent_ecpus = false;
parent->child_ecpus_count--;
}
list_add(&cs->remote_sibling, &remote_children);
spin_unlock_irq(&callback_lock);
/*
@ -1524,13 +1596,8 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
spin_lock_irq(&callback_lock);
cpumask_andnot(subpartitions_cpus,
subpartitions_cpus, tmp->new_cpus);
cpumask_and(tmp->new_cpus,
tmp->new_cpus, cpu_active_mask);
cpumask_or(top_cpuset.effective_cpus,
top_cpuset.effective_cpus, tmp->new_cpus);
list_del_init(&cs->remote_sibling);
partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus);
cs->partition_root_state = -cs->partition_root_state;
if (!cs->prs_err)
cs->prs_err = PERR_INVCPUS;
@ -1557,6 +1624,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
struct tmpmasks *tmp)
{
bool adding, deleting;
int prs = cs->partition_root_state;
if (WARN_ON_ONCE(!is_remote_partition(cs)))
return;
@ -1580,20 +1648,10 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
goto invalidate;
spin_lock_irq(&callback_lock);
if (adding) {
cpumask_or(subpartitions_cpus,
subpartitions_cpus, tmp->addmask);
cpumask_andnot(top_cpuset.effective_cpus,
top_cpuset.effective_cpus, tmp->addmask);
}
if (deleting) {
cpumask_andnot(subpartitions_cpus,
subpartitions_cpus, tmp->delmask);
cpumask_and(tmp->delmask,
tmp->delmask, cpu_active_mask);
cpumask_or(top_cpuset.effective_cpus,
top_cpuset.effective_cpus, tmp->delmask);
}
if (adding)
partition_xcpus_add(prs, NULL, tmp->addmask);
if (deleting)
partition_xcpus_del(prs, NULL, tmp->delmask);
spin_unlock_irq(&callback_lock);
/*
@ -1676,11 +1734,11 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
* @tmp: Temporary addmask and delmask
* Return: 0 or a partition root state error code
*
* For partcmd_enable, the cpuset is being transformed from a non-partition
* root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus
* not set) mask of the given cpuset will be taken away from parent's
* effective_cpus. The function will return 0 if all the CPUs listed in
* effective_xcpus can be granted or an error code will be returned.
* For partcmd_enable*, the cpuset is being transformed from a non-partition
* root to a partition root. The effective_xcpus (cpus_allowed if
* effective_xcpus not set) mask of the given cpuset will be taken away from
* parent's effective_cpus. The function will return 0 if all the CPUs listed
* in effective_xcpus can be granted or an error code will be returned.
*
* For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in effective_xcpus will be
@ -1695,7 +1753,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
*
* For partcmd_invalidate, the current partition will be made invalid.
*
* The partcmd_enable and partcmd_disable commands are used by
* The partcmd_enable* and partcmd_disable commands are used by
* update_prstate(). An error code may be returned and the caller will check
* for error.
*
@ -1760,7 +1818,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
nocpu = tasks_nocpu_error(parent, cs, xcpus);
if (cmd == partcmd_enable) {
if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
* Enabling partition root is not allowed if its
* effective_xcpus is empty or doesn't overlap with
@ -1783,6 +1841,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
cpumask_copy(tmp->delmask, xcpus);
deleting = true;
subparts_delta++;
new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus to parent's effective_cpus for
@ -1792,6 +1851,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
if (adding)
subparts_delta--;
new_prs = PRS_MEMBER;
} else if (newmask) {
/*
* Empty cpumask is not allowed
@ -1940,37 +2000,24 @@ write_error:
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
if (adding) {
if (parent == &top_cpuset)
cpumask_andnot(subpartitions_cpus,
subpartitions_cpus, tmp->addmask);
/*
* Some of the CPUs in effective_xcpus might have been offlined.
*/
cpumask_or(parent->effective_cpus,
parent->effective_cpus, tmp->addmask);
cpumask_and(parent->effective_cpus,
parent->effective_cpus, cpu_active_mask);
}
if (deleting) {
if (parent == &top_cpuset)
cpumask_or(subpartitions_cpus,
subpartitions_cpus, tmp->delmask);
cpumask_andnot(parent->effective_cpus,
parent->effective_cpus, tmp->delmask);
}
if (is_partition_valid(parent)) {
parent->nr_subparts += subparts_delta;
WARN_ON_ONCE(parent->nr_subparts < 0);
}
if (old_prs != new_prs) {
cs->partition_root_state = new_prs;
if (new_prs <= 0)
cs->nr_subparts = 0;
}
/*
* Adding to parent's effective_cpus means deletion CPUs from cs
* and vice versa.
*/
if (adding)
partition_xcpus_del(old_prs, parent, tmp->addmask);
if (deleting)
partition_xcpus_add(new_prs, parent, tmp->delmask);
if (is_partition_valid(parent)) {
parent->nr_subparts += subparts_delta;
WARN_ON_ONCE(parent->nr_subparts < 0);
}
spin_unlock_irq(&callback_lock);
if ((old_prs != new_prs) && (cmd == partcmd_update))
@ -2948,6 +2995,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
bool new_xcpus_state = false;
if (old_prs == new_prs)
return 0;
@ -2977,6 +3025,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
if (!old_prs) {
enum partition_cmd cmd = (new_prs == PRS_ROOT)
? partcmd_enable : partcmd_enablei;
/*
* cpus_allowed cannot be empty.
*/
@ -2985,19 +3036,18 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
}
err = update_parent_effective_cpumask(cs, partcmd_enable,
NULL, &tmpmask);
err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
/*
* If an attempt to become local partition root fails,
* try to become a remote partition root instead.
*/
if (err && remote_partition_enable(cs, &tmpmask))
if (err && remote_partition_enable(cs, new_prs, &tmpmask))
err = 0;
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
*/
;
new_xcpus_state = true;
} else {
/*
* Switching back to member is always allowed even if it
@ -3029,6 +3079,8 @@ out:
WRITE_ONCE(cs->prs_err, err);
if (!is_partition_valid(cs))
reset_partition_data(cs);
else if (new_xcpus_state)
partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
/* Force update if switching back to member */
@ -3386,6 +3438,7 @@ typedef enum {
FILE_SUBPARTS_CPULIST,
FILE_EXCLUSIVE_CPULIST,
FILE_EFFECTIVE_XCPULIST,
FILE_ISOLATED_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@ -3582,6 +3635,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_SUBPARTS_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
break;
case FILE_ISOLATED_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
break;
default:
ret = -EINVAL;
}
@ -3875,6 +3931,13 @@ static struct cftype dfl_files[] = {
.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
},
{
.name = "cpus.isolated",
.seq_show = cpuset_common_seq_show,
.private = FILE_ISOLATED_CPULIST,
.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
},
{ } /* terminate */
};
@ -4194,6 +4257,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);