7fef099702
The implementation of 'current' on x86 is very intentionally special: it is a very common thing to look up, and it uses 'this_cpu_read_stable()' to get the current thread pointer efficiently from per-cpu storage. And the keyword in there is 'stable': the current thread pointer never changes as far as a single thread is concerned. Even when a thread is preempted, or moved to another CPU, or even across an explicit call to 'schedule()', that thread will still have the same value for 'current'. It is, after all, the kernel base pointer to thread-local storage. That's why it's stable to begin with, but it's also why it's important enough that we have that special 'this_cpu_read_stable()' access for it. So this is all done very intentionally to allow the compiler to treat 'current' as a value that never visibly changes, so that the compiler can do CSE and combine multiple different 'current' accesses into one. However, there is obviously one very special situation when the currently running thread does actually change: inside the scheduler itself. So the scheduler code paths are special, and do not have a 'current' thread at all. Instead there are _two_ threads: the previous and the next thread - typically called 'prev' and 'next' (or prev_p/next_p) internally. So this is all actually quite straightforward and simple, and not all that complicated. Except for when you then have special code that is run in scheduler context, that code then has to be aware that 'current' isn't really a valid thing. Did you mean 'prev'? Did you mean 'next'? In fact, even if you then look at the code, and you use 'current' after the new value has been assigned to the percpu variable, we have explicitly told the compiler that 'current' is magical and always stable. So the compiler is quite free to use an older (or newer) value of 'current', and the actual assignment to the percpu storage is not relevant even if it might look that way. 
Which is exactly what happened in the resctrl code, that blithely used 'current' in '__resctrl_sched_in()' when it really wanted the new process state (as implied by the name: we're scheduling 'into' that new resctrl state). And clang would end up just using the old thread pointer value at least in some configurations. This could have happened with gcc too, and purely depends on random compiler details. Clang just seems to have been more aggressive about moving the read of the per-cpu current_task pointer around. The fix is trivial: just make the resctrl code adhere to the scheduler rules of using the prev/next thread pointer explicitly, instead of using 'current' in a situation where it just wasn't valid. That same code is then also used outside of the scheduler context (when a thread resctrl state is explicitly changed), and then we will just pass in 'current' as that pointer, of course. There is no ambiguity in that case. The fix may be trivial, but noticing and figuring out what went wrong was not. The credit for that goes to Stephane Eranian. Reported-by: Stephane Eranian <eranian@google.com> Link: https://lore.kernel.org/lkml/20230303231133.1486085-1-eranian@google.com/ Link: https://lore.kernel.org/lkml/alpine.LFD.2.01.0908011214330.3304@localhost.localdomain/ Reviewed-by: Nick Desaulniers <ndesaulniers@google.com> Tested-by: Tony Luck <tony.luck@intel.com> Tested-by: Stephane Eranian <eranian@google.com> Tested-by: Babu Moger <babu.moger@amd.com> Cc: stable@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
107 lines
3.0 KiB
C
107 lines
3.0 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_X86_RESCTRL_H
|
|
#define _ASM_X86_RESCTRL_H
|
|
|
|
#ifdef CONFIG_X86_CPU_RESCTRL
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/jump_label.h>
|
|
|
|
/**
|
|
* struct resctrl_pqr_state - State cache for the PQR MSR
|
|
* @cur_rmid: The cached Resource Monitoring ID
|
|
* @cur_closid: The cached Class Of Service ID
|
|
* @default_rmid: The user assigned Resource Monitoring ID
|
|
* @default_closid: The user assigned cached Class Of Service ID
|
|
*
|
|
* The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
|
|
* lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
|
|
* contains both parts, so we need to cache them. This also
|
|
* stores the user configured per cpu CLOSID and RMID.
|
|
*
|
|
* The cache also helps to avoid pointless updates if the value does
|
|
* not change.
|
|
*/
|
|
struct resctrl_pqr_state {
|
|
u32 cur_rmid;
|
|
u32 cur_closid;
|
|
u32 default_rmid;
|
|
u32 default_closid;
|
|
};
|
|
|
|
/* Per-CPU struct resctrl_pqr_state; __resctrl_sched_in() reads and updates the local CPU's copy. */
DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
|
|
|
|
/* Static key tested by resctrl_sched_in() before it calls __resctrl_sched_in(). */
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
|
|
/* Static key gating the use of the task's closid in __resctrl_sched_in(). */
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
|
|
/* Static key gating the use of the task's rmid in __resctrl_sched_in(). */
DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
|
|
|
|
/*
|
|
* __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
|
|
*
|
|
* Following considerations are made so that this has minimal impact
|
|
* on scheduler hot path:
|
|
* - This will stay as no-op unless we are running on an Intel SKU
|
|
* which supports resource control or monitoring and we enable by
|
|
* mounting the resctrl file system.
|
|
* - Caches the per cpu CLOSid/RMID values and does the MSR write only
|
|
* when a task with a different CLOSid/RMID is scheduled in.
|
|
* - We allocate RMIDs/CLOSids globally in order to keep this as
|
|
* simple as possible.
|
|
* Must be called with preemption disabled.
|
|
*/
|
|
static inline void __resctrl_sched_in(struct task_struct *tsk)
|
|
{
|
|
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
|
|
u32 closid = state->default_closid;
|
|
u32 rmid = state->default_rmid;
|
|
u32 tmp;
|
|
|
|
/*
|
|
* If this task has a closid/rmid assigned, use it.
|
|
* Else use the closid/rmid assigned to this cpu.
|
|
*/
|
|
if (static_branch_likely(&rdt_alloc_enable_key)) {
|
|
tmp = READ_ONCE(tsk->closid);
|
|
if (tmp)
|
|
closid = tmp;
|
|
}
|
|
|
|
if (static_branch_likely(&rdt_mon_enable_key)) {
|
|
tmp = READ_ONCE(tsk->rmid);
|
|
if (tmp)
|
|
rmid = tmp;
|
|
}
|
|
|
|
if (closid != state->cur_closid || rmid != state->cur_rmid) {
|
|
state->cur_closid = closid;
|
|
state->cur_rmid = rmid;
|
|
wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid);
|
|
}
|
|
}
|
|
|
|
static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
|
|
{
|
|
unsigned int scale = boot_cpu_data.x86_cache_occ_scale;
|
|
|
|
/* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
|
|
val /= scale;
|
|
return val * scale;
|
|
}
|
|
|
|
static inline void resctrl_sched_in(struct task_struct *tsk)
|
|
{
|
|
if (static_branch_likely(&rdt_enable_key))
|
|
__resctrl_sched_in(tsk);
|
|
}
|
|
|
|
void resctrl_cpu_detect(struct cpuinfo_x86 *c);
|
|
|
|
#else
|
|
|
|
/* CONFIG_X86_CPU_RESCTRL is not set: both hooks are empty stubs. */
static inline void resctrl_sched_in(struct task_struct *tsk)
{
}

static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c)
{
}
|
|
|
|
#endif /* CONFIG_X86_CPU_RESCTRL */
|
|
|
|
#endif /* _ASM_X86_RESCTRL_H */
|