From 4622ba923e55e12cb76081c8865b01fcb383a9d8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 5 Jun 2023 15:47:13 +0000
Subject: [PATCH 01/12] intel_idle: refactor state->enter manipulation into its own function

Since the 6.4 kernel, the logic for updating a state's enter method
based on "environmental conditions" (command line options, CPU
side-channel workarounds, etc.) has gotten pretty complex.

This patch refactors this into a separate, small, self-contained
function (no behavior changes) for improved readability and to make
future changes to this logic easier to do and understand.

Signed-off-by: Arjan van de Ven
Signed-off-by: Rafael J. Wysocki
---
 drivers/idle/intel_idle.c | 50 ++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index aa2d19db2b1d..c351b21c0875 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1839,6 +1839,32 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 	return true;
 }
 
+static void state_update_enter_method(struct cpuidle_state *state, int cstate)
+{
+	if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
+		/*
+		 * Combining XSTATE with IBRS or IRQ_ENABLE flags
+		 * is not currently supported by this driver.
+		 */
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
+		state->enter = intel_idle_xstate;
+	} else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
+		   state->flags & CPUIDLE_FLAG_IBRS) {
+		/*
+		 * IBRS mitigation requires that C-states are entered
+		 * with interrupts disabled.
+		 */
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
+		state->enter = intel_idle_ibrs;
+	} else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
+		state->enter = intel_idle_irq;
+	} else if (force_irq_on) {
+		pr_info("forced intel_idle_irq for state %d\n", cstate);
+		state->enter = intel_idle_irq;
+	}
+}
+
 static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 {
 	int cstate;
@@ -1894,28 +1920,8 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 		drv->states[drv->state_count] = cpuidle_state_table[cstate];
 		state = &drv->states[drv->state_count];
 
-		if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
-			/*
-			 * Combining with XSTATE with IBRS or IRQ_ENABLE flags
-			 * is not currently supported but this driver.
-			 */
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
-			state->enter = intel_idle_xstate;
-		} else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
-			   state->flags & CPUIDLE_FLAG_IBRS) {
-			/*
-			 * IBRS mitigation requires that C-states are entered
-			 * with interrupts disabled.
-			 */
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
-			state->enter = intel_idle_ibrs;
-		} else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
-			state->enter = intel_idle_irq;
-		} else if (force_irq_on) {
-			pr_info("forced intel_idle_irq for state %d\n", cstate);
-			state->enter = intel_idle_irq;
-		}
+		state_update_enter_method(state, cstate);
+
 		if ((disabled_states_mask & BIT(drv->state_count)) ||
 		    ((icpu->use_acpi || force_use_acpi) &&
From 7826c069c8765969cfb7a364f8e0e663fc132b10 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 5 Jun 2023 15:47:14 +0000
Subject: [PATCH 02/12] intel_idle: clean up the (new) state_update_enter_method function

Now that the logic for state_update_enter_method() is in its own
function, the long if ... else if ... else if ... else if chain can be
simplified by just returning from the function at the various places.
This does not change functionality, but it makes the logic much simpler
to read or modify later.

Signed-off-by: Arjan van de Ven
Signed-off-by: Rafael J. Wysocki
---
 drivers/idle/intel_idle.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index c351b21c0875..256c2d42e350 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1849,7 +1849,10 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
 		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
 		state->enter = intel_idle_xstate;
-	} else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
+		return;
+	}
+
+	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
 		   state->flags & CPUIDLE_FLAG_IBRS) {
 		/*
 		 * IBRS mitigation requires that C-states are entered
@@ -1857,9 +1860,15 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 		 */
 		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
 		state->enter = intel_idle_ibrs;
-	} else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
+		return;
+	}
+
+	if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
 		state->enter = intel_idle_irq;
-	} else if (force_irq_on) {
+		return;
+	}
+
+	if (force_irq_on) {
 		pr_info("forced intel_idle_irq for state %d\n", cstate);
 		state->enter = intel_idle_irq;
 	}
From b4a11fa3331e163e177e76098fe1d8b12b87cf6b Mon Sep 17 00:00:00 2001
From: Wyes Karny
Date: Mon, 29 May 2023 14:25:51 +0000
Subject: [PATCH 03/12] cpufreq: Fail driver register if it has adjust_perf without fast_switch

If the fast_switch_possible flag is set by the scaling driver, the
governor is free to select the fast_switch function even if adjust_perf
is set. Some scaling drivers that use adjust_perf don't set fast_switch,
thinking that the governor would never fall back to fast_switch. But the
governor can fall back to fast_switch even at runtime if frequency
invariance is disabled for some reason. This could crash the kernel if
the driver didn't set the fast_switch function pointer.

Therefore, fail driver registration if it has adjust_perf without
fast_switch.

Suggested-by: Rafael J. Wysocki
Suggested-by: Viresh Kumar
Signed-off-by: Wyes Karny
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cpufreq.c | 3 ++-
 include/linux/cpufreq.h   | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6b52ebe5a890..50bbc969ffe5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2828,7 +2828,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	    (driver_data->setpolicy && (driver_data->target_index ||
 		    driver_data->target)) ||
 	     (!driver_data->get_intermediate != !driver_data->target_intermediate) ||
-	     (!driver_data->online != !driver_data->offline))
+	     (!driver_data->online != !driver_data->offline) ||
+	     (driver_data->adjust_perf && !driver_data->fast_switch))
 		return -EINVAL;
 
 	pr_debug("trying to register driver %s\n", driver_data->name);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 26e2eb399484..172ff51c1b2a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -340,7 +340,10 @@ struct cpufreq_driver {
 	/*
 	 * ->fast_switch() replacement for drivers that use an internal
 	 * representation of performance levels and can pass hints other than
-	 * the target performance level to the hardware.
+	 * the target performance level to the hardware. This can only be set
+	 * if ->fast_switch is set too, because in those cases (under specific
+	 * conditions) scale invariance can be disabled, which causes the
+	 * schedutil governor to fall back to the latter.
 	 */
 	void (*adjust_perf)(unsigned int cpu,
 			    unsigned long min_perf,
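A note on the check above: the sketch below (hypothetical driver, all
names invented for illustration) shows the registration pattern that
patch 03 now rejects. A driver implementing ->adjust_perf while leaving
->fast_switch NULL used to register fine and could crash once schedutil
fell back to fast_switch; with the check it fails up front with -EINVAL.
Only the adjust_perf/fast_switch pairing is the point here; ->init and
->verify are stubbed to satisfy the other registration checks.

/* Hypothetical driver, for illustration only. */
static int example_cpu_init(struct cpufreq_policy *policy)
{
	policy->fast_switch_possible = true;
	return 0;
}

static void example_adjust_perf(unsigned int cpu, unsigned long min_perf,
				unsigned long target_perf, unsigned long capacity)
{
	/* would program a hardware performance hint here */
}

static struct cpufreq_driver example_driver = {
	.name		= "example",
	.init		= example_cpu_init,
	.verify		= cpufreq_generic_frequency_table_verify,
	.adjust_perf	= example_adjust_perf,
	/* .fast_switch deliberately left unset */
};

/* cpufreq_register_driver(&example_driver) now returns -EINVAL. */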
From 2f3d08f074b02aa449de27238fda72496c789034 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 5 Jun 2023 15:47:15 +0000
Subject: [PATCH 04/12] intel_idle: Add support for using intel_idle in a VM guest using just hlt

In a typical VM guest, the mwait instruction is not available, leaving
only the 'hlt' instruction (which causes a VMEXIT to the host).

So for this common case, intel_idle will detect the lack of mwait and
fail to initialize (after which another idle method steps in and just
uses hlt always).

Other (non-common) cases exist; the table below shows the before/after
for these:

+------------+--------------------------+-------------------------+
| Hypervisor | Idle method before patch | Idle method after patch |
| exposes    |                          |                         |
+============+==========================+=========================+
| nothing    | default_idle fallback    | intel_idle VM table     |
| (common)   | (straight "hlt")         |                         |
+------------+--------------------------+-------------------------+
| mwait      | intel_idle mwait table   | intel_idle mwait table  |
+------------+--------------------------+-------------------------+
| ACPI       | ACPI C1 state ("hlt")    | intel_idle VM table     |
+------------+--------------------------+-------------------------+

This is only applicable to CPUs known by intel_idle.

For the bare metal case, unknown CPU models will use the ACPI tables
(when available) to get estimates for latency and break even point for
longer idle states. In guests, the common case is that ACPI tables are
not available, but even when they are available, they can't and don't
provide the latency information for the longer (mwait based) states.

For this scenario (unknown CPU model), the default_idle mode (no ACPI)
or ACPI C1 (ACPI available) will be used.

By providing the capability to do this with the intel_idle driver, we
can do better than the fallback or ACPI table methods. While this
current change only gets us to the existing behavior, later patches in
this series will add new capabilities such as optimized TLB flushing.

In order to do this, a simplified version of the initialization
function for VM guests is created, and this will be called if the CPU
is recognized, but mwait is not supported, and we're in a VM guest.

One thing to note is that the max latency (and break-even point) of
this C1 state is higher than for a typical bare metal C1 state, because
hlt causes a VMEXIT, and the cost of VMEXIT + hypervisor overhead +
VMENTER is typically on the order of up to 5 microseconds... even if
the hypervisor does not actually go into a hardware power-saving state.

Signed-off-by: Arjan van de Ven
[ rjw: Dropped redundant checks from should_verify_mwait() ]
Signed-off-by: Rafael J. Wysocki
---
 drivers/idle/intel_idle.c | 117 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 116 insertions(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 256c2d42e350..a80e1f520293 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -199,6 +199,43 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
 	return __intel_idle(dev, drv, index);
 }
 
+static __always_inline int __intel_idle_hlt(struct cpuidle_device *dev,
+					    struct cpuidle_driver *drv, int index)
+{
+	raw_safe_halt();
+	raw_local_irq_disable();
+	return index;
+}
+
+/**
+ * intel_idle_hlt - Ask the processor to enter the given idle state using hlt.
+ * @dev: cpuidle device of the target CPU.
+ * @drv: cpuidle driver (assumed to point to intel_idle_driver).
+ * @index: Target idle state index.
+ *
+ * Use the HLT instruction to notify the processor that the CPU represented by
+ * @dev is idle and it can try to enter the idle state corresponding to @index.
+ *
+ * Must be called under local_irq_disable().
+ */
+static __cpuidle int intel_idle_hlt(struct cpuidle_device *dev,
+				    struct cpuidle_driver *drv, int index)
+{
+	return __intel_idle_hlt(dev, drv, index);
+}
+
+static __cpuidle int intel_idle_hlt_irq_on(struct cpuidle_device *dev,
+					   struct cpuidle_driver *drv, int index)
+{
+	int ret;
+
+	raw_local_irq_enable();
+	ret = __intel_idle_hlt(dev, drv, index);
+	raw_local_irq_disable();
+
+	return ret;
+}
+
 /**
  * intel_idle_s2idle - Ask the processor to enter the given idle state.
  * @dev: cpuidle device of the target CPU.
@@ -1242,6 +1279,18 @@ static struct cpuidle_state snr_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+static struct cpuidle_state vmguest_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "HLT",
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
+		.exit_latency = 5,
+		.target_residency = 10,
+		.enter = &intel_idle_hlt, },
+	{
+		.enter = NULL }
+};
+
 static const struct idle_cpu idle_cpu_nehalem __initconst = {
 	.state_table = nehalem_cstates,
 	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
@@ -1841,6 +1890,16 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 
 static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 {
+	if (state->enter == intel_idle_hlt) {
+		if (force_irq_on) {
+			pr_info("forced intel_idle_irq for state %d\n", cstate);
+			state->enter = intel_idle_hlt_irq_on;
+		}
+		return;
+	}
+	if (state->enter == intel_idle_hlt_irq_on)
+		return; /* no update scenarios */
+
 	if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
 		/*
 		 * Combining XSTATE with IBRS or IRQ_ENABLE flags
@@ -1874,6 +1933,21 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 	}
 }
 
+/*
+ * For mwait based states, we want to verify the cpuid data to see if the state
+ * is actually supported by this specific CPU.
+ * For non-mwait based states, this check should be skipped.
+ */
+static bool should_verify_mwait(struct cpuidle_state *state)
+{
+	if (state->enter == intel_idle_hlt)
+		return false;
+	if (state->enter == intel_idle_hlt_irq_on)
+		return false;
+
+	return true;
+}
+
 static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 {
 	int cstate;
@@ -1922,7 +1996,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 		}
 
 		mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
-		if (!intel_idle_verify_cstate(mwait_hint))
+		if (should_verify_mwait(&cpuidle_state_table[cstate]) && !intel_idle_verify_cstate(mwait_hint))
 			continue;
 
 		/* Structure copy. */
@@ -2056,6 +2130,45 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
 		cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
 }
 
+static int __init intel_idle_vminit(const struct x86_cpu_id *id)
+{
+	int retval;
+
+	cpuidle_state_table = vmguest_cstates;
+
+	icpu = (const struct idle_cpu *)id->driver_data;
+
+	pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
+		 boot_cpu_data.x86_model);
+
+	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+	if (!intel_idle_cpuidle_devices)
+		return -ENOMEM;
+
+	intel_idle_cpuidle_driver_init(&intel_idle_driver);
+
+	retval = cpuidle_register_driver(&intel_idle_driver);
+	if (retval) {
+		struct cpuidle_driver *drv = cpuidle_get_driver();
+		printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
+		       drv ? drv->name : "none");
+		goto init_driver_fail;
+	}
+
+	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
+				   intel_idle_cpu_online, NULL);
+	if (retval < 0)
+		goto hp_setup_fail;
+
+	return 0;
+hp_setup_fail:
+	intel_idle_cpuidle_devices_uninit();
+	cpuidle_unregister_driver(&intel_idle_driver);
+init_driver_fail:
+	free_percpu(intel_idle_cpuidle_devices);
+	return retval;
+}
+
 static int __init intel_idle_init(void)
 {
 	const struct x86_cpu_id *id;
@@ -2074,6 +2187,8 @@ static int __init intel_idle_init(void)
 	id = x86_match_cpu(intel_idle_ids);
 	if (id) {
 		if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
+			if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+				return intel_idle_vminit(id);
 			pr_debug("Please enable MWAIT in BIOS SETUP\n");
 			return -ENODEV;
 		}
From 217e67784eab30cd0704fab4109647ea68a4d850 Mon Sep 17 00:00:00 2001
From: Wyes Karny
Date: Tue, 30 May 2023 13:13:48 +0000
Subject: [PATCH 05/12] cpufreq: amd-pstate: Write CPPC enable bit per-socket

Currently, amd_pstate sets the CPPC enable bit in MSR_AMD_CPPC_ENABLE
only for the CPU where module_init happened. But MSR_AMD_CPPC_ENABLE is
per-socket, so this causes the CPPC enable bit to be set for only one
socket on servers with more than one physical package. To fix this,
write MSR_AMD_CPPC_ENABLE per socket.

Also, handle duplicate calls for cppc_enable, because it is called from
per-policy/per-core callbacks and can result in duplicate MSR writes.

Before the fix:

amd@amd:~$ sudo rdmsr -a 0xc00102b1 | uniq --count
    192 0
    192 1

After the fix:

amd@amd:~$ sudo rdmsr -a 0xc00102b1 | uniq --count
    384 1

Suggested-by: Gautham R. Shenoy
Signed-off-by: Wyes Karny
Acked-by: Huang Rui
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/amd-pstate.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index ddd346a239e0..50722bfbb34a 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -63,6 +63,7 @@
 static struct cpufreq_driver *current_pstate_driver;
 static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
 static int cppc_state = AMD_PSTATE_DISABLE;
+static bool cppc_enabled;
 
 /*
  * AMD Energy Preference Performance (EPP)
@@ -228,7 +229,28 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
 
 static inline int pstate_enable(bool enable)
 {
-	return wrmsrl_safe(MSR_AMD_CPPC_ENABLE, enable);
+	int ret, cpu;
+	unsigned long logical_proc_id_mask = 0;
+
+	if (enable == cppc_enabled)
+		return 0;
+
+	for_each_present_cpu(cpu) {
+		unsigned long logical_id = topology_logical_die_id(cpu);
+
+		if (test_bit(logical_id, &logical_proc_id_mask))
+			continue;
+
+		set_bit(logical_id, &logical_proc_id_mask);
+
+		ret = wrmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_ENABLE,
+					 enable);
+		if (ret)
+			return ret;
+	}
+
+	cppc_enabled = enable;
+	return 0;
 }
 
 static int cppc_enable(bool enable)
@@ -236,6 +258,9 @@ static int cppc_enable(bool enable)
 	int cpu, ret = 0;
 	struct cppc_perf_ctrls perf_ctrls;
 
+	if (enable == cppc_enabled)
+		return 0;
+
 	for_each_present_cpu(cpu) {
 		ret = cppc_set_enable(cpu, enable);
 		if (ret)
@@ -251,6 +276,7 @@ static int cppc_enable(bool enable)
 		}
 	}
 
+	cppc_enabled = enable;
 	return ret;
 }
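The dedup pattern in pstate_enable() above generalizes: iterate over
all CPUs, but act only once per die, remembering visited dies in a
bitmask. A minimal standalone sketch of the same idea follows, written
as plain userspace C with an assumed, hard-coded topology, purely for
illustration.

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 8

/* Assumed topology for the example: 4 CPUs per die, 2 dies. */
static int die_of_cpu(int cpu)
{
	return cpu / 4;
}

static void write_enable_once_per_die(bool enable)
{
	unsigned long seen = 0;	/* bit N set => die N already written */
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		int die = die_of_cpu(cpu);

		if (seen & (1UL << die))
			continue;	/* this die was already handled */
		seen |= 1UL << die;
		printf("cpu %d: write CPPC enable=%d for die %d\n",
		       cpu, enable, die);
	}
}

int main(void)
{
	write_enable_once_per_die(true);
	return 0;
}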
From f4aad639302a07454dcb23b408dcadf8a9efb031 Mon Sep 17 00:00:00 2001
From: Wyes Karny
Date: Mon, 12 Jun 2023 11:36:10 +0000
Subject: [PATCH 06/12] cpufreq: amd-pstate: Make amd-pstate EPP driver name hyphenated

The amd-pstate passive-mode driver name is hyphenated, so make the
amd-pstate active-mode driver consistent with it by renaming
"amd_pstate_epp" to "amd-pstate-epp".

Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors")
Cc: All applicable
Reviewed-by: Gautham R. Shenoy
Signed-off-by: Wyes Karny
Acked-by: Huang Rui
Reviewed-by: Perry Yuan
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/amd-pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 50722bfbb34a..d8269994322e 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1382,7 +1382,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
 	.online		= amd_pstate_epp_cpu_online,
 	.suspend	= amd_pstate_epp_suspend,
 	.resume		= amd_pstate_epp_resume,
-	.name		= "amd_pstate_epp",
+	.name		= "amd-pstate-epp",
 	.attr		= amd_pstate_epp_attr,
 };
From a4ba10bf6855bf9381fe2365ec9c3af84c1fa7db Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Mon, 12 Jun 2023 09:26:48 -0400
Subject: [PATCH 07/12] cpufreq: amd-pstate: Set default governor to schedutil

The Kconfig currently defaults the governor to schedutil on x86_64 only
when intel-pstate and SMP have been selected. If the kernel is built
only with amd-pstate, the default governor should also be schedutil.

Signed-off-by: Mario Limonciello
Reviewed-by: Leo Li
Acked-by: Huang Rui
Tested-by: Perry Yuan
Acked-by: Viresh Kumar
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 2c839bd2b051..a1c51abddbc5 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -38,7 +38,7 @@ choice
 	prompt "Default CPUFreq governor"
 	default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1110_CPUFREQ
 	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if ARM64 || ARM
-	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if X86_INTEL_PSTATE && SMP
+	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if (X86_INTEL_PSTATE || X86_AMD_PSTATE) && SMP
 	default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 	help
 	  This option sets which CPUFreq governor shall be loaded at
From 965262ef71c475857d0984a5eee57694b207a113 Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Tue, 20 Jun 2023 12:24:31 -0500
Subject: [PATCH 08/12] ACPI: CPPC: Add definition for undefined FADT preferred PM profile value

In the event that a new preferred PM profile value is introduced, it's
best for the code to be able to defensively guard against it, so that
the wrong settings don't get applied on a new system that uses this
profile while running an ancient kernel.

Acked-by: Huang Rui
Suggested-by: Gautham Ranjal Shenoy
Link: https://uefi.org/htmlspecs/ACPI_Spec_6_4_html/05_ACPI_Software_Programming_Model/ACPI_Software_Programming_Model.html#fixed-acpi-description-table-fadt
Signed-off-by: Mario Limonciello
Reviewed-by: Perry Yuan
Signed-off-by: Rafael J. Wysocki
---
 include/acpi/actbl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h
index e5dfb6f4de52..451f6276da49 100644
--- a/include/acpi/actbl.h
+++ b/include/acpi/actbl.h
@@ -307,7 +307,8 @@ enum acpi_preferred_pm_profiles {
 	PM_SOHO_SERVER = 5,
 	PM_APPLIANCE_PC = 6,
 	PM_PERFORMANCE_SERVER = 7,
-	PM_TABLET = 8
+	PM_TABLET = 8,
+	NR_PM_PROFILES = 9
 };
 
 /* Values for sleep_status and sleep_control registers (V5+ FADT) */
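The point of NR_PM_PROFILES is to let consumers range-check profile
values newer than the kernel. A minimal sketch of such a defensive
check (hypothetical helper; the next patch adds the real amd-pstate
equivalent):

/* Treat values the kernel does not know about as "not identified". */
static bool pm_profile_is_known(unsigned int profile)
{
	return profile != PM_UNSPECIFIED && profile < NR_PM_PROFILES;
}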
From 32f80b9adfdb43f8af248596724f59dde938a190 Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Tue, 20 Jun 2023 12:24:32 -0500
Subject: [PATCH 09/12] cpufreq: amd-pstate: Set a fallback policy based on preferred_profile

If a user's configuration doesn't explicitly specify the cpufreq
scaling governor, then the code currently explicitly falls back to
'powersave'. This default is fine for notebooks and desktops, but
servers and undefined machines should default to 'performance'.

Look at the 'preferred_profile' field from the FADT to set this policy
accordingly.

Link: https://uefi.org/htmlspecs/ACPI_Spec_6_4_html/05_ACPI_Software_Programming_Model/ACPI_Software_Programming_Model.html#fixed-acpi-description-table-fadt
Acked-by: Huang Rui
Suggested-by: Wyes Karny
Reviewed-by: Gautham R. Shenoy
Signed-off-by: Mario Limonciello
Reviewed-by: Perry Yuan
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/amd-pstate.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index d8269994322e..3546d7db614d 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1071,6 +1071,26 @@ static const struct attribute_group amd_pstate_global_attr_group = {
 	.attrs = pstate_global_attributes,
 };
 
+static bool amd_pstate_acpi_pm_profile_server(void)
+{
+	switch (acpi_gbl_FADT.preferred_profile) {
+	case PM_ENTERPRISE_SERVER:
+	case PM_SOHO_SERVER:
+	case PM_PERFORMANCE_SERVER:
+		return true;
+	}
+	return false;
+}
+
+static bool amd_pstate_acpi_pm_profile_undefined(void)
+{
+	if (acpi_gbl_FADT.preferred_profile == PM_UNSPECIFIED)
+		return true;
+	if (acpi_gbl_FADT.preferred_profile >= NR_PM_PROFILES)
+		return true;
+	return false;
+}
+
 static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 {
 	int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
@@ -1128,10 +1148,14 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 	policy->max = policy->cpuinfo.max_freq;
 
 	/*
-	 * Set the policy to powersave to provide a valid fallback value in case
+	 * Set the policy to provide a valid fallback value in case
 	 * the default cpufreq governor is neither powersave nor performance.
 	 */
-	policy->policy = CPUFREQ_POLICY_POWERSAVE;
+	if (amd_pstate_acpi_pm_profile_server() ||
+	    amd_pstate_acpi_pm_profile_undefined())
+		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
+	else
+		policy->policy = CPUFREQ_POLICY_POWERSAVE;
 
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
From c88ad30e3f861c7be4e3b4995554e2b0754059b7 Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Tue, 20 Jun 2023 12:24:33 -0500
Subject: [PATCH 10/12] cpufreq: amd-pstate: Add a kernel config option to set default mode

Users are having more success with amd-pstate since the introduction of
EPP and Guided modes. To expose the driver to more users by default,
introduce a kernel configuration option for setting the default mode.

Users can use an integer to map out which default mode they want to use
in lieu of a kernel command line option.

This will default to EPP, but only if:
1) The CPU supports an MSR.
2) The system profile is identified, and
3) the system profile is identified as a non-server by the FADT.

Link: https://gitlab.freedesktop.org/hadess/power-profiles-daemon/-/merge_requests/121
Acked-by: Huang Rui
Reviewed-by: Gautham R. Shenoy
Co-developed-by: Perry Yuan
Signed-off-by: Perry Yuan
Signed-off-by: Mario Limonciello
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/Kconfig.x86  | 17 +++++++++
 drivers/cpufreq/amd-pstate.c | 73 ++++++++++++++++++++++++------------
 include/linux/amd-pstate.h   |  4 +-
 3 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 00476e94db90..438c9e75a04d 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -51,6 +51,23 @@ config X86_AMD_PSTATE
 
 	  If in doubt, say N.
 
+config X86_AMD_PSTATE_DEFAULT_MODE
+	int "AMD Processor P-State default mode"
+	depends on X86_AMD_PSTATE
+	default 3 if X86_AMD_PSTATE
+	range 1 4
+	help
+	  Select the default mode the amd-pstate driver will use on
+	  supported hardware.
+	  The value set has the following meanings:
+		1 -> Disabled
+		2 -> Passive
+		3 -> Active (EPP)
+		4 -> Guided
+
+	  For details, take a look at: .
+
 config X86_AMD_PSTATE_UT
 	tristate "selftest for AMD Processor P-State driver"
 	depends on X86 && ACPI_PROCESSOR
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 3546d7db614d..81fba0dcbee9 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -62,7 +62,7 @@
 static struct cpufreq_driver *current_pstate_driver;
 static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
-static int cppc_state = AMD_PSTATE_DISABLE;
+static int cppc_state = AMD_PSTATE_UNDEFINED;
 static bool cppc_enabled;
 
 /*
@@ -1410,6 +1410,25 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
 	.attr		= amd_pstate_epp_attr,
 };
 
+static int __init amd_pstate_set_driver(int mode_idx)
+{
+	if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
+		cppc_state = mode_idx;
+		if (cppc_state == AMD_PSTATE_DISABLE)
+			pr_info("driver is explicitly disabled\n");
+
+		if (cppc_state == AMD_PSTATE_ACTIVE)
+			current_pstate_driver = &amd_pstate_epp_driver;
+
+		if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
+			current_pstate_driver = &amd_pstate_driver;
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static int __init amd_pstate_init(void)
 {
 	struct device *dev_root;
@@ -1417,15 +1436,6 @@ static int __init amd_pstate_init(void)
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
 		return -ENODEV;
 
-	/*
-	 * by default the pstate driver is disabled to load
-	 * enable the amd_pstate passive mode driver explicitly
-	 * with amd_pstate=passive or other modes in kernel command line
-	 */
-	if (cppc_state == AMD_PSTATE_DISABLE) {
-		pr_info("driver load is disabled, boot with specific mode to enable this\n");
-		return -ENODEV;
-	}
 
 	if (!acpi_cpc_valid()) {
 		pr_warn_once("the _CPC object is not present in SBIOS or ACPI disabled\n");
@@ -1436,6 +1446,33 @@ static int __init amd_pstate_init(void)
 	if (cpufreq_get_current_driver())
 		return -EEXIST;
 
+	switch (cppc_state) {
+	case AMD_PSTATE_UNDEFINED:
+		/* Disable on the following configs by default:
+		 * 1. Undefined platforms
+		 * 2. Server platforms
+		 * 3. Shared memory designs
+		 */
+		if (amd_pstate_acpi_pm_profile_undefined() ||
+		    amd_pstate_acpi_pm_profile_server() ||
+		    !boot_cpu_has(X86_FEATURE_CPPC)) {
+			pr_info("driver load is disabled, boot with specific mode to enable this\n");
+			return -ENODEV;
+		}
+		ret = amd_pstate_set_driver(CONFIG_X86_AMD_PSTATE_DEFAULT_MODE);
+		if (ret)
+			return ret;
+		break;
+	case AMD_PSTATE_DISABLE:
+		return -ENODEV;
+	case AMD_PSTATE_PASSIVE:
+	case AMD_PSTATE_ACTIVE:
+	case AMD_PSTATE_GUIDED:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	/* capability check */
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		pr_debug("AMD CPPC MSR based functionality is supported\n");
@@ -1488,21 +1525,7 @@ static int __init amd_pstate_param(char *str)
 	size = strlen(str);
 	mode_idx = get_mode_idx_from_str(str, size);
 
-	if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
-		cppc_state = mode_idx;
-		if (cppc_state == AMD_PSTATE_DISABLE)
-			pr_info("driver is explicitly disabled\n");
-
-		if (cppc_state == AMD_PSTATE_ACTIVE)
-			current_pstate_driver = &amd_pstate_epp_driver;
-
-		if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
-			current_pstate_driver = &amd_pstate_driver;
-
-		return 0;
-	}
-
-	return -EINVAL;
+	return amd_pstate_set_driver(mode_idx);
 }
 
 early_param("amd_pstate", amd_pstate_param);
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index c10ebf8c42e6..446394f84606 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -94,7 +94,8 @@ struct amd_cpudata {
  * enum amd_pstate_mode - driver working mode of amd pstate
  */
 enum amd_pstate_mode {
-	AMD_PSTATE_DISABLE = 0,
+	AMD_PSTATE_UNDEFINED = 0,
+	AMD_PSTATE_DISABLE,
 	AMD_PSTATE_PASSIVE,
 	AMD_PSTATE_ACTIVE,
 	AMD_PSTATE_GUIDED,
@@ -102,6 +103,7 @@ enum amd_pstate_mode {
 };
 
 static const char * const amd_pstate_mode_string[] = {
+	[AMD_PSTATE_UNDEFINED] = "undefined",
 	[AMD_PSTATE_DISABLE] = "disable",
 	[AMD_PSTATE_PASSIVE] = "passive",
 	[AMD_PSTATE_ACTIVE] = "active",
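For reference, the Kconfig value is passed straight into
amd_pstate_set_driver() and therefore into the enum above, so the
mapping between the build-time default and the boot parameter is (a
summary of the diff, not new behavior):

/*
 * CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=1 -> AMD_PSTATE_DISABLE (amd_pstate=disable)
 * CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=2 -> AMD_PSTATE_PASSIVE (amd_pstate=passive)
 * CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=3 -> AMD_PSTATE_ACTIVE  (amd_pstate=active, the default)
 * CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=4 -> AMD_PSTATE_GUIDED  (amd_pstate=guided)
 */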
From 03f44ffb3d5be2fceda375d92c70ab6de4df7081 Mon Sep 17 00:00:00 2001
From: Tero Kristo
Date: Wed, 21 Jun 2023 09:58:39 +0300
Subject: [PATCH 11/12] cpufreq: intel_pstate: Fix energy_performance_preference for passive

If the intel_pstate driver is set to passive mode, then writing the
same value to the energy_performance_preference sysfs attribute twice
fails. This is caused by using the wrong return value (the index of the
matched energy_perf_string) instead of the length of the passed-in
parameter.

Fix this by forcing the internal return value to zero when the same
preference is passed in by the user. This issue is not present when the
driver is in active mode.

Fixes: f6ebbcf08f37 ("cpufreq: intel_pstate: Implement passive mode with HWP enabled")
Reported-by: Niklas Neronin
Signed-off-by: Tero Kristo
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 2548ec92faa2..f29182512b98 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -824,6 +824,8 @@ static ssize_t store_energy_performance_preference(
 			err = cpufreq_start_governor(policy);
 			if (!ret)
 				ret = err;
+		} else {
+			ret = 0;
 		}
 	}
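The underlying rule the fix restores: a sysfs store callback must
return either a negative errno or the number of bytes consumed
(normally the full count). Any other positive value looks like a short
write to userspace, which is why writing the same preference twice
failed. A sketch with a hypothetical attribute (names invented for
illustration):

static const char * const example_strings[] = { "low", "high" };
static int example_setting;

static ssize_t example_store(struct kobject *kobj, struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	int idx = sysfs_match_string(example_strings, buf);

	if (idx < 0)
		return idx;	/* propagate -EINVAL */

	example_setting = idx;
	return count;		/* not idx: report all bytes consumed */
}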
From 0fac214bb75ee64d7b9e6521d53f8251a4d324ea Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Tue, 20 Jun 2023 18:03:54 +0000
Subject: [PATCH 12/12] intel_idle: Add a "Long HLT" C1 state for the VM guest mode

intel_idle will, for the bare metal case, usually have one or more deep
power states that have the CPUIDLE_FLAG_TLB_FLUSHED flag set. When a
state with this flag is selected by the cpuidle framework, it will also
flush the TLBs as part of entering this state. The benefit of doing
this is that the kernel does not need to wake the CPU out of this deep
power state just to flush the TLBs... for which the latency can be very
high due to the exit latency of deep power states.

In a VM guest currently, this benefit of avoiding the wakeup does not
exist, while the problem (long exit latency) is even more severe. Linux
will need to wake up a vCPU (causing the host to either come out of a
deep C-state, or the VMM to have to deschedule something else to
schedule the vCPU), which can take a very long time, adding a lot of
latency to TLB flush operations (including munmap and others).

To solve this, add a "Long HLT" C-state to the state table for the VM
guest case that has the CPUIDLE_FLAG_TLB_FLUSHED flag set. The result
is that for long idle periods (where the VMM is likely to do things
that cause large latency), the cpuidle framework will flush the TLBs
(and avoid the wakeups), while for short/quick idle durations the
existing behavior is retained.

Now, there is still only "hlt" available in the guest, but for long
idle the host can go to a deeper state (say C6). There is a reasonable
debate one can have about what to set for the exit_latency and
break-even point for this "Long HLT" state. The good news is that
intel_idle has these values available for the underlying CPU (even when
mwait is not exposed). The solution thus is to just use the latency and
break-even point of the deepest state from the bare metal CPU, under
the assumption that this is a pretty reasonable estimate of what the
VMM would do to cause latency.

Signed-off-by: Arjan van de Ven
Signed-off-by: Rafael J. Wysocki
---
 drivers/idle/intel_idle.c | 55 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index a80e1f520293..34201d7ef33e 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1287,6 +1287,13 @@ static struct cpuidle_state vmguest_cstates[] __initdata = {
 		.exit_latency = 5,
 		.target_residency = 10,
 		.enter = &intel_idle_hlt, },
+	{
+		.name = "C1L",
+		.desc = "Long HLT",
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 5,
+		.target_residency = 200,
+		.enter = &intel_idle_hlt, },
 	{
 		.enter = NULL }
 };
@@ -2130,6 +2137,45 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
 		cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
 }
 
+/*
+ * Match up the latency and break even point of the bare metal (cpu based)
+ * states with the deepest VM available state.
+ *
+ * We only want to do this for the deepest state, the one that has the
+ * TLB_FLUSHED flag set on it.
+ *
+ * All our short idle states are dominated by vmexit/vmenter latencies,
+ * not the underlying hardware latencies, so we keep our values for these.
+ */
+static void matchup_vm_state_with_baremetal(void)
+{
+	int cstate;
+
+	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
+		int matching_cstate;
+
+		if (intel_idle_max_cstate_reached(cstate))
+			break;
+
+		if (!cpuidle_state_table[cstate].enter)
+			break;
+
+		if (!(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_TLB_FLUSHED))
+			continue;
+
+		for (matching_cstate = 0; matching_cstate < CPUIDLE_STATE_MAX; ++matching_cstate) {
+			if (!icpu->state_table[matching_cstate].enter)
+				break;
+			if (icpu->state_table[matching_cstate].exit_latency > cpuidle_state_table[cstate].exit_latency) {
+				cpuidle_state_table[cstate].exit_latency = icpu->state_table[matching_cstate].exit_latency;
+				cpuidle_state_table[cstate].target_residency = icpu->state_table[matching_cstate].target_residency;
+			}
+		}
+
+	}
+}
+
+
 static int __init intel_idle_vminit(const struct x86_cpu_id *id)
 {
 	int retval;
@@ -2145,6 +2191,15 @@ static int __init intel_idle_vminit(const struct x86_cpu_id *id)
 	if (!intel_idle_cpuidle_devices)
 		return -ENOMEM;
 
+	/*
+	 * We don't know exactly what the host will do when we go idle, but as a
+	 * worst-case estimate we can assume that the exit latency of the deepest
+	 * host state will be hit for our deep (long duration) guest idle state.
+	 * The same logic applies to the break even point for the long duration
+	 * guest idle state. So let's copy these two properties from the table we
+	 * found for the host CPU type.
+	 */
+	matchup_vm_state_with_baremetal();
+
 	intel_idle_cpuidle_driver_init(&intel_idle_driver);
 
 	retval = cpuidle_register_driver(&intel_idle_driver);
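A worked example of the matchup, with assumed numbers purely for
illustration (not taken from the series):

/*
 * Suppose the matched bare-metal table's deepest state is a C6 with
 * exit_latency = 170 us and target_residency = 600 us. After
 * matchup_vm_state_with_baremetal() runs, the "Long HLT" state's
 * placeholder values (5 us / 200 us) become 170 us / 600 us, so the
 * governor selects it, and flushes the TLBs, only for long idle
 * periods; short idles keep using the plain C1 HLT state.
 */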