diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 2c839bd2b051..a1c51abddbc5 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -38,7 +38,7 @@ choice
 	prompt "Default CPUFreq governor"
 	default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1110_CPUFREQ
 	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if ARM64 || ARM
-	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if X86_INTEL_PSTATE && SMP
+	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if (X86_INTEL_PSTATE || X86_AMD_PSTATE) && SMP
 	default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 	help
 	  This option sets which CPUFreq governor shall be loaded at
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 00476e94db90..438c9e75a04d 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -51,6 +51,23 @@ config X86_AMD_PSTATE
 
 	  If in doubt, say N.
 
+config X86_AMD_PSTATE_DEFAULT_MODE
+	int "AMD Processor P-State default mode"
+	depends on X86_AMD_PSTATE
+	default 3 if X86_AMD_PSTATE
+	range 1 4
+	help
+	  Select the default mode the amd-pstate driver will use on
+	  supported hardware.
+	  The value set has the following meanings:
+		1 -> Disabled
+		2 -> Passive
+		3 -> Active (EPP)
+		4 -> Guided
+
+	  For details, take a look at:
+	  .
+
 config X86_AMD_PSTATE_UT
 	tristate "selftest for AMD Processor P-State driver"
 	depends on X86 && ACPI_PROCESSOR
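The new Kconfig knob feeds a plain integer into the driver, so its values have to stay in step with enum amd_pstate_mode. A minimal reference sketch of that mapping, taken from the include/linux/amd-pstate.h hunk at the end of this patch (the per-value comments are added here for illustration only):

/* How CONFIG_X86_AMD_PSTATE_DEFAULT_MODE lines up with the driver's modes. */
enum amd_pstate_mode {
	AMD_PSTATE_UNDEFINED = 0,	/* no mode chosen yet, not selectable from Kconfig */
	AMD_PSTATE_DISABLE,		/* Kconfig value 1 */
	AMD_PSTATE_PASSIVE,		/* Kconfig value 2 */
	AMD_PSTATE_ACTIVE,		/* Kconfig value 3, the default */
	AMD_PSTATE_GUIDED,		/* Kconfig value 4 */
	AMD_PSTATE_MAX,
};

amd_pstate_set_driver() below accepts exactly the range [AMD_PSTATE_DISABLE, AMD_PSTATE_MAX), which is why the Kconfig range is 1..4.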
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index ddd346a239e0..81fba0dcbee9 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -62,7 +62,8 @@ static struct cpufreq_driver *current_pstate_driver;
 static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
-static int cppc_state = AMD_PSTATE_DISABLE;
+static int cppc_state = AMD_PSTATE_UNDEFINED;
+static bool cppc_enabled;
 
 /*
  * AMD Energy Preference Performance (EPP)
@@ -228,7 +229,28 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
 
 static inline int pstate_enable(bool enable)
 {
-	return wrmsrl_safe(MSR_AMD_CPPC_ENABLE, enable);
+	int ret, cpu;
+	unsigned long logical_proc_id_mask = 0;
+
+	if (enable == cppc_enabled)
+		return 0;
+
+	for_each_present_cpu(cpu) {
+		unsigned long logical_id = topology_logical_die_id(cpu);
+
+		if (test_bit(logical_id, &logical_proc_id_mask))
+			continue;
+
+		set_bit(logical_id, &logical_proc_id_mask);
+
+		ret = wrmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_ENABLE,
+				enable);
+		if (ret)
+			return ret;
+	}
+
+	cppc_enabled = enable;
+	return 0;
 }
 
 static int cppc_enable(bool enable)
@@ -236,6 +258,9 @@ static int cppc_enable(bool enable)
 	int cpu, ret = 0;
 	struct cppc_perf_ctrls perf_ctrls;
 
+	if (enable == cppc_enabled)
+		return 0;
+
 	for_each_present_cpu(cpu) {
 		ret = cppc_set_enable(cpu, enable);
 		if (ret)
@@ -251,6 +276,7 @@ static int cppc_enable(bool enable)
 		}
 	}
 
+	cppc_enabled = enable;
 	return ret;
 }
 
@@ -1045,6 +1071,26 @@ static const struct attribute_group amd_pstate_global_attr_group = {
 	.attrs = pstate_global_attributes,
 };
 
+static bool amd_pstate_acpi_pm_profile_server(void)
+{
+	switch (acpi_gbl_FADT.preferred_profile) {
+	case PM_ENTERPRISE_SERVER:
+	case PM_SOHO_SERVER:
+	case PM_PERFORMANCE_SERVER:
+		return true;
+	}
+	return false;
+}
+
+static bool amd_pstate_acpi_pm_profile_undefined(void)
+{
+	if (acpi_gbl_FADT.preferred_profile == PM_UNSPECIFIED)
+		return true;
+	if (acpi_gbl_FADT.preferred_profile >= NR_PM_PROFILES)
+		return true;
+	return false;
+}
+
 static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 {
 	int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
@@ -1102,10 +1148,14 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 	policy->max = policy->cpuinfo.max_freq;
 
 	/*
-	 * Set the policy to powersave to provide a valid fallback value in case
+	 * Set the policy to provide a valid fallback value in case
 	 * the default cpufreq governor is neither powersave nor performance.
 	 */
-	policy->policy = CPUFREQ_POLICY_POWERSAVE;
+	if (amd_pstate_acpi_pm_profile_server() ||
+	    amd_pstate_acpi_pm_profile_undefined())
+		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
+	else
+		policy->policy = CPUFREQ_POLICY_POWERSAVE;
 
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
@@ -1356,10 +1406,29 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
 	.online		= amd_pstate_epp_cpu_online,
 	.suspend	= amd_pstate_epp_suspend,
 	.resume		= amd_pstate_epp_resume,
-	.name		= "amd_pstate_epp",
+	.name		= "amd-pstate-epp",
 	.attr		= amd_pstate_epp_attr,
 };
 
+static int __init amd_pstate_set_driver(int mode_idx)
+{
+	if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
+		cppc_state = mode_idx;
+		if (cppc_state == AMD_PSTATE_DISABLE)
+			pr_info("driver is explicitly disabled\n");
+
+		if (cppc_state == AMD_PSTATE_ACTIVE)
+			current_pstate_driver = &amd_pstate_epp_driver;
+
+		if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
+			current_pstate_driver = &amd_pstate_driver;
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static int __init amd_pstate_init(void)
 {
 	struct device *dev_root;
@@ -1367,15 +1436,6 @@ static int __init amd_pstate_init(void)
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
 		return -ENODEV;
 
-	/*
-	 * by default the pstate driver is disabled to load
-	 * enable the amd_pstate passive mode driver explicitly
-	 * with amd_pstate=passive or other modes in kernel command line
-	 */
-	if (cppc_state == AMD_PSTATE_DISABLE) {
-		pr_info("driver load is disabled, boot with specific mode to enable this\n");
-		return -ENODEV;
-	}
 
 	if (!acpi_cpc_valid()) {
 		pr_warn_once("the _CPC object is not present in SBIOS or ACPI disabled\n");
@@ -1386,6 +1446,33 @@ static int __init amd_pstate_init(void)
 	if (cpufreq_get_current_driver())
 		return -EEXIST;
 
+	switch (cppc_state) {
+	case AMD_PSTATE_UNDEFINED:
+		/* Disable on the following configs by default:
+		 * 1. Undefined platforms
+		 * 2. Server platforms
+		 * 3. Shared memory designs
+		 */
+		if (amd_pstate_acpi_pm_profile_undefined() ||
+		    amd_pstate_acpi_pm_profile_server() ||
+		    !boot_cpu_has(X86_FEATURE_CPPC)) {
+			pr_info("driver load is disabled, boot with specific mode to enable this\n");
+			return -ENODEV;
+		}
+		ret = amd_pstate_set_driver(CONFIG_X86_AMD_PSTATE_DEFAULT_MODE);
+		if (ret)
+			return ret;
+		break;
+	case AMD_PSTATE_DISABLE:
+		return -ENODEV;
+	case AMD_PSTATE_PASSIVE:
+	case AMD_PSTATE_ACTIVE:
+	case AMD_PSTATE_GUIDED:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	/* capability check */
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		pr_debug("AMD CPPC MSR based functionality is supported\n");
@@ -1438,21 +1525,7 @@ static int __init amd_pstate_param(char *str)
 	size = strlen(str);
 	mode_idx = get_mode_idx_from_str(str, size);
 
-	if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
-		cppc_state = mode_idx;
-		if (cppc_state == AMD_PSTATE_DISABLE)
-			pr_info("driver is explicitly disabled\n");
-
-		if (cppc_state == AMD_PSTATE_ACTIVE)
-			current_pstate_driver = &amd_pstate_epp_driver;
-
-		if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
-			current_pstate_driver = &amd_pstate_driver;
-
-		return 0;
-	}
-
-	return -EINVAL;
+	return amd_pstate_set_driver(mode_idx);
 }
 early_param("amd_pstate", amd_pstate_param);
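With cppc_state now starting out as AMD_PSTATE_UNDEFINED, the decision amd_pstate_init() makes when no amd_pstate= parameter was given boils down to roughly the following. This is a condensed sketch of the switch added above, not driver code verbatim, and example_resolve_default_mode() is a made-up name:

static int example_resolve_default_mode(void)
{
	if (amd_pstate_acpi_pm_profile_undefined() ||	/* unknown or out-of-range FADT profile */
	    amd_pstate_acpi_pm_profile_server() ||	/* SOHO/enterprise/performance servers */
	    !boot_cpu_has(X86_FEATURE_CPPC))		/* shared-memory (non-MSR) designs */
		return -ENODEV;				/* stay disabled by default */

	/* otherwise honour the build-time default, 3 = active (EPP) */
	return amd_pstate_set_driver(CONFIG_X86_AMD_PSTATE_DEFAULT_MODE);
}

An explicit amd_pstate=disable|passive|active|guided on the kernel command line still wins, because amd_pstate_param() is an early_param and moves cppc_state out of AMD_PSTATE_UNDEFINED before amd_pstate_init() looks at it.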
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6b52ebe5a890..50bbc969ffe5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2828,7 +2828,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	     (driver_data->setpolicy && (driver_data->target_index ||
 		    driver_data->target)) ||
 	     (!driver_data->get_intermediate != !driver_data->target_intermediate) ||
-	     (!driver_data->online != !driver_data->offline))
+	     (!driver_data->online != !driver_data->offline) ||
+		 (driver_data->adjust_perf && !driver_data->fast_switch))
 		return -EINVAL;
 
 	pr_debug("trying to register driver %s\n", driver_data->name);
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 2548ec92faa2..f29182512b98 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -824,6 +824,8 @@ static ssize_t store_energy_performance_preference(
 			err = cpufreq_start_governor(policy);
 			if (!ret)
 				ret = err;
+		} else {
+			ret = 0;
 		}
 	}
 
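The extra condition in cpufreq_register_driver() enforces the rule spelled out in the include/linux/cpufreq.h hunk at the end of this patch: ->adjust_perf is only usable when ->fast_switch exists as a fallback. A hypothetical driver wired like this, which would previously have registered, is now refused with -EINVAL (all example_* callbacks are stand-ins, not code in the tree):

#include <linux/cpufreq.h>

static int example_init(struct cpufreq_policy *policy) { return 0; }
static int example_verify(struct cpufreq_policy_data *policy) { return 0; }
static int example_target_index(struct cpufreq_policy *policy, unsigned int index) { return 0; }
static void example_adjust_perf(unsigned int cpu, unsigned long min_perf,
				unsigned long target_perf, unsigned long capacity) { }

static struct cpufreq_driver bad_example_driver = {
	.name		= "bad-example",
	.init		= example_init,
	.verify		= example_verify,
	.target_index	= example_target_index,
	.adjust_perf	= example_adjust_perf,
	/* .fast_switch is missing: schedutil would have nothing to fall back
	 * to if scale invariance gets disabled, hence the new -EINVAL. */
};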
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index aa2d19db2b1d..34201d7ef33e 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -199,6 +199,43 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
 	return __intel_idle(dev, drv, index);
 }
 
+static __always_inline int __intel_idle_hlt(struct cpuidle_device *dev,
+					    struct cpuidle_driver *drv, int index)
+{
+	raw_safe_halt();
+	raw_local_irq_disable();
+	return index;
+}
+
+/**
+ * intel_idle_hlt - Ask the processor to enter the given idle state using hlt.
+ * @dev: cpuidle device of the target CPU.
+ * @drv: cpuidle driver (assumed to point to intel_idle_driver).
+ * @index: Target idle state index.
+ *
+ * Use the HLT instruction to notify the processor that the CPU represented by
+ * @dev is idle and it can try to enter the idle state corresponding to @index.
+ *
+ * Must be called under local_irq_disable().
+ */
+static __cpuidle int intel_idle_hlt(struct cpuidle_device *dev,
+				    struct cpuidle_driver *drv, int index)
+{
+	return __intel_idle_hlt(dev, drv, index);
+}
+
+static __cpuidle int intel_idle_hlt_irq_on(struct cpuidle_device *dev,
+					   struct cpuidle_driver *drv, int index)
+{
+	int ret;
+
+	raw_local_irq_enable();
+	ret = __intel_idle_hlt(dev, drv, index);
+	raw_local_irq_disable();
+
+	return ret;
+}
+
 /**
  * intel_idle_s2idle - Ask the processor to enter the given idle state.
  * @dev: cpuidle device of the target CPU.
@@ -1242,6 +1279,25 @@ static struct cpuidle_state snr_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+static struct cpuidle_state vmguest_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "HLT",
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
+		.exit_latency = 5,
+		.target_residency = 10,
+		.enter = &intel_idle_hlt, },
+	{
+		.name = "C1L",
+		.desc = "Long HLT",
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 5,
+		.target_residency = 200,
+		.enter = &intel_idle_hlt, },
+	{
+		.enter = NULL }
+};
+
 static const struct idle_cpu idle_cpu_nehalem __initconst = {
 	.state_table = nehalem_cstates,
 	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
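The new helpers rely on the architectural guarantee behind "safe halt": STI only takes effect after the instruction that follows it, so an interrupt that becomes pending in the enable window still wakes the HLT instead of being missed. A minimal sketch of the pattern raw_safe_halt() provides (illustrative only; the kernel routes this through the irqflags and paravirt helpers):

static inline void example_safe_halt(void)
{
	/* Interrupts are recognized only once HLT is executing, so a wakeup
	 * interrupt cannot slip in between the enable and the halt. */
	asm volatile("sti; hlt" : : : "memory");
}

__intel_idle_hlt() then calls raw_local_irq_disable() on the way out so the CPU leaves the state with interrupts masked again; intel_idle_hlt_irq_on() wraps the same helper for states that should be entered with interrupts enabled.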
@@ -1839,6 +1895,66 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 	return true;
 }
 
+static void state_update_enter_method(struct cpuidle_state *state, int cstate)
+{
+	if (state->enter == intel_idle_hlt) {
+		if (force_irq_on) {
+			pr_info("forced intel_idle_irq for state %d\n", cstate);
+			state->enter = intel_idle_hlt_irq_on;
+		}
+		return;
+	}
+	if (state->enter == intel_idle_hlt_irq_on)
+		return; /* no update scenarios */
+
+	if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
+		/*
+		 * Combining XSTATE with the IBRS or IRQ_ENABLE flags
+		 * is not currently supported by this driver.
+		 */
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
+		state->enter = intel_idle_xstate;
+		return;
+	}
+
+	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
+			   state->flags & CPUIDLE_FLAG_IBRS) {
+		/*
+		 * IBRS mitigation requires that C-states are entered
+		 * with interrupts disabled.
+		 */
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
+		state->enter = intel_idle_ibrs;
+		return;
+	}
+
+	if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
+		state->enter = intel_idle_irq;
+		return;
+	}
+
+	if (force_irq_on) {
+		pr_info("forced intel_idle_irq for state %d\n", cstate);
+		state->enter = intel_idle_irq;
+	}
+}
+
+/*
+ * For mwait based states, we want to verify the cpuid data to see if the state
+ * is actually supported by this specific CPU.
+ * For non-mwait based states, this check should be skipped.
+ */
+static bool should_verify_mwait(struct cpuidle_state *state)
+{
+	if (state->enter == intel_idle_hlt)
+		return false;
+	if (state->enter == intel_idle_hlt_irq_on)
+		return false;
+
+	return true;
+}
+
 static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 {
 	int cstate;
@@ -1887,35 +2003,15 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 		}
 
 		mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
-		if (!intel_idle_verify_cstate(mwait_hint))
+		if (should_verify_mwait(&cpuidle_state_table[cstate]) && !intel_idle_verify_cstate(mwait_hint))
 			continue;
 
 		/* Structure copy. */
 		drv->states[drv->state_count] = cpuidle_state_table[cstate];
 		state = &drv->states[drv->state_count];
 
-		if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
-			/*
-			 * Combining with XSTATE with IBRS or IRQ_ENABLE flags
-			 * is not currently supported but this driver.
-			 */
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
-			state->enter = intel_idle_xstate;
-		} else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
-			   state->flags & CPUIDLE_FLAG_IBRS) {
-			/*
-			 * IBRS mitigation requires that C-states are entered
-			 * with interrupts disabled.
-			 */
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
-			state->enter = intel_idle_ibrs;
-		} else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
-			state->enter = intel_idle_irq;
-		} else if (force_irq_on) {
-			pr_info("forced intel_idle_irq for state %d\n", cstate);
-			state->enter = intel_idle_irq;
-		}
+		state_update_enter_method(state, cstate);
+
 		if ((disabled_states_mask & BIT(drv->state_count)) ||
 		    ((icpu->use_acpi || force_use_acpi) &&
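For context on why rewriting the ->enter pointer once at table-initialization time is all that is needed: the cpuidle core simply calls whatever function the selected state carries. A condensed sketch of that call site (simplified from drivers/cpuidle/cpuidle.c; timekeeping, s2idle and error handling omitted):

static int example_enter_state(struct cpuidle_device *dev,
			       struct cpuidle_driver *drv, int index)
{
	struct cpuidle_state *target = &drv->states[index];

	/* state_update_enter_method() above decides, per state, whether this
	 * ends up being intel_idle, intel_idle_irq, intel_idle_ibrs,
	 * intel_idle_xstate, intel_idle_hlt or intel_idle_hlt_irq_on. */
	return target->enter(dev, drv, index);
}

should_verify_mwait() exists for the same family of states: the HLT-based entries carry no meaningful MWAIT hint, and the VM-guest path is only taken when MWAIT is not exposed to the guest at all, so the CPUID sub-state check has nothing valid to inspect for them.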
@@ -2041,6 +2137,93 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
 		cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
 }
 
+/*
+ * Match up the latency and break even point of the bare metal (cpu based)
+ * states with the deepest VM available state.
+ *
+ * We only want to do this for the deepest state, the ones that have
+ * the TLB_FLUSHED flag set.
+ *
+ * All our short idle states are dominated by vmexit/vmenter latencies,
+ * not the underlying hardware latencies so we keep our values for these.
+ */
+static void matchup_vm_state_with_baremetal(void)
+{
+	int cstate;
+
+	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
+		int matching_cstate;
+
+		if (intel_idle_max_cstate_reached(cstate))
+			break;
+
+		if (!cpuidle_state_table[cstate].enter)
+			break;
+
+		if (!(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_TLB_FLUSHED))
+			continue;
+
+		for (matching_cstate = 0; matching_cstate < CPUIDLE_STATE_MAX; ++matching_cstate) {
+			if (!icpu->state_table[matching_cstate].enter)
+				break;
+			if (icpu->state_table[matching_cstate].exit_latency > cpuidle_state_table[cstate].exit_latency) {
+				cpuidle_state_table[cstate].exit_latency = icpu->state_table[matching_cstate].exit_latency;
+				cpuidle_state_table[cstate].target_residency = icpu->state_table[matching_cstate].target_residency;
+			}
+		}
+
+	}
+}
+
+
+static int __init intel_idle_vminit(const struct x86_cpu_id *id)
+{
+	int retval;
+
+	cpuidle_state_table = vmguest_cstates;
+
+	icpu = (const struct idle_cpu *)id->driver_data;
+
+	pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
+		 boot_cpu_data.x86_model);
+
+	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+	if (!intel_idle_cpuidle_devices)
+		return -ENOMEM;
+
+	/*
+	 * We don't know exactly what the host will do when we go idle, but as a worst estimate
+	 * we can assume that the exit latency of the deepest host state will be hit for our
+	 * deep (long duration) guest idle state.
+	 * The same logic applies to the break even point for the long duration guest idle state.
+	 * So let's copy these two properties from the table we found for the host CPU type.
+	 */
+	matchup_vm_state_with_baremetal();
+
+	intel_idle_cpuidle_driver_init(&intel_idle_driver);
+
+	retval = cpuidle_register_driver(&intel_idle_driver);
+	if (retval) {
+		struct cpuidle_driver *drv = cpuidle_get_driver();
+		printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
+		       drv ? drv->name : "none");
+		goto init_driver_fail;
+	}
+
+	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
+				   intel_idle_cpu_online, NULL);
+	if (retval < 0)
+		goto hp_setup_fail;
+
+	return 0;
+hp_setup_fail:
+	intel_idle_cpuidle_devices_uninit();
+	cpuidle_unregister_driver(&intel_idle_driver);
+init_driver_fail:
+	free_percpu(intel_idle_cpuidle_devices);
+	return retval;
+}
+
 static int __init intel_idle_init(void)
 {
 	const struct x86_cpu_id *id;
@@ -2059,6 +2242,8 @@ static int __init intel_idle_init(void)
 	id = x86_match_cpu(intel_idle_ids);
 	if (id) {
 		if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
+			if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+				return intel_idle_vminit(id);
 			pr_debug("Please enable MWAIT in BIOS SETUP\n");
 			return -ENODEV;
 		}
diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h
index e5dfb6f4de52..451f6276da49 100644
--- a/include/acpi/actbl.h
+++ b/include/acpi/actbl.h
@@ -307,7 +307,8 @@ enum acpi_preferred_pm_profiles {
 	PM_SOHO_SERVER = 5,
 	PM_APPLIANCE_PC = 6,
 	PM_PERFORMANCE_SERVER = 7,
-	PM_TABLET = 8
+	PM_TABLET = 8,
+	NR_PM_PROFILES = 9
 };
 
 /* Values for sleep_status and sleep_control registers (V5+ FADT) */
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index c10ebf8c42e6..446394f84606 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -94,7 +94,8 @@ struct amd_cpudata {
  * enum amd_pstate_mode - driver working mode of amd pstate
  */
 enum amd_pstate_mode {
-	AMD_PSTATE_DISABLE = 0,
+	AMD_PSTATE_UNDEFINED = 0,
+	AMD_PSTATE_DISABLE,
 	AMD_PSTATE_PASSIVE,
 	AMD_PSTATE_ACTIVE,
 	AMD_PSTATE_GUIDED,
@@ -102,6 +103,7 @@ enum amd_pstate_mode {
 };
 
 static const char * const amd_pstate_mode_string[] = {
+	[AMD_PSTATE_UNDEFINED]   = "undefined",
 	[AMD_PSTATE_DISABLE]     = "disable",
 	[AMD_PSTATE_PASSIVE]     = "passive",
 	[AMD_PSTATE_ACTIVE]      = "active",
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 26e2eb399484..172ff51c1b2a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -340,7 +340,10 @@ struct cpufreq_driver {
 	/*
 	 * ->fast_switch() replacement for drivers that use an internal
 	 * representation of performance levels and can pass hints other than
-	 * the target performance level to the hardware.
+	 * the target performance level to the hardware. This can only be set
+	 * if ->fast_switch is set too, because in those cases (under specific
+	 * conditions) scale invariance can be disabled, which causes the
+	 * schedutil governor to fall back to the latter.
 	 */
 	void		(*adjust_perf)(unsigned int cpu,
 				       unsigned long min_perf,
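A worked example of matchup_vm_state_with_baremetal() from the intel_idle changes above: the inner loop keeps adopting the exit_latency/target_residency pair of any host state whose exit latency exceeds the guest state's current value, so the TLB-flushing "Long HLT" state ends up with the numbers of the slowest host state, while the short guest C1 state is left at 5/10. With made-up host values C1 = 2/2, C1E = 10/20 and C6 = 133/600 (exit latency / target residency in microseconds), guest C1L is promoted from 5/200 to 133/600. A stand-alone model of just that loop, with hypothetical numbers:

#include <stdio.h>

struct example_state { const char *name; unsigned int exit_latency, target_residency; };

int main(void)
{
	/* made-up host table; real values come from the idle_cpu table matched
	 * for the host CPU model */
	struct example_state host[] = { {"C1", 2, 2}, {"C1E", 10, 20}, {"C6", 133, 600} };
	struct example_state guest_c1l = { "C1L", 5, 200 };	/* the CPUIDLE_FLAG_TLB_FLUSHED guest state */

	for (unsigned int i = 0; i < sizeof(host) / sizeof(host[0]); i++) {
		if (host[i].exit_latency > guest_c1l.exit_latency) {
			guest_c1l.exit_latency = host[i].exit_latency;
			guest_c1l.target_residency = host[i].target_residency;
		}
	}

	printf("%s -> %u/%u\n", guest_c1l.name,
	       guest_c1l.exit_latency, guest_c1l.target_residency);	/* prints C1L -> 133/600 */
	return 0;
}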