diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index 03991eeff643..3006f4324808 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -54,6 +54,9 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu) struct acpi_lpi_state *lpi; struct acpi_processor *pr = per_cpu(processors, cpu); + if (unlikely(!pr || !pr->flags.has_lpi)) + return -EINVAL; + /* * If the PSCI cpu_suspend function hook has not been initialized * idle states must not be enabled, so bail out @@ -61,9 +64,6 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu) if (!psci_ops.cpu_suspend) return -EOPNOTSUPP; - if (unlikely(!pr || !pr->flags.has_lpi)) - return -EINVAL; - count = pr->power.count - 1; if (count <= 0) return -ENODEV; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index f8e9fa82cb9b..32b20efff5f8 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -1080,6 +1080,11 @@ static int flatten_lpi_states(struct acpi_processor *pr, return 0; } +int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu) +{ + return -EOPNOTSUPP; +} + static int acpi_processor_get_lpi_info(struct acpi_processor *pr) { int ret, i; @@ -1088,6 +1093,11 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr) struct acpi_device *d = NULL; struct acpi_lpi_states_array info[2], *tmp, *prev, *curr; + /* make sure our architecture has support */ + ret = acpi_processor_ffh_lpi_probe(pr->id); + if (ret == -EOPNOTSUPP) + return ret; + if (!osc_pc_lpi_support_confirmed) return -EOPNOTSUPP; @@ -1139,11 +1149,6 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr) return 0; } -int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu) -{ - return -ENODEV; -} - int __weak acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) { return -ENODEV; diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 08515f7e515f..b6bd0ff35323 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -146,7 +146,7 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy) /************************** sysfs interface ************************/ -static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set, +static ssize_t sampling_down_factor_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -161,7 +161,7 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set, return count; } -static ssize_t store_up_threshold(struct gov_attr_set *attr_set, +static ssize_t up_threshold_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -177,7 +177,7 @@ static ssize_t store_up_threshold(struct gov_attr_set *attr_set, return count; } -static ssize_t store_down_threshold(struct gov_attr_set *attr_set, +static ssize_t down_threshold_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -195,7 +195,7 @@ static ssize_t store_down_threshold(struct gov_attr_set *attr_set, return count; } -static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set, +static ssize_t ignore_nice_load_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -220,7 +220,7 @@ static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set, return count; } -static ssize_t store_freq_step(struct gov_attr_set *attr_set, const char *buf, +static ssize_t freq_step_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 63f7c219062b..0d42cf8b88d8 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(gov_dbs_data_mutex); /* Common sysfs tunables */ /* - * store_sampling_rate - update sampling rate effective immediately if needed. + * sampling_rate_store - update sampling rate effective immediately if needed. * * If new rate is smaller than the old, simply updating * dbs.sampling_rate might not be appropriate. For example, if the @@ -41,7 +41,7 @@ static DEFINE_MUTEX(gov_dbs_data_mutex); * This must be called with dbs_data->mutex held, otherwise traversing * policy_dbs_list isn't safe. */ -ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf, +ssize_t sampling_rate_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -80,7 +80,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf, return count; } -EXPORT_SYMBOL_GPL(store_sampling_rate); +EXPORT_SYMBOL_GPL(sampling_rate_store); /** * gov_update_cpu_data - Update CPU load data. diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index bab8e6140377..a5a0bc3cc23e 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -51,7 +51,7 @@ static inline struct dbs_data *to_dbs_data(struct gov_attr_set *attr_set) } #define gov_show_one(_gov, file_name) \ -static ssize_t show_##file_name \ +static ssize_t file_name##_show \ (struct gov_attr_set *attr_set, char *buf) \ { \ struct dbs_data *dbs_data = to_dbs_data(attr_set); \ @@ -60,7 +60,7 @@ static ssize_t show_##file_name \ } #define gov_show_one_common(file_name) \ -static ssize_t show_##file_name \ +static ssize_t file_name##_show \ (struct gov_attr_set *attr_set, char *buf) \ { \ struct dbs_data *dbs_data = to_dbs_data(attr_set); \ @@ -68,12 +68,10 @@ static ssize_t show_##file_name \ } #define gov_attr_ro(_name) \ -static struct governor_attr _name = \ -__ATTR(_name, 0444, show_##_name, NULL) +static struct governor_attr _name = __ATTR_RO(_name) #define gov_attr_rw(_name) \ -static struct governor_attr _name = \ -__ATTR(_name, 0644, show_##_name, store_##_name) +static struct governor_attr _name = __ATTR_RW(_name) /* Common to all CPUs of a policy */ struct policy_dbs_info { @@ -176,7 +174,7 @@ void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), unsigned int powersave_bias); void od_unregister_powersave_bias_handler(void); -ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf, +ssize_t sampling_rate_store(struct gov_attr_set *attr_set, const char *buf, size_t count); void gov_update_cpu_data(struct dbs_data *dbs_data); #endif /* _CPUFREQ_GOVERNOR_H */ diff --git a/drivers/cpufreq/cpufreq_governor_attr_set.c b/drivers/cpufreq/cpufreq_governor_attr_set.c index a6f365b9cc1a..771770ea0ed0 100644 --- a/drivers/cpufreq/cpufreq_governor_attr_set.c +++ b/drivers/cpufreq/cpufreq_governor_attr_set.c @@ -8,11 +8,6 @@ #include "cpufreq_governor.h" -static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj) -{ - return container_of(kobj, struct gov_attr_set, kobj); -} - static inline struct governor_attr *to_gov_attr(struct attribute *attr) { return container_of(attr, struct governor_attr, attr); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 6a41ea4729b8..e8fbf970ff07 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -202,7 +202,7 @@ static unsigned int od_dbs_update(struct cpufreq_policy *policy) /************************** sysfs interface ************************/ static struct dbs_governor od_dbs_gov; -static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf, +static ssize_t io_is_busy_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -220,7 +220,7 @@ static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf, return count; } -static ssize_t store_up_threshold(struct gov_attr_set *attr_set, +static ssize_t up_threshold_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -237,7 +237,7 @@ static ssize_t store_up_threshold(struct gov_attr_set *attr_set, return count; } -static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set, +static ssize_t sampling_down_factor_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -265,7 +265,7 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set, return count; } -static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set, +static ssize_t ignore_nice_load_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); @@ -290,7 +290,7 @@ static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set, return count; } -static ssize_t store_powersave_bias(struct gov_attr_set *attr_set, +static ssize_t powersave_bias_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct dbs_data *dbs_data = to_dbs_data(attr_set); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index bc7f7e6759bd..846bb3a78788 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1692,6 +1692,37 @@ static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata) } } +static void intel_pstate_update_epp_defaults(struct cpudata *cpudata) +{ + cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); + + /* + * If this CPU gen doesn't call for change in balance_perf + * EPP return. + */ + if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE) + return; + + /* + * If powerup EPP is something other than chipset default 0x80 and + * - is more performance oriented than 0x80 (default balance_perf EPP) + * - But less performance oriented than performance EPP + * then use this as new balance_perf EPP. + */ + if (cpudata->epp_default < HWP_EPP_BALANCE_PERFORMANCE && + cpudata->epp_default > HWP_EPP_PERFORMANCE) { + epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = cpudata->epp_default; + return; + } + + /* + * Use hard coded value per gen to update the balance_perf + * and default EPP. + */ + cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE]; + intel_pstate_set_epp(cpudata, cpudata->epp_default); +} + static void intel_pstate_hwp_enable(struct cpudata *cpudata) { /* First disable HWP notification interrupt till we activate again */ @@ -1705,12 +1736,7 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata) if (cpudata->epp_default >= 0) return; - if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE) { - cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); - } else { - cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE]; - intel_pstate_set_epp(cpudata, cpudata->epp_default); - } + intel_pstate_update_epp_defaults(cpudata); } static int atom_get_min_pstate(void) diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c index c538a153ee82..3e000e1a75c6 100644 --- a/drivers/cpufreq/longhaul.c +++ b/drivers/cpufreq/longhaul.c @@ -668,9 +668,9 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle, u32 nesting_level, void *context, void **return_value) { - struct acpi_device *d; + struct acpi_device *d = acpi_fetch_acpi_dev(obj_handle); - if (acpi_bus_get_device(obj_handle, &d)) + if (!d) return 0; *return_value = acpi_driver_data(d); diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c index 12ab4014af71..d289036beff2 100644 --- a/drivers/cpufreq/powernow-k8.c +++ b/drivers/cpufreq/powernow-k8.c @@ -1172,14 +1172,14 @@ static int powernowk8_init(void) unsigned int i, supported_cpus = 0; int ret; + if (!x86_match_cpu(powernow_k8_ids)) + return -ENODEV; + if (boot_cpu_has(X86_FEATURE_HW_PSTATE)) { __request_acpi_cpufreq(); return -ENODEV; } - if (!x86_match_cpu(powernow_k8_ids)) - return -ENODEV; - cpus_read_lock(); for_each_online_cpu(i) { smp_call_function_single(i, check_supported_cpu, &ret, 1); diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index fcc53215bac8..3a39a7f48b77 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -108,11 +108,11 @@ static int __init haltpoll_init(void) if (boot_option_idle_override != IDLE_NO_OVERRIDE) return -ENODEV; - cpuidle_poll_state_init(drv); - if (!kvm_para_available() || !haltpoll_want()) return -ENODEV; + cpuidle_poll_state_init(drv); + ret = cpuidle_register_driver(drv); if (ret < 0) return ret; diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 0b66e25c0e2d..b7640cfe0020 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -64,6 +64,7 @@ static struct cpuidle_driver intel_idle_driver = { /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; +static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; @@ -121,9 +122,6 @@ static unsigned int mwait_substates __initdata; * If the local APIC timer is not known to be reliable in the target idle state, * enable one-shot tick broadcasting for the target CPU before executing MWAIT. * - * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to - * flushing user TLBs. - * * Must be called under local_irq_disable(). */ static __cpuidle int intel_idle(struct cpuidle_device *dev, @@ -761,6 +759,46 @@ static struct cpuidle_state icx_cstates[] __initdata = { .enter = NULL } }; +/* + * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice + * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in + * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1 + * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then + * both C1 and C1E requests end up with C1, so there is effectively no C1E. + * + * By default we enable C1 and disable C1E by marking it with + * 'CPUIDLE_FLAG_UNUSABLE'. + */ +static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | + CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 290, + .target_residency = 800, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", @@ -1104,6 +1142,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_spr __initconst = { + .state_table = spr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1166,6 +1210,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), @@ -1353,6 +1398,8 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ +static void c1e_promotion_enable(void); + /** * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * @@ -1523,6 +1570,41 @@ static void __init skx_idle_state_table_update(void) } } +/** + * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. + */ +static void __init spr_idle_state_table_update(void) +{ + unsigned long long msr; + + /* Check if user prefers C1E over C1. */ + if (preferred_states_mask & BIT(2)) { + if (preferred_states_mask & BIT(1)) + /* Both can't be enabled, stick to the defaults. */ + return; + + spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; + spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; + + /* Enable C1E using the "C1E promotion" bit. */ + c1e_promotion_enable(); + disable_promotion_to_c1e = false; + } + + /* + * By default, the C6 state assumes the worst-case scenario of package + * C6. However, if PC6 is disabled, we update the numbers to match + * core C6. + */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* Limit value 2 and above allow for PC6. */ + if ((msr & 0x7) < 2) { + spr_cstates[2].exit_latency = 190; + spr_cstates[2].target_residency = 600; + } +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; @@ -1557,6 +1639,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SKYLAKE_X: skx_idle_state_table_update(); break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + spr_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { @@ -1629,6 +1714,15 @@ static void auto_demotion_disable(void) wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); } +static void c1e_promotion_enable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits |= 0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -1798,3 +1892,14 @@ module_param(max_cstate, int, 0444); */ module_param_named(states_off, disabled_states_mask, uint, 0444); MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); +/* + * Some platforms come with mutually exclusive C-states, so that if one is + * enabled, the other C-states must not be used. Example: C1 and C1E on + * Sapphire Rapids platform. This parameter allows for selecting the + * preferred C-states among the groups of mutually exclusive C-states - the + * selected C-states will be registered, the other C-states from the mutually + * exclusive group won't be registered. If the platform has no mutually + * exclusive C-states, this parameter has no effect. + */ +module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); +MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 3522a272b74d..35c7d6db4139 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -661,6 +661,11 @@ struct gov_attr_set { /* sysfs ops for cpufreq governors */ extern const struct sysfs_ops governor_sysfs_ops; +static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj) +{ + return container_of(kobj, struct gov_attr_set, kobj); +} + void gov_attr_set_init(struct gov_attr_set *attr_set, struct list_head *list_node); void gov_attr_set_get(struct gov_attr_set *attr_set, struct list_head *list_node); unsigned int gov_attr_set_put(struct gov_attr_set *attr_set, struct list_head *list_node); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 26778884d9ab..cffcd08f4ec8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov); static void sugov_tunables_free(struct kobject *kobj) { - struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); kfree(to_sugov_tunables(attr_set)); }