From db9ea3b22315b74fd682d0c381a6e2ad09a105e3 Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Tue, 19 Mar 2024 16:01:53 +0800 Subject: [PATCH 01/68] cpufreq: Use a smaller freq for the policy->max when verify When a driver uses cpufreq_frequency_table_verify() as the cpufreq_driver->verify callback, policy->max may end up bigger than the freq_qos max freq, as follows: unisoc:/sys/devices/system/cpu/cpufreq/policy0 # cat scaling_available_frequencies 614400 768000 988000 1228800 1469000 1586000 1690000 1833000 2002000 2093000 unisoc:/sys/devices/system/cpu/cpufreq/policy0 # echo 1900000 > scaling_max_freq unisoc:/sys/devices/system/cpu/cpufreq/policy0 # echo 1900000 > scaling_min_freq unisoc:/sys/devices/system/cpu/cpufreq/policy0 # cat scaling_max_freq 2002000 unisoc:/sys/devices/system/cpu/cpufreq/policy0 # cat scaling_min_freq 2002000 When the user sets qos_min and qos_max to the same value, and that value is not in the freq-table, the above scenario will occur. This is because cpufreq_frequency_table_verify(), when it cannot find the freq in the table, changes policy->max to a bigger freq: as above, because there is no 1.9G in the freq-table, policy->max is set to 2.002G. As a result, cpufreq_policy->max is bigger than the user's qos_max. This is unreasonable. So use a smaller freq when the freq cannot be found in the freq-table, to prevent policy->max from exceeding the qos max freq. Signed-off-by: Xuewen Yan Acked-by: Viresh Kumar Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/freq_table.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index c17dc51a5a02..40e146942f3e 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -70,7 +70,7 @@ int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy, struct cpufreq_frequency_table *table) { struct cpufreq_frequency_table *pos; - unsigned int freq, next_larger = ~0; + unsigned int freq, prev_smaller = 0; bool found = false; pr_debug("request for verification of policy (%u - %u kHz) for cpu %u\n", @@ -86,12 +86,12 @@ int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy, break; } - if ((next_larger > freq) && (freq > policy->max)) - next_larger = freq; + if ((prev_smaller < freq) && (freq <= policy->max)) + prev_smaller = freq; } if (!found) { - policy->max = next_larger; + policy->max = prev_smaller; cpufreq_verify_within_cpu_limits(policy); } From eb68d909d53eed0ec9722fcb18747647ca33a18f Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 25 Mar 2024 17:09:53 -0500 Subject: [PATCH 02/68] Documentation: PM: Update platform_pci_wakeup_init() reference platform_pci_wakeup_init() was removed by d2e5f0c16ad6 ("ACPI / PCI: Rework the setup and cleanup of device wakeup") but was still mentioned in the documentation. Update the doc to refer to pci_acpi_setup(), which does the equivalent work. Signed-off-by: Bjorn Helgaas Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- Documentation/power/pci.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst index 12070320307e..e2c1fb8a569a 100644 --- a/Documentation/power/pci.rst +++ b/Documentation/power/pci.rst @@ -333,7 +333,7 @@ struct pci_dev. 
The PCI subsystem's first task related to device power management is to prepare the device for power management and initialize the fields of struct pci_dev used for this purpose. This happens in two functions defined in -drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). +drivers/pci/, pci_pm_init() and pci_acpi_setup(). The first of these functions checks if the device supports native PCI PM and if that's the case the offset of its power management capability structure From f186b2dace86f36cc08872b693185eaf71128898 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2024 20:29:43 +0100 Subject: [PATCH 03/68] cpufreq: intel_pstate: Drop redundant locking from intel_pstate_driver_cleanup() Remove the spinlock locking from intel_pstate_driver_cleanup() as it is not necessary because no other code accessing all_cpu_data[] can run in parallel with that function. Had the locking been necessary, though, it would have been incorrect because the lock in question is acquired from a hardirq handler and it cannot be acquired from thread context without disabling interrupts. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index dbbf299f4219..bcbeed92458d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3135,10 +3135,8 @@ static void intel_pstate_driver_cleanup(void) if (intel_pstate_driver == &intel_pstate) intel_pstate_clear_update_util_hook(cpu); - spin_lock(&hwp_notify_lock); kfree(all_cpu_data[cpu]); WRITE_ONCE(all_cpu_data[cpu], NULL); - spin_unlock(&hwp_notify_lock); } } cpus_read_unlock(); From 12ebba42d2f1eadc0f897ffeb6dbcfaf2449e107 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 21 Mar 2024 20:30:42 +0100 Subject: [PATCH 04/68] cpufreq: intel_pstate: Simplify spinlock locking Because intel_pstate_enable/disable_hwp_interrupt() are only called from thread context, they need not save the IRQ flags when using a spinlock as interrupts are guaranteed to be enabled when they run, so make them use spin_lock/unlock_irq(). Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index bcbeed92458d..c0abf77c56ba 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1682,30 +1682,26 @@ ack_intr: static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata) { - unsigned long flags; - if (!boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) return; /* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */ wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); - spin_lock_irqsave(&hwp_notify_lock, flags); + spin_lock_irq(&hwp_notify_lock); if (cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask)) cancel_delayed_work(&cpudata->hwp_notify_work); - spin_unlock_irqrestore(&hwp_notify_lock, flags); + spin_unlock_irq(&hwp_notify_lock); } static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata) { /* Enable HWP notification interrupt for guaranteed performance change */ if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) { - unsigned long flags; - - spin_lock_irqsave(&hwp_notify_lock, flags); + spin_lock_irq(&hwp_notify_lock); INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work); cpumask_set_cpu(cpudata->cpu, &hwp_intr_enable_mask); - spin_unlock_irqrestore(&hwp_notify_lock, flags); + spin_unlock_irq(&hwp_notify_lock); /* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */ wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01); From 432acb219af4edecdd11d360f30b7cc643524db8 Mon Sep 17 00:00:00 2001 
From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2024 20:32:02 +0100 Subject: [PATCH 05/68] cpufreq: intel_pstate: Wait for canceled delayed work to complete Make intel_pstate_disable_hwp_interrupt() wait for canceled delayed work to complete to avoid leftover work items running when it returns which may be during driver unregistration and may confuse things going forward. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index c0abf77c56ba..b702430dac29 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1682,6 +1682,8 @@ ack_intr: static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata) { + bool cancel_work; + if (!boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) return; @@ -1689,9 +1691,11 @@ static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata) wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); spin_lock_irq(&hwp_notify_lock); - if (cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask)) - cancel_delayed_work(&cpudata->hwp_notify_work); + cancel_work = cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask); spin_unlock_irq(&hwp_notify_lock); + + if (cancel_work) + cancel_delayed_work_sync(&cpudata->hwp_notify_work); } static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata) From 0f2828e17b6f41b8b345f0031e3fe58529991748 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Mar 2024 19:52:06 +0100 Subject: [PATCH 06/68] cpufreq: intel_pstate: Get rid of unnecessary READ_ONCE() annotations Drop two redundant checks involving READ_ONCE() from notify_hwp_interrupt() and make it check hwp_active without READ_ONCE() which is not necessary, because that variable is only set once during the early initialization of the driver. In order to make that clear, annotate hwp_active with __ro_after_init. Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index b702430dac29..fa707a207c8e 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -292,7 +292,7 @@ struct pstate_funcs { static struct pstate_funcs pstate_funcs __read_mostly; -static int hwp_active __read_mostly; +static bool hwp_active __ro_after_init; static int hwp_mode_bdw __read_mostly; static bool per_cpu_limits __read_mostly; static bool hwp_boost __read_mostly; @@ -1636,11 +1636,10 @@ static cpumask_t hwp_intr_enable_mask; void notify_hwp_interrupt(void) { unsigned int this_cpu = smp_processor_id(); - struct cpudata *cpudata; unsigned long flags; u64 value; - if (!READ_ONCE(hwp_active) || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) + if (!hwp_active || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) return; rdmsrl_safe(MSR_HWP_STATUS, &value); @@ -1652,24 +1651,8 @@ void notify_hwp_interrupt(void) if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask)) goto ack_intr; - /* - * Currently we never free all_cpu_data. And we can't reach here - * without this allocated. But for safety for future changes, added - * check. - */ - if (unlikely(!READ_ONCE(all_cpu_data))) - goto ack_intr; - - /* - * The free is done during cleanup, when cpufreq registry is failed. - * We wouldn't be here if it fails on init or switch status. But for - * future changes, added check. - */ - cpudata = READ_ONCE(all_cpu_data[this_cpu]); - if (unlikely(!cpudata)) - goto ack_intr; - - schedule_delayed_work(&cpudata->hwp_notify_work, msecs_to_jiffies(10)); + schedule_delayed_work(&all_cpu_data[this_cpu]->hwp_notify_work, + msecs_to_jiffies(10)); spin_unlock_irqrestore(&hwp_notify_lock, flags); @@ -3464,7 +3447,7 @@ static int __init intel_pstate_init(void) * deal with it. 
*/ if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) { - WRITE_ONCE(hwp_active, 1); + hwp_active = true; hwp_mode_bdw = id->driver_data; intel_pstate.attr = hwp_cpufreq_attrs; intel_cpufreq.attr = hwp_cpufreq_attrs; From e97a98238da68aea4a0be0b2cc40e39527c880b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2024 20:34:06 +0100 Subject: [PATCH 07/68] cpufreq: intel_pstate: Use __ro_after_init for three variables There are at least 3 variables in intel_pstate that do not get updated after they have been initialized, so annotate them with __ro_after_init. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index fa707a207c8e..a9e36bbea4fa 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -293,10 +293,10 @@ struct pstate_funcs { static struct pstate_funcs pstate_funcs __read_mostly; static bool hwp_active __ro_after_init; -static int hwp_mode_bdw __read_mostly; -static bool per_cpu_limits __read_mostly; +static int hwp_mode_bdw __ro_after_init; +static bool per_cpu_limits __ro_after_init; +static bool hwp_forced __ro_after_init; static bool hwp_boost __read_mostly; -static bool hwp_forced __read_mostly; static struct cpufreq_driver *intel_pstate_driver __read_mostly; From 032c5565eb80edb6f2faeb31939540c897987119 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Mar 2024 18:01:58 +0100 Subject: [PATCH 08/68] cpufreq: intel_pstate: Fold intel_pstate_max_within_limits() into caller Fold intel_pstate_max_within_limits() into its only caller. No functional impact. Signed-off-by: Rafael J. 
Wysocki Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index a9e36bbea4fa..a401767bdf84 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2012,14 +2012,6 @@ static void intel_pstate_set_min_pstate(struct cpudata *cpu) intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); } -static void intel_pstate_max_within_limits(struct cpudata *cpu) -{ - int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio); - - update_turbo_state(); - intel_pstate_set_pstate(cpu, pstate); -} - static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) { int perf_ctl_max_phys = pstate_funcs.get_max_physical(cpu->cpu); @@ -2594,12 +2586,15 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) intel_pstate_update_perf_limits(cpu, policy->min, policy->max); if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { + int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio); + /* * NOHZ_FULL CPUs need this as the governor callback may not * be invoked on them. */ intel_pstate_clear_update_util_hook(policy->cpu); - intel_pstate_max_within_limits(cpu); + update_turbo_state(); + intel_pstate_set_pstate(cpu, pstate); } else { intel_pstate_set_update_util_hook(policy->cpu); } From 0940f1a8011fd69be5082015068e0dc31c800c20 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Mar 2024 18:02:42 +0100 Subject: [PATCH 09/68] cpufreq: intel_pstate: Do not update global.turbo_disabled after initialization The global.turbo_disabled is updated quite often, especially in the passive mode in which case it is updated every time the scheduler calls into the driver. However, this is generally not necessary and it adds MSR read overhead to scheduler code paths (and that particular MSR is slow to read). 
For this reason, make the driver read MSR_IA32_MISC_ENABLE_TURBO_DISABLE just once at the cpufreq driver registration time and remove all of the in-flight updates of global.turbo_disabled. Signed-off-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 51 ++++++---------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index a401767bdf84..7c00087e840d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -173,7 +173,6 @@ struct vid_data { * based on the MSR_IA32_MISC_ENABLE value and whether or * not the maximum reported turbo P-state is different from * the maximum reported non-turbo one. - * @turbo_disabled_mf: The @turbo_disabled value reflected by cpuinfo.max_freq. * @min_perf_pct: Minimum capacity limit in percent of the maximum turbo * P-state capacity. * @max_perf_pct: Maximum capacity limit in percent of the maximum turbo @@ -182,7 +181,6 @@ struct vid_data { struct global_params { bool no_turbo; bool turbo_disabled; - bool turbo_disabled_mf; int max_perf_pct; int min_perf_pct; }; @@ -594,12 +592,13 @@ static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu) cpu->pstate.min_pstate = intel_pstate_freq_to_hwp(cpu, freq); } -static inline void update_turbo_state(void) +static bool turbo_is_disabled(void) { u64 misc_en; rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); - global.turbo_disabled = misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE; + + return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); } static int min_perf_pct_min(void) @@ -1154,40 +1153,16 @@ static void intel_pstate_update_policies(void) static void __intel_pstate_update_max_freq(struct cpudata *cpudata, struct cpufreq_policy *policy) { - policy->cpuinfo.max_freq = global.turbo_disabled_mf ? + policy->cpuinfo.max_freq = global.turbo_disabled ? 
cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; refresh_frequency_limits(policy); } -static void intel_pstate_update_max_freq(unsigned int cpu) -{ - struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); - - if (!policy) - return; - - __intel_pstate_update_max_freq(all_cpu_data[cpu], policy); - - cpufreq_cpu_release(policy); -} - static void intel_pstate_update_limits(unsigned int cpu) { mutex_lock(&intel_pstate_driver_lock); - update_turbo_state(); - /* - * If turbo has been turned on or off globally, policy limits for - * all CPUs need to be updated to reflect that. - */ - if (global.turbo_disabled_mf != global.turbo_disabled) { - global.turbo_disabled_mf = global.turbo_disabled; - arch_set_max_freq_ratio(global.turbo_disabled); - for_each_possible_cpu(cpu) - intel_pstate_update_max_freq(cpu); - } else { - cpufreq_update_policy(cpu); - } + cpufreq_update_policy(cpu); mutex_unlock(&intel_pstate_driver_lock); } @@ -1287,7 +1262,6 @@ static ssize_t show_no_turbo(struct kobject *kobj, return -EAGAIN; } - update_turbo_state(); if (global.turbo_disabled) ret = sprintf(buf, "%u\n", global.turbo_disabled); else @@ -1317,7 +1291,6 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, mutex_lock(&intel_pstate_limits_lock); - update_turbo_state(); if (global.turbo_disabled) { pr_notice_once("Turbo disabled by BIOS or unavailable on processor\n"); mutex_unlock(&intel_pstate_limits_lock); @@ -2281,8 +2254,6 @@ static void intel_pstate_adjust_pstate(struct cpudata *cpu) struct sample *sample; int target_pstate; - update_turbo_state(); - target_pstate = get_target_pstate(cpu); target_pstate = intel_pstate_prepare_request(cpu, target_pstate); trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu); @@ -2593,7 +2564,6 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) * be invoked on them. 
*/ intel_pstate_clear_update_util_hook(policy->cpu); - update_turbo_state(); intel_pstate_set_pstate(cpu, pstate); } else { intel_pstate_set_update_util_hook(policy->cpu); @@ -2637,7 +2607,6 @@ static void intel_pstate_verify_cpu_policy(struct cpudata *cpu, { int max_freq; - update_turbo_state(); if (hwp_active) { intel_pstate_get_hwp_cap(cpu); max_freq = global.no_turbo || global.turbo_disabled ? @@ -2734,8 +2703,6 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) /* cpuinfo and default policy values */ policy->cpuinfo.min_freq = cpu->pstate.min_freq; - update_turbo_state(); - global.turbo_disabled_mf = global.turbo_disabled; policy->cpuinfo.max_freq = global.turbo_disabled ? cpu->pstate.max_freq : cpu->pstate.turbo_freq; @@ -2901,8 +2868,6 @@ static int intel_cpufreq_target(struct cpufreq_policy *policy, struct cpufreq_freqs freqs; int target_pstate; - update_turbo_state(); - freqs.old = policy->cur; freqs.new = target_freq; @@ -2924,8 +2889,6 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, struct cpudata *cpu = all_cpu_data[policy->cpu]; int target_pstate; - update_turbo_state(); - target_pstate = intel_pstate_freq_to_hwp(cpu, target_freq); target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, true); @@ -2943,7 +2906,6 @@ static void intel_cpufreq_adjust_perf(unsigned int cpunum, int old_pstate = cpu->pstate.current_pstate; int cap_pstate, min_pstate, max_pstate, target_pstate; - update_turbo_state(); cap_pstate = global.turbo_disabled ? 
HWP_GUARANTEED_PERF(hwp_cap) : HWP_HIGHEST_PERF(hwp_cap); @@ -3131,6 +3093,9 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver) memset(&global, 0, sizeof(global)); global.max_perf_pct = 100; + global.turbo_disabled = turbo_is_disabled(); + + arch_set_max_freq_ratio(global.turbo_disabled); intel_pstate_driver = driver; ret = cpufreq_register_driver(intel_pstate_driver); From c626a438452079824139f97137f17af47b1a8989 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Mar 2024 18:03:25 +0100 Subject: [PATCH 10/68] cpufreq: intel_pstate: Rearrange show_no_turbo() and store_no_turbo() Now that global.turbo_disabled can only change at the cpufreq driver registration time, initialize global.no_turbo at that time too so they are in sync to start with (if the former is set, the latter cannot be updated later anyway). That allows show_no_turbo() to be simplified because it does not need to check global.turbo_disabled, and store_no_turbo() can be rearranged to avoid doing anything if the new value of global.no_turbo is equal to the current one and only return an error on attempts to clear global.no_turbo when global.turbo_disabled is set. While at it, eliminate the redundant ret variable from store_no_turbo(). No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 7c00087e840d..357993ae1454 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1262,10 +1262,7 @@ static ssize_t show_no_turbo(struct kobject *kobj, return -EAGAIN; } - if (global.turbo_disabled) - ret = sprintf(buf, "%u\n", global.turbo_disabled); - else - ret = sprintf(buf, "%u\n", global.no_turbo); + ret = sprintf(buf, "%u\n", global.no_turbo); mutex_unlock(&intel_pstate_driver_lock); @@ -1276,31 +1273,34 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, const char *buf, size_t count) { unsigned int input; - int ret; + bool no_turbo; - ret = sscanf(buf, "%u", &input); - if (ret != 1) + if (sscanf(buf, "%u", &input) != 1) return -EINVAL; mutex_lock(&intel_pstate_driver_lock); if (!intel_pstate_driver) { - mutex_unlock(&intel_pstate_driver_lock); - return -EAGAIN; + count = -EAGAIN; + goto unlock_driver; } - mutex_lock(&intel_pstate_limits_lock); + no_turbo = !!clamp_t(int, input, 0, 1); + + if (no_turbo == global.no_turbo) + goto unlock_driver; if (global.turbo_disabled) { pr_notice_once("Turbo disabled by BIOS or unavailable on processor\n"); - mutex_unlock(&intel_pstate_limits_lock); - mutex_unlock(&intel_pstate_driver_lock); - return -EPERM; + count = -EPERM; + goto unlock_driver; } - global.no_turbo = clamp_t(int, input, 0, 1); + global.no_turbo = no_turbo; - if (global.no_turbo) { + mutex_lock(&intel_pstate_limits_lock); + + if (no_turbo) { struct cpudata *cpu = all_cpu_data[0]; int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate; @@ -1312,8 +1312,9 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, mutex_unlock(&intel_pstate_limits_lock); intel_pstate_update_policies(); - arch_set_max_freq_ratio(global.no_turbo); + 
arch_set_max_freq_ratio(no_turbo); +unlock_driver: mutex_unlock(&intel_pstate_driver_lock); return count; @@ -3094,6 +3095,7 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver) memset(&global, 0, sizeof(global)); global.max_perf_pct = 100; global.turbo_disabled = turbo_is_disabled(); + global.no_turbo = global.turbo_disabled; arch_set_max_freq_ratio(global.turbo_disabled); From 9558fae8ce97b3b320b387dd7c88309df2c36d4d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Mar 2024 18:04:24 +0100 Subject: [PATCH 11/68] cpufreq: intel_pstate: Read global.no_turbo under READ_ONCE() Because global.no_turbo is generally not read under intel_pstate_driver_lock make store_no_turbo() use WRITE_ONCE() for updating it (this is the only place at which it is updated except for the initialization) and make the majority of places reading it use READ_ONCE(). Also remove redundant global.turbo_disabled checks from places that depend on the 'true' value of global.no_turbo because it can only be 'true' if global.turbo_disabled is also 'true'. Signed-off-by: Rafael J. 
Wysocki Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 357993ae1454..3a707e34acd8 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1296,7 +1296,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, goto unlock_driver; } - global.no_turbo = no_turbo; + WRITE_ONCE(global.no_turbo, no_turbo); mutex_lock(&intel_pstate_limits_lock); @@ -1748,7 +1748,7 @@ static u64 atom_get_val(struct cpudata *cpudata, int pstate) u32 vid; val = (u64)pstate << 8; - if (global.no_turbo && !global.turbo_disabled) + if (READ_ONCE(global.no_turbo) && !global.turbo_disabled) val |= (u64)1 << 32; vid_fp = cpudata->vid.min + mul_fp( @@ -1913,7 +1913,7 @@ static u64 core_get_val(struct cpudata *cpudata, int pstate) u64 val; val = (u64)pstate << 8; - if (global.no_turbo && !global.turbo_disabled) + if (READ_ONCE(global.no_turbo) && !global.turbo_disabled) val |= (u64)1 << 32; return val; @@ -2211,7 +2211,7 @@ static inline int32_t get_target_pstate(struct cpudata *cpu) sample->busy_scaled = busy_frac * 100; - target = global.no_turbo || global.turbo_disabled ? + target = READ_ONCE(global.no_turbo) ? cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; target += target >> 2; target = mul_fp(target, busy_frac); @@ -2473,7 +2473,7 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu) static int intel_pstate_get_max_freq(struct cpudata *cpu) { - return global.turbo_disabled || global.no_turbo ? + return READ_ONCE(global.no_turbo) ? cpu->pstate.max_freq : cpu->pstate.turbo_freq; } @@ -2610,7 +2610,7 @@ static void intel_pstate_verify_cpu_policy(struct cpudata *cpu, if (hwp_active) { intel_pstate_get_hwp_cap(cpu); - max_freq = global.no_turbo || global.turbo_disabled ? + max_freq = READ_ONCE(global.no_turbo) ? 
cpu->pstate.max_freq : cpu->pstate.turbo_freq; } else { max_freq = intel_pstate_get_max_freq(cpu); From f32587dcbe5f40e160d8de262add6abab79356a7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Mar 2024 18:05:06 +0100 Subject: [PATCH 12/68] cpufreq: intel_pstate: Replace three global.turbo_disabled checks Replace the global.turbo_disabled check in __intel_pstate_update_max_freq() with a global.no_turbo one to make store_no_turbo() actually update the maximum CPU frequency on turbo preference changes, which needs to be consistent with arch_set_max_freq_ratio() called from there. For more consistency, replace the global.turbo_disabled checks in __intel_pstate_cpu_init() and intel_cpufreq_adjust_perf() with global.no_turbo checks as well. Signed-off-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3a707e34acd8..f1d6de05bcab 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1153,7 +1153,7 @@ static void intel_pstate_update_policies(void) static void __intel_pstate_update_max_freq(struct cpudata *cpudata, struct cpufreq_policy *policy) { - policy->cpuinfo.max_freq = global.turbo_disabled ? + policy->cpuinfo.max_freq = READ_ONCE(global.no_turbo) ? cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; refresh_frequency_limits(policy); } @@ -2704,7 +2704,7 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) /* cpuinfo and default policy values */ policy->cpuinfo.min_freq = cpu->pstate.min_freq; - policy->cpuinfo.max_freq = global.turbo_disabled ? + policy->cpuinfo.max_freq = READ_ONCE(global.no_turbo) ? 
cpu->pstate.max_freq : cpu->pstate.turbo_freq; policy->min = policy->cpuinfo.min_freq; @@ -2907,8 +2907,9 @@ static void intel_cpufreq_adjust_perf(unsigned int cpunum, int old_pstate = cpu->pstate.current_pstate; int cap_pstate, min_pstate, max_pstate, target_pstate; - cap_pstate = global.turbo_disabled ? HWP_GUARANTEED_PERF(hwp_cap) : - HWP_HIGHEST_PERF(hwp_cap); + cap_pstate = READ_ONCE(global.no_turbo) ? + HWP_GUARANTEED_PERF(hwp_cap) : + HWP_HIGHEST_PERF(hwp_cap); /* Optimization: Avoid unnecessary divisions. */ From e8217b4bece379e66d43ab5070431712f07bf625 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Mar 2024 19:52:45 +0100 Subject: [PATCH 13/68] cpufreq: intel_pstate: Update the maximum CPU frequency consistently There are 3 places at which the maximum CPU frequency may change, store_no_turbo(), intel_pstate_update_limits() (when called by the cpufreq core) and intel_pstate_notify_work() (when handling a HWP change notification). Currently, cpuinfo.max_freq is only updated by store_no_turbo() and intel_pstate_notify_work(), although in principle it may be necessary to update it in intel_pstate_update_limits() as well. Make all of them mutually consistent. Signed-off-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f1d6de05bcab..02f9e494e86e 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1153,18 +1153,32 @@ static void intel_pstate_update_policies(void) static void __intel_pstate_update_max_freq(struct cpudata *cpudata, struct cpufreq_policy *policy) { + intel_pstate_get_hwp_cap(cpudata); + policy->cpuinfo.max_freq = READ_ONCE(global.no_turbo) ? 
cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; + refresh_frequency_limits(policy); } static void intel_pstate_update_limits(unsigned int cpu) { - mutex_lock(&intel_pstate_driver_lock); + struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); - cpufreq_update_policy(cpu); + if (!policy) + return; - mutex_unlock(&intel_pstate_driver_lock); + __intel_pstate_update_max_freq(all_cpu_data[cpu], policy); + + cpufreq_cpu_release(policy); +} + +static void intel_pstate_update_limits_for_all(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + intel_pstate_update_limits(cpu); } /************************** sysfs begin ************************/ @@ -1311,7 +1325,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, mutex_unlock(&intel_pstate_limits_lock); - intel_pstate_update_policies(); + intel_pstate_update_limits_for_all(); arch_set_max_freq_ratio(no_turbo); unlock_driver: @@ -1595,7 +1609,6 @@ static void intel_pstate_notify_work(struct work_struct *work) struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpudata->cpu); if (policy) { - intel_pstate_get_hwp_cap(cpudata); __intel_pstate_update_max_freq(cpudata, policy); cpufreq_cpu_release(policy); From 8c556541a53848d6611ff8b5f9bf52e96c56f48e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:45 +0200 Subject: [PATCH 14/68] cpufreq: intel_pstate: hide unused intel_pstate_cpu_oob_ids[] The reference to this variable is hidden in an #ifdef: drivers/cpufreq/intel_pstate.c:2440:32: error: 'intel_pstate_cpu_oob_ids' defined but not used [-Werror=unused-const-variable=] Use the same check around the definition. Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 02f9e494e86e..5f19d3824a4b 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2397,6 +2397,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = { }; MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); +#ifdef CONFIG_ACPI static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { X86_MATCH(BROADWELL_D, core_funcs), X86_MATCH(BROADWELL_X, core_funcs), @@ -2405,6 +2406,7 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { X86_MATCH(SAPPHIRERAPIDS_X, core_funcs), {} }; +#endif static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = { X86_MATCH(KABYLAKE, core_funcs), From afde996a33ee4dbe3692e1eff28b56c820331428 Mon Sep 17 00:00:00 2001 From: Dhruva Gole Date: Mon, 18 Mar 2024 20:46:32 +0530 Subject: [PATCH 15/68] PM: wakeup: make device_wakeup_disable() return void The device_wakeup_disable() call only returns an error if no dev exists, but there's not much a user can do at that point. Rather, make this function return void. Signed-off-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 11 +++++++---- drivers/mmc/host/sdhci-pci-core.c | 2 +- include/linux/pm_wakeup.h | 5 ++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index a917219feea6..752b417e8129 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -451,16 +451,15 @@ static struct wakeup_source *device_wakeup_detach(struct device *dev) * Detach the @dev's wakeup source object from it, unregister this wakeup source * object and destroy it. 
*/ -int device_wakeup_disable(struct device *dev) +void device_wakeup_disable(struct device *dev) { struct wakeup_source *ws; if (!dev || !dev->power.can_wakeup) - return -EINVAL; + return; ws = device_wakeup_detach(dev); wakeup_source_unregister(ws); - return 0; } EXPORT_SYMBOL_GPL(device_wakeup_disable); @@ -502,7 +501,11 @@ EXPORT_SYMBOL_GPL(device_set_wakeup_capable); */ int device_set_wakeup_enable(struct device *dev, bool enable) { - return enable ? device_wakeup_enable(dev) : device_wakeup_disable(dev); + if (enable) + return device_wakeup_enable(dev); + + device_wakeup_disable(dev); + return 0; } EXPORT_SYMBOL_GPL(device_set_wakeup_enable); diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index 025b31aa712c..ef89ec382bfe 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -63,7 +63,7 @@ static int sdhci_pci_init_wakeup(struct sdhci_pci_chip *chip) if ((pm_flags & MMC_PM_KEEP_POWER) && (pm_flags & MMC_PM_WAKE_SDIO_IRQ)) return device_wakeup_enable(&chip->pdev->dev); else if (!cap_cd_wake) - return device_wakeup_disable(&chip->pdev->dev); + device_wakeup_disable(&chip->pdev->dev); return 0; } diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 6eb9adaef52b..428803eed798 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -107,7 +107,7 @@ extern void wakeup_sources_read_unlock(int idx); extern struct wakeup_source *wakeup_sources_walk_start(void); extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws); extern int device_wakeup_enable(struct device *dev); -extern int device_wakeup_disable(struct device *dev); +extern void device_wakeup_disable(struct device *dev); extern void device_set_wakeup_capable(struct device *dev, bool capable); extern int device_set_wakeup_enable(struct device *dev, bool enable); extern void __pm_stay_awake(struct wakeup_source *ws); @@ -154,10 +154,9 @@ static inline int device_wakeup_enable(struct 
device *dev) return 0; } -static inline int device_wakeup_disable(struct device *dev) +static inline void device_wakeup_disable(struct device *dev) { dev->power.should_wakeup = false; - return 0; } static inline int device_set_wakeup_enable(struct device *dev, bool enable) From 3642c7ed52312ac2b95c9aba45c40e50bd8798ad Mon Sep 17 00:00:00 2001 From: Dhruva Gole Date: Mon, 18 Mar 2024 20:46:33 +0530 Subject: [PATCH 16/68] PM: wakeup: Remove unnecessary else from device_init_wakeup() Checkpatch warns that else is generally not necessary after a return condition which exists in the if part of this function. Hence, just to abide by what checkpatch recommends, follow its guidelines. Signed-off-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeup.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 428803eed798..76cd1f9f1365 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -234,11 +234,10 @@ static inline int device_init_wakeup(struct device *dev, bool enable) if (enable) { device_set_wakeup_capable(dev, true); return device_wakeup_enable(dev); - } else { - device_wakeup_disable(dev); - device_set_wakeup_capable(dev, false); - return 0; } + device_wakeup_disable(dev); + device_set_wakeup_capable(dev, false); + return 0; } #endif /* _LINUX_PM_WAKEUP_H */ From e3ac0f367d5806af09d2070bb7951af2f59d1f52 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 3 Apr 2024 16:49:04 +0100 Subject: [PATCH 17/68] OPP: OF: Export dev_pm_opp_calc_power() for usage from EM There are device drivers which can modify voltage values for OPPs. It could be due to the chip binning and those drivers have specific chip knowledge about it. This adjustment can happen after Energy Model is registered, thus EM can have stale data about power. Export dev_pm_opp_calc_power() which can be used by Energy Model to calculate new power with the new voltage for OPPs.
Acked-by: Viresh Kumar Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/opp/of.c | 17 ++++++++++++----- include/linux/pm_opp.h | 8 ++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index f9f0b22bccbb..282eb5966fd0 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1494,20 +1494,26 @@ _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz) return 0; } -/* - * Callback function provided to the Energy Model framework upon registration. +/** + * dev_pm_opp_calc_power() - Calculate power value for device with EM + * @dev : Device for which an Energy Model has to be registered + * @uW : New power value that is calculated + * @kHz : Frequency for which the new power is calculated + * * This computes the power estimated by @dev at @kHz if it is the frequency * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled * frequency and @uW to the associated power. The power is estimated as * P = C * V^2 * f with C being the device's capacitance and V and f * respectively the voltage and frequency of the OPP. + * It is also used as a callback function provided to the Energy Model + * framework upon registration. * * Returns -EINVAL if the power calculation failed because of missing * parameters, 0 otherwise. 
*/ -static int __maybe_unused _get_power(struct device *dev, unsigned long *uW, - unsigned long *kHz) +int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, + unsigned long *kHz) { struct dev_pm_opp *opp; struct device_node *np; @@ -1544,6 +1550,7 @@ static int __maybe_unused _get_power(struct device *dev, unsigned long *uW, return 0; } +EXPORT_SYMBOL_GPL(dev_pm_opp_calc_power); static bool _of_has_opp_microwatt_property(struct device *dev) { @@ -1619,7 +1626,7 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) goto failed; } - EM_SET_ACTIVE_POWER_CB(em_cb, _get_power); + EM_SET_ACTIVE_POWER_CB(em_cb, dev_pm_opp_calc_power); register_em: ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus, true); diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 065a47382302..dd7c8441af42 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -476,6 +476,8 @@ struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp); int of_get_required_opp_performance_state(struct device_node *np, int index); int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table); int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus); +int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, + unsigned long *kHz); static inline void dev_pm_opp_of_unregister_em(struct device *dev) { em_dev_unregister_perf_domain(dev); @@ -539,6 +541,12 @@ static inline void dev_pm_opp_of_unregister_em(struct device *dev) { } +static inline int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, + unsigned long *kHz) +{ + return -EOPNOTSUPP; +} + static inline int of_get_required_opp_performance_state(struct device_node *np, int index) { return -EOPNOTSUPP; From d61c2695bddf56f4527f71cc6ebc31897be36cff Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 3 Apr 2024 16:49:05 +0100 Subject: [PATCH 18/68] PM: EM: Refactor em_adjust_new_capacity() Extract em_table_dup() and 
em_recalc_and_update() from em_adjust_new_capacity(). Both functions will be later reused by the 'update EM due to chip binning' functionality. Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 58 +++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 9e1c9aa399ea..6960dd7393b2 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -674,23 +674,15 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -/* - * Adjustment of CPU performance values after boot, when all CPUs capacites - * are correctly calculated. - */ -static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) +static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) { struct em_perf_table __rcu *em_table; struct em_perf_state *ps, *new_ps; - int ret, ps_size; + int ps_size; em_table = em_table_alloc(pd); - if (!em_table) { - dev_warn(dev, "EM: allocation failed\n"); - return; - } + if (!em_table) + return NULL; new_ps = em_table->state; @@ -702,24 +694,52 @@ static void em_adjust_new_capacity(struct device *dev, rcu_read_unlock(); - em_init_performance(dev, pd, new_ps, pd->nr_perf_states); - ret = em_compute_costs(dev, new_ps, NULL, pd->nr_perf_states, + return em_table; +} + +static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, + struct em_perf_table __rcu *em_table) +{ + int ret; + + ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states, pd->flags); - if (ret) { - dev_warn(dev, "EM: compute costs failed\n"); - return; - } + if (ret) + goto free_em_table; ret = em_dev_update_perf_domain(dev, em_table); if (ret) - dev_warn(dev, "EM: update failed %d\n", ret); + goto free_em_table; /* * This is one-time-update, so give up the ownership in this 
updater. * The EM framework has incremented the usage counter and from now * will keep the reference (then free the memory when needed). */ +free_em_table: em_table_free(em_table); + return ret; +} + +/* + * Adjustment of CPU performance values after boot, when all CPUs capacites + * are correctly calculated. + */ +static void em_adjust_new_capacity(struct device *dev, + struct em_perf_domain *pd, + u64 max_cap) +{ + struct em_perf_table __rcu *em_table; + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return; + } + + em_init_performance(dev, pd, em_table->state, pd->nr_perf_states); + + em_recalc_and_update(dev, pd, em_table); } static void em_check_capacity_update(void) From cf61d53b026805e8222ca28ac2795611eb7fa547 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 3 Apr 2024 16:49:06 +0100 Subject: [PATCH 19/68] PM: EM: Add em_dev_update_chip_binning() Add a function which allows to modify easily the EM after the new voltage information is available. The device drivers for the chip can adjust the voltage values after setup. The voltage for the same frequency in OPP can be different due to chip binning. The voltage impacts the power usage and the EM power values can be updated to reflect that. Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/energy_model.h | 5 ++++ kernel/power/energy_model.c | 48 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 70cd7258cd29..1ff52020cf75 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -172,6 +172,7 @@ struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table __rcu *table); int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int nr_states); +int em_dev_update_chip_binning(struct device *dev); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -386,6 +387,10 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, { return -EINVAL; } +static inline int em_dev_update_chip_binning(struct device *dev) +{ + return -EINVAL; +} #endif #endif diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 6960dd7393b2..927cc55ba0b3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -808,3 +808,51 @@ static void em_update_workfn(struct work_struct *work) { em_check_capacity_update(); } + +/** + * em_dev_update_chip_binning() - Update Energy Model after the new voltage + * information is present in the OPPs. + * @dev : Device for which the Energy Model has to be updated. + * + * This function allows to update easily the EM with new values available in + * the OPP framework and DT. It can be used after the chip has been properly + * verified by device drivers and the voltages adjusted for the 'chip binning'. 
+ */ +int em_dev_update_chip_binning(struct device *dev) +{ + struct em_perf_table __rcu *em_table; + struct em_perf_domain *pd; + int i, ret; + + if (IS_ERR_OR_NULL(dev)) + return -EINVAL; + + pd = em_pd_get(dev); + if (!pd) { + dev_warn(dev, "Couldn't find Energy Model\n"); + return -EINVAL; + } + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return -ENOMEM; + } + + /* Update power values which might change due to new voltage in OPPs */ + for (i = 0; i < pd->nr_perf_states; i++) { + unsigned long freq = em_table->state[i].frequency; + unsigned long power; + + ret = dev_pm_opp_calc_power(dev, &power, &freq); + if (ret) { + em_table_free(em_table); + return ret; + } + + em_table->state[i].power = power; + } + + return em_recalc_and_update(dev, pd, em_table); +} +EXPORT_SYMBOL_GPL(em_dev_update_chip_binning); From a5bb5e0877dee3595037eb8767b6bed047c898a5 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 3 Apr 2024 16:49:07 +0100 Subject: [PATCH 20/68] soc: samsung: exynos-asv: Update Energy Model after adjusting voltage When the voltage for OPPs is adjusted there is a need to also update Energy Model framework. The EM data contains power values which depend on voltage values. The EM structure is used for thermal (IPA governor) and in scheduler task placement (EAS) so it should reflect the real HW model as best as possible to operate properly. Based on data on Exynos5422 ASV tables the maximum power difference might be ~29%. An Odroid-XU4 (with a random sample SoC in this chip lottery) showed power difference for some OPPs ~20%. Therefore, it's worth to update the EM. Reviewed-by: Krzysztof Kozlowski Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- drivers/soc/samsung/exynos-asv.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/soc/samsung/exynos-asv.c b/drivers/soc/samsung/exynos-asv.c index d60af8acc391..97006cc3b946 100644 --- a/drivers/soc/samsung/exynos-asv.c +++ b/drivers/soc/samsung/exynos-asv.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -97,9 +98,16 @@ static int exynos_asv_update_opps(struct exynos_asv *asv) last_opp_table = opp_table; ret = exynos_asv_update_cpu_opps(asv, cpu); - if (ret < 0) + if (!ret) { + /* + * Update EM power values since OPP + * voltage values may have changed. + */ + em_dev_update_chip_binning(cpu); + } else { dev_err(asv->dev, "Couldn't udate OPPs for cpu%d\n", cpuid); + } } dev_pm_opp_put_opp_table(opp_table); From 5b9eda2b9aa8a2332305857604b6e4e5fd462449 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 5 Apr 2024 15:12:25 -0400 Subject: [PATCH 21/68] PM: sleep: Take advantage of %ps to simplify debug output initcall_debug previous and new output: ...PM: calling pci_pm_suspend+0x0/0x1b0 @ 3233, parent: pci0000:00 ...PM: calling pci_pm_suspend @ 3233, parent: pci0000:00 Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5679f966f676..4a67e83300e1 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -208,7 +208,7 @@ static ktime_t initcall_debug_start(struct device *dev, void *cb) if (!pm_print_times_enabled) return 0; - dev_info(dev, "calling %pS @ %i, parent: %s\n", cb, + dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? 
dev_name(dev->parent) : "none"); return ktime_get(); @@ -223,7 +223,7 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime, return; rettime = ktime_get(); - dev_info(dev, "%pS returned %d after %Ld usecs\n", cb, error, + dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } @@ -1927,7 +1927,7 @@ EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) - dev_err(dev, "%s(): %pS returns %d\n", function, fn, ret); + dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); From b8f85833c05730d631576008daaa34096bc7f3ce Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 12 Apr 2024 11:19:20 +0530 Subject: [PATCH 22/68] cpufreq: exit() callback is optional The exit() callback is optional and shouldn't be called without checking a valid pointer first. Also, we must clear freq_table pointer even if the exit() callback isn't present. Signed-off-by: Viresh Kumar Fixes: 91a12e91dc39 ("cpufreq: Allow light-weight tear down and bring up of CPUs") Fixes: f339f3541701 ("cpufreq: Rearrange locking in cpufreq_remove_dev()") Reported-by: Lizhe Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 66e10a19d76a..fd9c3ed21f49 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1679,10 +1679,13 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) */ if (cpufreq_driver->offline) { cpufreq_driver->offline(policy); - } else if (cpufreq_driver->exit) { - cpufreq_driver->exit(policy); - policy->freq_table = NULL; + return; } + + if (cpufreq_driver->exit) + cpufreq_driver->exit(policy); + + policy->freq_table = NULL; } static int cpufreq_offline(unsigned int cpu) @@ -1740,7 +1743,7 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) } /* We did light-weight exit earlier, do full tear down now */ - if (cpufreq_driver->offline) + if (cpufreq_driver->offline && cpufreq_driver->exit) cpufreq_driver->exit(policy); up_write(&policy->rwsem); From 0654acd8eb7de3d82d0e8dc0235f1c7a67577da4 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 15 Apr 2024 17:48:21 +0800 Subject: [PATCH 23/68] powercap: DTPM: Avoid explicit cpumask allocation on stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_weight_and() to avoid the need for a temporary cpumask on the stack. Signed-off-by: Dawei Li Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/dtpm_cpu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index bc90126f1b5f..6b6f51b21550 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -43,13 +43,11 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *pd = em_cpu_get(dtpm_cpu->cpu); struct em_perf_state *table; - struct cpumask cpus; unsigned long freq; u64 power; int i, nr_cpus; - cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); - nr_cpus = cpumask_weight(&cpus); + nr_cpus = cpumask_weight_and(cpu_online_mask, to_cpumask(pd->cpus)); rcu_read_lock(); table = em_perf_state_from_pd(pd); @@ -123,11 +121,9 @@ static int update_pd_power_uw(struct dtpm *dtpm) struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *em = em_cpu_get(dtpm_cpu->cpu); struct em_perf_state *table; - struct cpumask cpus; int nr_cpus; - cpumask_and(&cpus, cpu_online_mask, to_cpumask(em->cpus)); - nr_cpus = cpumask_weight(&cpus); + nr_cpus = cpumask_weight_and(cpu_online_mask, to_cpumask(em->cpus)); rcu_read_lock(); table = em_perf_state_from_pd(em); From 94baae2b91818ad3167a0ba429438242b196fb88 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 8 Apr 2024 12:05:48 +0800 Subject: [PATCH 24/68] powercap: intel_rapl: Add support for ArrowLake-H platform Add support for ArrowLake-H platform. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index a28d54fd5222..c02851c73751 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1263,6 +1263,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_INTEL_FAM6_MODEL(LUNARLAKE_M, &rapl_defaults_core), + X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core), From 72b8b94155d957f82697802555d53c142d82dece Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 8 Apr 2024 11:51:39 +0800 Subject: [PATCH 25/68] powercap: intel_rapl: Sort header files Sort header files alphabetically. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index c02851c73751..c4302caeb631 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -5,27 +5,27 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include +#include #include -#include -#include +#include +#include #include -#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include -#include #include #include +#include /* bitmasks for RAPL MSRs, used by primitive access functions */ #define ENERGY_STATUS_MASK 0xffffffff From b5e230aa8d0359bba49659c8358074755c6eb9e4 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Sun, 7 Apr 2024 22:15:29 +0200 Subject: [PATCH 26/68] cpufreq: tegra124: eliminate uses of of_node_put() Make use of the __free() cleanup handler to automatically free nodes when they get out of scope. Only the probe function is affected by this modification. Given that this mechanism requires the node to be initialized, its initialization and the value check have been moved to the top of the function. After removing uses of of_node_put(), the jump to out_put_np is no longer necessary.
Suggested-by: Julia Lawall Signed-off-by: Javier Carrasco Signed-off-by: Viresh Kumar --- drivers/cpufreq/tegra124-cpufreq.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/tegra124-cpufreq.c b/drivers/cpufreq/tegra124-cpufreq.c index aae951d4e77c..514146d98bca 100644 --- a/drivers/cpufreq/tegra124-cpufreq.c +++ b/drivers/cpufreq/tegra124-cpufreq.c @@ -52,12 +52,15 @@ out: static int tegra124_cpufreq_probe(struct platform_device *pdev) { + struct device_node *np __free(device_node) = of_cpu_device_node_get(0); struct tegra124_cpufreq_priv *priv; - struct device_node *np; struct device *cpu_dev; struct platform_device_info cpufreq_dt_devinfo = {}; int ret; + if (!np) + return -ENODEV; + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -66,15 +69,9 @@ static int tegra124_cpufreq_probe(struct platform_device *pdev) if (!cpu_dev) return -ENODEV; - np = of_cpu_device_node_get(0); - if (!np) - return -ENODEV; - priv->cpu_clk = of_clk_get_by_name(np, "cpu_g"); - if (IS_ERR(priv->cpu_clk)) { - ret = PTR_ERR(priv->cpu_clk); - goto out_put_np; - } + if (IS_ERR(priv->cpu_clk)) + return PTR_ERR(priv->cpu_clk); priv->dfll_clk = of_clk_get_by_name(np, "dfll"); if (IS_ERR(priv->dfll_clk)) { @@ -110,8 +107,6 @@ static int tegra124_cpufreq_probe(struct platform_device *pdev) platform_set_drvdata(pdev, priv); - of_node_put(np); - return 0; out_put_pllp_clk: @@ -122,8 +117,6 @@ out_put_dfll_clk: clk_put(priv->dfll_clk); out_put_cpu_clk: clk_put(priv->cpu_clk); -out_put_np: - of_node_put(np); return ret; } From cf7de25878a1f4508c69dc9f6819c21ba177dbfe Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Mon, 8 Apr 2024 12:35:36 +0300 Subject: [PATCH 27/68] cppc_cpufreq: Fix possible null pointer dereference cppc_cpufreq_get_rate() and hisi_cppc_cpufreq_get_rate() can be called from different places with various parameters. 
So cpufreq_cpu_get() can return null as 'policy' in some circumstances. Fix this bug by adding null return check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list") Signed-off-by: Aleksandr Mishin Signed-off-by: Viresh Kumar --- drivers/cpufreq/cppc_cpufreq.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 64420d9cfd1e..15f1d41920a3 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -741,10 +741,15 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) { struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0}; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_cpudata *cpu_data; u64 delivered_perf; int ret; + if (!policy) + return -ENODEV; + + cpu_data = policy->driver_data; + cpufreq_cpu_put(policy); ret = cppc_get_perf_ctrs(cpu, &fb_ctrs_t0); @@ -822,10 +827,15 @@ static struct cpufreq_driver cppc_cpufreq_driver = { static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_cpudata *cpu_data; u64 desired_perf; int ret; + if (!policy) + return -ENODEV; + + cpu_data = policy->driver_data; + cpufreq_cpu_put(policy); ret = cppc_get_desired_perf(cpu, &desired_perf); From 9cf3415ade2d7598d78d2ce6d35d6d6d06132201 Mon Sep 17 00:00:00 2001 From: Martin Botka Date: Thu, 18 Apr 2024 16:44:01 +0100 Subject: [PATCH 28/68] firmware: smccc: Export revision soc_id function The "SoC ID revision" as provided via the SMCCC SOCID interface can be valuable information for drivers, when certain functionality depends on a die revision, for instance. 
One example is the sun50i-cpufreq-nvmem driver, which needs this information to determine the speed bin of the SoC. Export the arm_smccc_get_soc_id_revision() function so that it can be called by any driver. Signed-off-by: Martin Botka Signed-off-by: Andre Przywara Acked-by: Sudeep Holla Signed-off-by: Viresh Kumar --- drivers/firmware/smccc/smccc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c index db818f9dcb8e..d670635914ec 100644 --- a/drivers/firmware/smccc/smccc.c +++ b/drivers/firmware/smccc/smccc.c @@ -69,6 +69,7 @@ s32 arm_smccc_get_soc_id_revision(void) { return smccc_soc_id_revision; } +EXPORT_SYMBOL_GPL(arm_smccc_get_soc_id_revision); static int __init smccc_devices_init(void) { From 6ae07744cf334b750762ba881492c0cfba524b38 Mon Sep 17 00:00:00 2001 From: Martin Botka Date: Thu, 18 Apr 2024 16:44:02 +0100 Subject: [PATCH 29/68] cpufreq: dt-platdev: Blocklist Allwinner H616/618 SoCs The AllWinner H616 SoC will use the (extended) H6 OPP driver, so add them to the cpufreq-dt blocklist, to not create the device twice. This also affects the closely related sibling SoCs H618 and H700. 
Signed-off-by: Martin Botka Signed-off-by: Andre Przywara Reviewed-by: Jernej Skrabec Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index b993a498084b..86d8baa81679 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -104,6 +104,9 @@ static const struct of_device_id allowlist[] __initconst = { */ static const struct of_device_id blocklist[] __initconst = { { .compatible = "allwinner,sun50i-h6", }, + { .compatible = "allwinner,sun50i-h616", }, + { .compatible = "allwinner,sun50i-h618", }, + { .compatible = "allwinner,sun50i-h700", }, { .compatible = "apple,arm-platform", }, From 83d4e044310a7d26d1a5443c23451d4b9da9ada3 Mon Sep 17 00:00:00 2001 From: Martin Botka Date: Thu, 18 Apr 2024 16:44:03 +0100 Subject: [PATCH 30/68] dt-bindings: opp: Describe H616 OPPs and opp-supported-hw Compared to the existing Allwinner H6 OPP scheme, the H616 uses a similar NVMEM based mechanism to determine the silicon revision, which is required to select the right frequency / voltage pair for the OPPs. However it limits the maximum frequency for some speed bins, also seems to not support all frequencies in all speed bins, which requires us to introduce the opp-supported-hw property. Add this property to the list of allowed properties, also drop the requirement for the revision specific opp-microvolt properties, since they might not be needed if using opp-supported-hw. Also use the opportunity to adjust some wording, and drop a sentence referring to the Linux driver and the OPP subsystem. Shorten the existing example and add another example, showcasing the opp-supported-hw property.
Signed-off-by: Martin Botka Signed-off-by: Andre Przywara Reviewed-by: Rob Herring Signed-off-by: Viresh Kumar --- .../allwinner,sun50i-h6-operating-points.yaml | 93 +++++++++---------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/Documentation/devicetree/bindings/opp/allwinner,sun50i-h6-operating-points.yaml b/Documentation/devicetree/bindings/opp/allwinner,sun50i-h6-operating-points.yaml index 51f62c3ae194..ec5e424bb3c8 100644 --- a/Documentation/devicetree/bindings/opp/allwinner,sun50i-h6-operating-points.yaml +++ b/Documentation/devicetree/bindings/opp/allwinner,sun50i-h6-operating-points.yaml @@ -13,25 +13,25 @@ maintainers: description: | For some SoCs, the CPU frequency subset and voltage value of each OPP varies based on the silicon variant in use. Allwinner Process - Voltage Scaling Tables defines the voltage and frequency value based - on the speedbin blown in the efuse combination. The - sun50i-cpufreq-nvmem driver reads the efuse value from the SoC to - provide the OPP framework with required information. + Voltage Scaling Tables define the voltage and frequency values based + on the speedbin blown in the efuse combination. allOf: - $ref: opp-v2-base.yaml# properties: compatible: - const: allwinner,sun50i-h6-operating-points + enum: + - allwinner,sun50i-h6-operating-points + - allwinner,sun50i-h616-operating-points nvmem-cells: description: | A phandle pointing to a nvmem-cells node representing the efuse - registers that has information about the speedbin that is used + register that has information about the speedbin that is used to select the right frequency/voltage value pair. Please refer - the for nvmem-cells bindings - Documentation/devicetree/bindings/nvmem/nvmem.txt and also + to the nvmem-cells bindings in + Documentation/devicetree/bindings/nvmem/nvmem.yaml and also the examples below. 
opp-shared: true @@ -47,15 +47,18 @@ patternProperties: properties: opp-hz: true clock-latency-ns: true + opp-microvolt: true + opp-supported-hw: + maxItems: 1 + description: + A single 32 bit bitmap value, representing compatible HW, one + bit per speed bin index. patternProperties: "^opp-microvolt-speed[0-9]$": true required: - opp-hz - - opp-microvolt-speed0 - - opp-microvolt-speed1 - - opp-microvolt-speed2 unevaluatedProperties: false @@ -77,33 +80,6 @@ examples: opp-microvolt-speed2 = <800000>; }; - opp-720000000 { - clock-latency-ns = <244144>; /* 8 32k periods */ - opp-hz = /bits/ 64 <720000000>; - - opp-microvolt-speed0 = <880000>; - opp-microvolt-speed1 = <820000>; - opp-microvolt-speed2 = <800000>; - }; - - opp-816000000 { - clock-latency-ns = <244144>; /* 8 32k periods */ - opp-hz = /bits/ 64 <816000000>; - - opp-microvolt-speed0 = <880000>; - opp-microvolt-speed1 = <820000>; - opp-microvolt-speed2 = <800000>; - }; - - opp-888000000 { - clock-latency-ns = <244144>; /* 8 32k periods */ - opp-hz = /bits/ 64 <888000000>; - - opp-microvolt-speed0 = <940000>; - opp-microvolt-speed1 = <820000>; - opp-microvolt-speed2 = <800000>; - }; - opp-1080000000 { clock-latency-ns = <244144>; /* 8 32k periods */ opp-hz = /bits/ 64 <1080000000>; @@ -113,15 +89,6 @@ examples: opp-microvolt-speed2 = <840000>; }; - opp-1320000000 { - clock-latency-ns = <244144>; /* 8 32k periods */ - opp-hz = /bits/ 64 <1320000000>; - - opp-microvolt-speed0 = <1160000>; - opp-microvolt-speed1 = <940000>; - opp-microvolt-speed2 = <900000>; - }; - opp-1488000000 { clock-latency-ns = <244144>; /* 8 32k periods */ opp-hz = /bits/ 64 <1488000000>; @@ -132,4 +99,36 @@ examples: }; }; + - | + opp-table { + compatible = "allwinner,sun50i-h616-operating-points"; + nvmem-cells = <&speedbin_efuse>; + opp-shared; + + opp-480000000 { + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-hz = /bits/ 64 <480000000>; + + opp-microvolt = <900000>; + opp-supported-hw = <0x1f>; + }; + + opp-792000000 { + 
clock-latency-ns = <244144>; /* 8 32k periods */ + opp-hz = /bits/ 64 <792000000>; + + opp-microvolt-speed1 = <900000>; + opp-microvolt-speed4 = <940000>; + opp-supported-hw = <0x12>; + }; + + opp-1512000000 { + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-hz = /bits/ 64 <1512000000>; + + opp-microvolt = <1100000>; + opp-supported-hw = <0x0a>; + }; + }; + ... From 6cc4bcceff9af0e6be9738096d95e4ba75e75123 Mon Sep 17 00:00:00 2001 From: Brandon Cheo Fusi Date: Thu, 18 Apr 2024 16:44:04 +0100 Subject: [PATCH 31/68] cpufreq: sun50i: Refactor speed bin decoding Make converting the speed bin value into a speed grade generic and determined by a platform specific callback. Also change the prototypes involved to encode the speed bin directly in the return value. This allows to extend the driver more easily to support more SoCs. Signed-off-by: Brandon Cheo Fusi [Andre: merge output into return value] Signed-off-by: Andre Przywara Reviewed-by: Jernej Skrabec Signed-off-by: Viresh Kumar --- drivers/cpufreq/sun50i-cpufreq-nvmem.c | 74 +++++++++++++++++--------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c index 32a9c88f8ff6..45c56e23346e 100644 --- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c +++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c @@ -25,19 +25,52 @@ static struct platform_device *cpufreq_dt_pdev, *sun50i_cpufreq_pdev; +struct sunxi_cpufreq_data { + u32 (*efuse_xlate)(u32 speedbin); +}; + +static u32 sun50i_h6_efuse_xlate(u32 speedbin) +{ + u32 efuse_value; + + efuse_value = (speedbin >> NVMEM_SHIFT) & NVMEM_MASK; + + /* + * We treat unexpected efuse values as if the SoC was from + * the slowest bin. Expected efuse values are 1-3, slowest + * to fastest. 
+ */ + if (efuse_value >= 1 && efuse_value <= 3) + return efuse_value - 1; + else + return 0; +} + +static struct sunxi_cpufreq_data sun50i_h6_cpufreq_data = { + .efuse_xlate = sun50i_h6_efuse_xlate, +}; + +static const struct of_device_id cpu_opp_match_list[] = { + { .compatible = "allwinner,sun50i-h6-operating-points", + .data = &sun50i_h6_cpufreq_data, + }, + {} +}; + /** * sun50i_cpufreq_get_efuse() - Determine speed grade from efuse value - * @versions: Set to the value parsed from efuse * - * Returns 0 if success. + * Returns non-negative speed bin index on success, a negative error + * value otherwise. */ -static int sun50i_cpufreq_get_efuse(u32 *versions) +static int sun50i_cpufreq_get_efuse(void) { + const struct sunxi_cpufreq_data *opp_data; struct nvmem_cell *speedbin_nvmem; + const struct of_device_id *match; struct device_node *np; struct device *cpu_dev; - u32 *speedbin, efuse_value; - size_t len; + u32 *speedbin; int ret; cpu_dev = get_cpu_device(0); @@ -48,12 +81,12 @@ static int sun50i_cpufreq_get_efuse(u32 *versions) if (!np) return -ENOENT; - ret = of_device_is_compatible(np, - "allwinner,sun50i-h6-operating-points"); - if (!ret) { + match = of_match_node(cpu_opp_match_list, np); + if (!match) { of_node_put(np); return -ENOENT; } + opp_data = match->data; speedbin_nvmem = of_nvmem_cell_get(np, NULL); of_node_put(np); @@ -61,25 +94,16 @@ static int sun50i_cpufreq_get_efuse(u32 *versions) return dev_err_probe(cpu_dev, PTR_ERR(speedbin_nvmem), "Could not get nvmem cell\n"); - speedbin = nvmem_cell_read(speedbin_nvmem, &len); + speedbin = nvmem_cell_read(speedbin_nvmem, NULL); nvmem_cell_put(speedbin_nvmem); if (IS_ERR(speedbin)) return PTR_ERR(speedbin); - efuse_value = (*speedbin >> NVMEM_SHIFT) & NVMEM_MASK; - - /* - * We treat unexpected efuse values as if the SoC was from - * the slowest bin. Expected efuse values are 1-3, slowest - * to fastest. 
- */ - if (efuse_value >= 1 && efuse_value <= 3) - *versions = efuse_value - 1; - else - *versions = 0; + ret = opp_data->efuse_xlate(*speedbin); kfree(speedbin); - return 0; + + return ret; }; static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) @@ -87,7 +111,7 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) int *opp_tokens; char name[MAX_NAME_LEN]; unsigned int cpu; - u32 speed = 0; + int speed; int ret; opp_tokens = kcalloc(num_possible_cpus(), sizeof(*opp_tokens), @@ -95,10 +119,10 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) if (!opp_tokens) return -ENOMEM; - ret = sun50i_cpufreq_get_efuse(&speed); - if (ret) { + speed = sun50i_cpufreq_get_efuse(); + if (speed < 0) { kfree(opp_tokens); - return ret; + return speed; } snprintf(name, MAX_NAME_LEN, "speed%d", speed); From fa5aec9561cfc4f4370983ca5818c90227c9d90e Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 18 Apr 2024 16:44:05 +0100 Subject: [PATCH 32/68] cpufreq: sun50i: Add support for opp_supported_hw The opp_supported_hw DT property allows the DT to specify a mask of chip revisions that a certain OPP is eligible for. This allows for easy limiting of maximum frequencies, for instance. Add support for that in the sun50i-cpufreq-nvmem driver. We support both the existing opp-microvolt suffix properties as well as the opp-supported-hw property, the generic code figures out which is needed automatically. However if none of the DT OPP nodes contain an opp-supported-hw property, the core code will ignore all OPPs and the driver will fail probing. So check the DT's eligibility first before using that feature. 
Signed-off-by: Andre Przywara Reviewed-by: Jernej Skrabec Signed-off-by: Viresh Kumar --- drivers/cpufreq/sun50i-cpufreq-nvmem.c | 62 ++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c index 45c56e23346e..8719955278d9 100644 --- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c +++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c @@ -57,6 +57,41 @@ static const struct of_device_id cpu_opp_match_list[] = { {} }; +/** + * dt_has_supported_hw() - Check if any OPPs use opp-supported-hw + * + * If we ask the cpufreq framework to use the opp-supported-hw feature, it + * will ignore every OPP node without that DT property. If none of the OPPs + * have it, the driver will fail probing, due to the lack of OPPs. + * + * Returns true if we have at least one OPP with the opp-supported-hw property. + */ +static bool dt_has_supported_hw(void) +{ + bool has_opp_supported_hw = false; + struct device_node *np, *opp; + struct device *cpu_dev; + + cpu_dev = get_cpu_device(0); + if (!cpu_dev) + return -ENODEV; + + np = dev_pm_opp_of_get_opp_desc_node(cpu_dev); + if (!np) + return -ENOENT; + + for_each_child_of_node(np, opp) { + if (of_find_property(opp, "opp-supported-hw", NULL)) { + has_opp_supported_hw = true; + break; + } + } + + of_node_put(np); + + return has_opp_supported_hw; +} + /** * sun50i_cpufreq_get_efuse() - Determine speed grade from efuse value * @@ -110,7 +145,8 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) { int *opp_tokens; char name[MAX_NAME_LEN]; - unsigned int cpu; + unsigned int cpu, supported_hw; + struct dev_pm_opp_config config = {}; int speed; int ret; @@ -125,7 +161,18 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) return speed; } + /* + * We need at least one OPP with the "opp-supported-hw" property, + * or else the upper layers will ignore every OPP and will bail out. 
+ */ + if (dt_has_supported_hw()) { + supported_hw = 1U << speed; + config.supported_hw = &supported_hw; + config.supported_hw_count = 1; + } + snprintf(name, MAX_NAME_LEN, "speed%d", speed); + config.prop_name = name; for_each_possible_cpu(cpu) { struct device *cpu_dev = get_cpu_device(cpu); @@ -135,12 +182,11 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) goto free_opp; } - opp_tokens[cpu] = dev_pm_opp_set_prop_name(cpu_dev, name); - if (opp_tokens[cpu] < 0) { - ret = opp_tokens[cpu]; - pr_err("Failed to set prop name\n"); + ret = dev_pm_opp_set_config(cpu_dev, &config); + if (ret < 0) goto free_opp; - } + + opp_tokens[cpu] = ret; } cpufreq_dt_pdev = platform_device_register_simple("cpufreq-dt", -1, @@ -155,7 +201,7 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) free_opp: for_each_possible_cpu(cpu) - dev_pm_opp_put_prop_name(opp_tokens[cpu]); + dev_pm_opp_clear_config(opp_tokens[cpu]); kfree(opp_tokens); return ret; @@ -169,7 +215,7 @@ static void sun50i_cpufreq_nvmem_remove(struct platform_device *pdev) platform_device_unregister(cpufreq_dt_pdev); for_each_possible_cpu(cpu) - dev_pm_opp_put_prop_name(opp_tokens[cpu]); + dev_pm_opp_clear_config(opp_tokens[cpu]); kfree(opp_tokens); } From e2e2dcd2e944fe6167cb731864f8a1343f1bbee7 Mon Sep 17 00:00:00 2001 From: Martin Botka Date: Thu, 18 Apr 2024 16:44:06 +0100 Subject: [PATCH 33/68] cpufreq: sun50i: Add H616 support The Allwinner H616/H618 SoCs have different OPP tables per SoC version and die revision. The SoC version is stored in NVMEM, as before, though encoded differently. The die revision is in a different register, in the SRAM controller. Firmware already exports that value in a standardised way, through the SMCCC SoCID mechanism. We need both values, as some chips have the same SoC version, but they don't support the same frequencies and they get differentiated by the die revision. Add the new compatible string and tie the new translation function to it. 
This mechanism not only covers the original H616 SoC, but also its very close sibling SoCs H618 and H700, so add them to the list as well. Signed-off-by: Martin Botka Signed-off-by: Andre Przywara Signed-off-by: Viresh Kumar --- drivers/cpufreq/sun50i-cpufreq-nvmem.c | 67 ++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c index 8719955278d9..30e5c337611c 100644 --- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c +++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -46,14 +47,77 @@ static u32 sun50i_h6_efuse_xlate(u32 speedbin) return 0; } +static int get_soc_id_revision(void) +{ +#ifdef CONFIG_HAVE_ARM_SMCCC_DISCOVERY + return arm_smccc_get_soc_id_revision(); +#else + return SMCCC_RET_NOT_SUPPORTED; +#endif +} + +/* + * Judging by the OPP tables in the vendor BSP, the quality order of the + * returned speedbin index is 4 -> 0/2 -> 3 -> 1, from worst to best. + * 0 and 2 seem identical from the OPP tables' point of view. 
+ */ +static u32 sun50i_h616_efuse_xlate(u32 speedbin) +{ + int ver_bits = get_soc_id_revision(); + u32 value = 0; + + switch (speedbin & 0xffff) { + case 0x2000: + value = 0; + break; + case 0x2400: + case 0x7400: + case 0x2c00: + case 0x7c00: + if (ver_bits != SMCCC_RET_NOT_SUPPORTED && ver_bits <= 1) { + /* ic version A/B */ + value = 1; + } else { + /* ic version C and later version */ + value = 2; + } + break; + case 0x5000: + case 0x5400: + case 0x6000: + value = 3; + break; + case 0x5c00: + value = 4; + break; + case 0x5d00: + value = 0; + break; + default: + pr_warn("sun50i-cpufreq-nvmem: unknown speed bin 0x%x, using default bin 0\n", + speedbin & 0xffff); + value = 0; + break; + } + + return value; +} + static struct sunxi_cpufreq_data sun50i_h6_cpufreq_data = { .efuse_xlate = sun50i_h6_efuse_xlate, }; +static struct sunxi_cpufreq_data sun50i_h616_cpufreq_data = { + .efuse_xlate = sun50i_h616_efuse_xlate, +}; + static const struct of_device_id cpu_opp_match_list[] = { { .compatible = "allwinner,sun50i-h6-operating-points", .data = &sun50i_h6_cpufreq_data, }, + { .compatible = "allwinner,sun50i-h616-operating-points", + .data = &sun50i_h616_cpufreq_data, + }, {} }; @@ -230,6 +294,9 @@ static struct platform_driver sun50i_cpufreq_driver = { static const struct of_device_id sun50i_cpufreq_match_list[] = { { .compatible = "allwinner,sun50i-h6" }, + { .compatible = "allwinner,sun50i-h616" }, + { .compatible = "allwinner,sun50i-h618" }, + { .compatible = "allwinner,sun50i-h700" }, {} }; MODULE_DEVICE_TABLE(of, sun50i_cpufreq_match_list); From 3e057e05b3b281bcc29db573eb51f87ee6b5afc0 Mon Sep 17 00:00:00 2001 From: Martin Botka Date: Thu, 18 Apr 2024 16:44:07 +0100 Subject: [PATCH 34/68] arm64: dts: allwinner: h616: Add CPU OPPs table Add an Operating Performance Points table for the CPU cores to enable Dynamic Voltage & Frequency Scaling (DVFS) on the H616. The values were taken from the BSP sources. 
There is a separate OPP set seen on some H700 devices, but they didn't really work out in testing, so they are not included for now. Also add the needed cpu_speed_grade nvmem cell and the cooling cells properties, to enable passive cooling. Signed-off-by: Martin Botka [Andre: rework to minimise opp-microvolt properties] Signed-off-by: Andre Przywara Acked-by: Jernej Skrabec Signed-off-by: Viresh Kumar --- .../dts/allwinner/sun50i-h616-cpu-opp.dtsi | 115 ++++++++++++++++++ .../arm64/boot/dts/allwinner/sun50i-h616.dtsi | 8 ++ 2 files changed, 123 insertions(+) create mode 100644 arch/arm64/boot/dts/allwinner/sun50i-h616-cpu-opp.dtsi diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-cpu-opp.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616-cpu-opp.dtsi new file mode 100644 index 000000000000..aca22a7f0191 --- /dev/null +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-cpu-opp.dtsi @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +// Copyright (C) 2023 Martin Botka + +/ { + cpu_opp_table: opp-table-cpu { + compatible = "allwinner,sun50i-h616-operating-points"; + nvmem-cells = <&cpu_speed_grade>; + opp-shared; + + opp-480000000 { + opp-hz = /bits/ 64 <480000000>; + opp-microvolt = <900000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x1f>; + }; + + opp-600000000 { + opp-hz = /bits/ 64 <600000000>; + opp-microvolt = <900000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x12>; + }; + + opp-720000000 { + opp-hz = /bits/ 64 <720000000>; + opp-microvolt = <900000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0d>; + }; + + opp-792000000 { + opp-hz = /bits/ 64 <792000000>; + opp-microvolt-speed1 = <900000>; + opp-microvolt-speed4 = <940000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x12>; + }; + + opp-936000000 { + opp-hz = /bits/ 64 <936000000>; + opp-microvolt = <900000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + 
opp-supported-hw = <0x0d>; + }; + + opp-1008000000 { + opp-hz = /bits/ 64 <1008000000>; + opp-microvolt-speed0 = <950000>; + opp-microvolt-speed1 = <940000>; + opp-microvolt-speed2 = <950000>; + opp-microvolt-speed3 = <950000>; + opp-microvolt-speed4 = <1020000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x1f>; + }; + + opp-1104000000 { + opp-hz = /bits/ 64 <1104000000>; + opp-microvolt-speed0 = <1000000>; + opp-microvolt-speed2 = <1000000>; + opp-microvolt-speed3 = <1000000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0d>; + }; + + opp-1200000000 { + opp-hz = /bits/ 64 <1200000000>; + opp-microvolt-speed0 = <1050000>; + opp-microvolt-speed1 = <1020000>; + opp-microvolt-speed2 = <1050000>; + opp-microvolt-speed3 = <1050000>; + opp-microvolt-speed4 = <1100000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x1f>; + }; + + opp-1320000000 { + opp-hz = /bits/ 64 <1320000000>; + opp-microvolt = <1100000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x1d>; + }; + + opp-1416000000 { + opp-hz = /bits/ 64 <1416000000>; + opp-microvolt = <1100000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0d>; + }; + + opp-1512000000 { + opp-hz = /bits/ 64 <1512000000>; + opp-microvolt-speed1 = <1100000>; + opp-microvolt-speed3 = <1100000>; + clock-latency-ns = <244144>; /* 8 32k periods */ + opp-supported-hw = <0x0a>; + }; + }; +}; + +&cpu0 { + operating-points-v2 = <&cpu_opp_table>; +}; + +&cpu1 { + operating-points-v2 = <&cpu_opp_table>; +}; + +&cpu2 { + operating-points-v2 = <&cpu_opp_table>; +}; + +&cpu3 { + operating-points-v2 = <&cpu_opp_table>; +}; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi index b2e85e52d1a1..c0fa466fa9f0 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi @@ -26,6 +26,7 @@ reg = <0>; 
enable-method = "psci"; clocks = <&ccu CLK_CPUX>; + #cooling-cells = <2>; }; cpu1: cpu@1 { @@ -34,6 +35,7 @@ reg = <1>; enable-method = "psci"; clocks = <&ccu CLK_CPUX>; + #cooling-cells = <2>; }; cpu2: cpu@2 { @@ -42,6 +44,7 @@ reg = <2>; enable-method = "psci"; clocks = <&ccu CLK_CPUX>; + #cooling-cells = <2>; }; cpu3: cpu@3 { @@ -50,6 +53,7 @@ reg = <3>; enable-method = "psci"; clocks = <&ccu CLK_CPUX>; + #cooling-cells = <2>; }; }; @@ -156,6 +160,10 @@ ths_calibration: thermal-sensor-calibration@14 { reg = <0x14 0x8>; }; + + cpu_speed_grade: cpu-speed-grade@0 { + reg = <0x0 2>; + }; }; watchdog: watchdog@30090a0 { From 09d0aaa0ae9c80ff9569393b206226c1008801b1 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 18 Apr 2024 16:44:08 +0100 Subject: [PATCH 35/68] arm64: dts: allwinner: h616: enable DVFS for all boards With the DT bindings now describing the format of the CPU OPP tables, we can include the OPP table in each board's .dts file, and specify the CPU power supply. This allows to enable DVFS, and get up to 50% of performance benefit in the highest OPP, or up to 60% power savings in the lowest OPP, compared to the fixed 1GHz @ 1.0V OPP we are running in by default at the moment. 
Signed-off-by: Andre Przywara Acked-by: Jernej Skrabec Signed-off-by: Viresh Kumar --- .../boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi | 5 +++++ arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts | 5 +++++ arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts | 5 +++++ .../boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi | 5 +++++ .../arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts | 5 +++++ arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts | 5 +++++ .../boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts | 5 +++++ 7 files changed, 35 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi index af421ba24ce0..d12b01c5f41b 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-bigtreetech-cb1.dtsi @@ -6,6 +6,7 @@ /dts-v1/; #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" #include #include @@ -62,6 +63,10 @@ }; }; +&cpu0 { + cpu-supply = <&reg_dcdc2>; +}; + &mmc0 { vmmc-supply = <&reg_dldo1>; /* Card detection pin is not connected */ diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts index b5d713926a34..a360d8567f95 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-orangepi-zero2.dts @@ -6,12 +6,17 @@ /dts-v1/; #include "sun50i-h616-orangepi-zero.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" / { model = "OrangePi Zero2"; compatible = "xunlong,orangepi-zero2", "allwinner,sun50i-h616"; }; +&cpu0 { + cpu-supply = <&reg_dcdca>; +}; + &emac0 { allwinner,rx-delay-ps = <3100>; allwinner,tx-delay-ps = <700>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts b/arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts index 959b6fd18483..26d25b5b59e0 100644 ---
a/arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616-x96-mate.dts @@ -6,6 +6,7 @@ /dts-v1/; #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" #include #include @@ -32,6 +33,10 @@ }; }; +&cpu0 { + cpu-supply = <&reg_dcdca>; +}; + &ehci0 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi index 8c1263a3939e..e92d150aaf1c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-longan-module-3h.dtsi @@ -4,6 +4,11 @@ */ #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" + +&cpu0 { + cpu-supply = <&reg_dcdc2>; +}; &mmc2 { pinctrl-names = "default"; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts b/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts index 21ca1977055d..6a4f0da97233 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero2w.dts @@ -6,6 +6,7 @@ /dts-v1/; #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" #include #include @@ -53,6 +54,10 @@ }; }; +&cpu0 { + cpu-supply = <&reg_dcdc2>; +}; + &ehci1 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts b/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts index b3b1b8692125..e1cd7572a14c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-orangepi-zero3.dts @@ -6,12 +6,17 @@ /dts-v1/; #include "sun50i-h616-orangepi-zero.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" / { model = "OrangePi Zero3"; compatible = "xunlong,orangepi-zero3", "allwinner,sun50i-h618"; }; +&cpu0 { + cpu-supply = <&reg_dcdc2>; +}; + &emac0 { allwinner,tx-delay-ps = <700>; phy-mode = "rgmii-rxid"; diff --git
a/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts b/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts index ac0a2b7ea6f3..a6458b7a8671 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h618-transpeed-8k618-t.dts @@ -6,6 +6,7 @@ /dts-v1/; #include "sun50i-h616.dtsi" +#include "sun50i-h616-cpu-opp.dtsi" #include #include @@ -51,6 +52,10 @@ }; }; +&cpu0 { + cpu-supply = <&reg_dcdc2>; +}; + &ehci0 { status = "okay"; }; From d2059d3b548409905b20b4f52495bffbd7c8da8b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 22 Apr 2024 08:58:51 +0530 Subject: [PATCH 36/68] cpufreq: sun50i: Fix build warning around snprint() The Sun50i driver generates a warning with W=1: warning: '%d' directive output may be truncated writing between 1 and 10 bytes into a region of size 2 [-Wformat-truncation=] Fix it by allocating a big enough array to print an integer. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404191715.LDwMm2gP-lkp@intel.com/ Signed-off-by: Viresh Kumar Acked-by: Chen-Yu Tsai Reviewed-by: Andre Przywara Tested-by: Andre Przywara Reviewed-by: Julian Calaby --- drivers/cpufreq/sun50i-cpufreq-nvmem.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c index 30e5c337611c..cd50cea16a87 100644 --- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c +++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c @@ -19,8 +19,6 @@ #include #include -#define MAX_NAME_LEN 7 - #define NVMEM_MASK 0x7 #define NVMEM_SHIFT 5 @@ -208,7 +206,7 @@ static int sun50i_cpufreq_get_efuse(void) static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) { int *opp_tokens; - char name[MAX_NAME_LEN]; + char name[] = "speedXXXXXXXXXXX"; /* Integers can take 11 chars max */ unsigned int cpu, supported_hw; struct dev_pm_opp_config config = {}; int speed; @@ -235,7 +233,7 @@ static int
sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) config.supported_hw_count = 1; } - snprintf(name, MAX_NAME_LEN, "speed%d", speed); + snprintf(name, sizeof(name), "speed%d", speed); config.prop_name = name; for_each_possible_cpu(cpu) { From b69ec356db1a7c4703b1a9edc82ee1dfdd296b97 Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Fri, 19 Apr 2024 17:59:07 +0100 Subject: [PATCH 37/68] cpufreq: mediatek: Add support for MT7988A This add cpufreq support for mediatek MT7988A SoC. The platform data of MT7988A is different from previous MediaTek SoCs, so we add a new compatible and platform data for it. Signed-off-by: Sam Shih Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Viresh Kumar --- drivers/cpufreq/mediatek-cpufreq.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index a0a61919bc4c..518606adf14e 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ b/drivers/cpufreq/mediatek-cpufreq.c @@ -707,6 +707,15 @@ static const struct mtk_cpufreq_platform_data mt7623_platform_data = { .ccifreq_supported = false, }; +static const struct mtk_cpufreq_platform_data mt7988_platform_data = { + .min_volt_shift = 100000, + .max_volt_shift = 200000, + .proc_max_volt = 900000, + .sram_min_volt = 0, + .sram_max_volt = 1150000, + .ccifreq_supported = true, +}; + static const struct mtk_cpufreq_platform_data mt8183_platform_data = { .min_volt_shift = 100000, .max_volt_shift = 200000, @@ -740,6 +749,7 @@ static const struct of_device_id mtk_cpufreq_machines[] __initconst = { { .compatible = "mediatek,mt2712", .data = &mt2701_platform_data }, { .compatible = "mediatek,mt7622", .data = &mt7622_platform_data }, { .compatible = "mediatek,mt7623", .data = &mt7623_platform_data }, + { .compatible = "mediatek,mt7988a", .data = &mt7988_platform_data }, { .compatible = "mediatek,mt8167", .data = &mt8516_platform_data }, { .compatible = "mediatek,mt817x", .data = &mt2701_platform_data }, { .compatible 
= "mediatek,mt8173", .data = &mt2701_platform_data }, From d769eaef2a8d668035e34a19e3282b4222d6e782 Mon Sep 17 00:00:00 2001 From: Shivani Gupta Date: Tue, 23 Apr 2024 02:07:27 +0000 Subject: [PATCH 38/68] cpufreq: ti: Implement scope-based cleanup in ti_cpufreq_match_node() Modify the ti_cpufreq_match_node() function to utilize the __free() cleanup handler for automatically releasing the device node when it goes out of scope. By moving the declaration to the initialization, the patch ensures that the device node is properly managed throughout the function's scope, thus eliminating the need for manual invocation of of_node_put(). This approach reduces the potential for memory leaks. Suggested-by: Julia Lawall Signed-off-by: Shivani Gupta Signed-off-by: Viresh Kumar --- drivers/cpufreq/ti-cpufreq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c index 46c41e2ca727..714ed53753fa 100644 --- a/drivers/cpufreq/ti-cpufreq.c +++ b/drivers/cpufreq/ti-cpufreq.c @@ -347,12 +347,10 @@ static const struct of_device_id ti_cpufreq_of_match[] = { static const struct of_device_id *ti_cpufreq_match_node(void) { - struct device_node *np; + struct device_node *np __free(device_node) = of_find_node_by_path("/"); const struct of_device_id *match; - np = of_find_node_by_path("/"); match = of_match_node(ti_cpufreq_of_match, np); - of_node_put(np); return match; } From f9059eb5d73e65c88b88465abed4364dfc7b20b4 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 12 Jul 2023 17:40:13 +0800 Subject: [PATCH 39/68] cpuidle: kirkwood: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. 
However the value returned is (mostly) ignored and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new() which already returns void. Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Cc: Uwe Kleine-König Signed-off-by: Yangtao Li Reviewed-by: Uwe Kleine-König Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20230712094014.41787-1-frank.li@vivo.com --- drivers/cpuidle/cpuidle-kirkwood.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cpuidle/cpuidle-kirkwood.c b/drivers/cpuidle/cpuidle-kirkwood.c index 13bf743f885b..602c4dfdd7e2 100644 --- a/drivers/cpuidle/cpuidle-kirkwood.c +++ b/drivers/cpuidle/cpuidle-kirkwood.c @@ -59,15 +59,14 @@ static int kirkwood_cpuidle_probe(struct platform_device *pdev) return cpuidle_register(&kirkwood_idle_driver, NULL); } -static int kirkwood_cpuidle_remove(struct platform_device *pdev) +static void kirkwood_cpuidle_remove(struct platform_device *pdev) { cpuidle_unregister(&kirkwood_idle_driver); - return 0; } static struct platform_driver kirkwood_cpuidle_driver = { .probe = kirkwood_cpuidle_probe, - .remove = kirkwood_cpuidle_remove, + .remove_new = kirkwood_cpuidle_remove, .driver = { .name = "kirkwood_cpuidle", }, From 68090fdaac8a3d4bbc681d562de2ef5160976559 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Tue, 23 Apr 2024 10:27:44 +0200 Subject: [PATCH 40/68] cpufreq: dt: eliminate uses of of_node_put() Make use of the __free() cleanup handler to automatically free nodes when they get out of scope. Only find_supply_name() is affected, and the new mechanism removes the need for a 'goto' and the 'name' local variable. 
Signed-off-by: Javier Carrasco Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 2d83bbc65dd0..907e22632fda 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -68,12 +68,9 @@ static int set_target(struct cpufreq_policy *policy, unsigned int index) */ static const char *find_supply_name(struct device *dev) { - struct device_node *np; + struct device_node *np __free(device_node) = of_node_get(dev->of_node); struct property *pp; int cpu = dev->id; - const char *name = NULL; - - np = of_node_get(dev->of_node); /* This must be valid for sure */ if (WARN_ON(!np)) @@ -82,22 +79,16 @@ static const char *find_supply_name(struct device *dev) /* Try "cpu0" for older DTs */ if (!cpu) { pp = of_find_property(np, "cpu0-supply", NULL); - if (pp) { - name = "cpu0"; - goto node_put; - } + if (pp) + return "cpu0"; } pp = of_find_property(np, "cpu-supply", NULL); - if (pp) { - name = "cpu"; - goto node_put; - } + if (pp) + return "cpu"; dev_dbg(dev, "no regulator for cpu%d\n", cpu); -node_put: - of_node_put(np); - return name; + return NULL; } static int cpufreq_init(struct cpufreq_policy *policy) From 39eaf5212441b7330e6e5fe50e3a0e7f8470b4ab Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Tue, 23 Apr 2024 10:27:45 +0200 Subject: [PATCH 41/68] cpufreq: dt-platdev: eliminate uses of of_node_put() Make use of the __free() cleanup handler to automatically free nodes when they get out of scope. 
Signed-off-by: Javier Carrasco Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 86d8baa81679..c74dd1e01e0d 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -198,19 +198,18 @@ static const struct of_device_id blocklist[] __initconst = { static bool __init cpu0_node_has_opp_v2_prop(void) { - struct device_node *np = of_cpu_device_node_get(0); + struct device_node *np __free(device_node) = of_cpu_device_node_get(0); bool ret = false; if (of_property_present(np, "operating-points-v2")) ret = true; - of_node_put(np); return ret; } static int __init cpufreq_dt_platdev_init(void) { - struct device_node *np = of_find_node_by_path("/"); + struct device_node *np __free(device_node) = of_find_node_by_path("/"); const struct of_device_id *match; const void *data = NULL; @@ -226,11 +225,9 @@ static int __init cpufreq_dt_platdev_init(void) if (cpu0_node_has_opp_v2_prop() && !of_match_node(blocklist, np)) goto create_pdev; - of_node_put(np); return -ENODEV; create_pdev: - of_node_put(np); return PTR_ERR_OR_ZERO(platform_device_register_data(NULL, "cpufreq-dt", -1, data, sizeof(struct cpufreq_dt_platform_data))); From fa7bd98f3c8b33fb68c6b2bc69cff32b63db69f8 Mon Sep 17 00:00:00 2001 From: Portia Stephens Date: Wed, 24 Apr 2024 15:02:20 +1000 Subject: [PATCH 42/68] cpufreq: brcmstb-avs-cpufreq: ISO C90 forbids mixed declarations There is a compile warning because a NULL pointer check was added before a struct was declared. This moves the NULL pointer check to after the struct is declared and moves the struct assignment to after the NULL pointer check. 
Fixes: f661017e6d32 ("cpufreq: brcmstb-avs-cpufreq: add check for cpufreq_cpu_get's return value") Signed-off-by: Portia Stephens Acked-by: Florian Fainelli Signed-off-by: Viresh Kumar --- drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c index 1a1857b0a6f4..ea8438550b49 100644 --- a/drivers/cpufreq/brcmstb-avs-cpufreq.c +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -481,9 +481,12 @@ static bool brcm_avs_is_firmware_loaded(struct private_data *priv) static unsigned int brcm_avs_cpufreq_get(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct private_data *priv; + if (!policy) return 0; - struct private_data *priv = policy->driver_data; + + priv = policy->driver_data; cpufreq_cpu_put(policy); From 76a6fc5644b2a1c70868bec24a078f784600ef2a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Apr 2024 14:40:11 +0300 Subject: [PATCH 43/68] cpufreq: sun50i: fix error returns in dt_has_supported_hw() The dt_has_supported_hw() function returns type bool. That means these negative error codes are cast to true but the function should return false instead. 
Fixes: fa5aec9561cf ("cpufreq: sun50i: Add support for opp_supported_hw") Signed-off-by: Dan Carpenter Reviewed-by: Andre Przywara Reviewed-by: Jernej Skrabec Signed-off-by: Viresh Kumar --- drivers/cpufreq/sun50i-cpufreq-nvmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c index cd50cea16a87..0b882765cd66 100644 --- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c +++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c @@ -136,11 +136,11 @@ static bool dt_has_supported_hw(void) cpu_dev = get_cpu_device(0); if (!cpu_dev) - return -ENODEV; + return false; np = dev_pm_opp_of_get_opp_desc_node(cpu_dev); if (!np) - return -ENOENT; + return false; for_each_child_of_node(np, opp) { if (of_find_property(opp, "opp-supported-hw", NULL)) { From fde234239d161f958390e41d26cda2bb166f1994 Mon Sep 17 00:00:00 2001 From: Tengfei Fan Date: Wed, 24 Apr 2024 18:15:01 +0800 Subject: [PATCH 44/68] dt-bindings: cpufreq: cpufreq-qcom-hw: Add SM4450 compatibles Add compatible for EPSS CPUFREQ-HW on SM4450. 
Signed-off-by: Tengfei Fan Reviewed-by: Bjorn Andersson Acked-by: Rob Herring (Arm) Signed-off-by: Viresh Kumar --- Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml index 56fc71d6a081..1e9797f96410 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml @@ -38,6 +38,7 @@ properties: - qcom,sc7280-cpufreq-epss - qcom,sc8280xp-cpufreq-epss - qcom,sdx75-cpufreq-epss + - qcom,sm4450-cpufreq-epss - qcom,sm6375-cpufreq-epss - qcom,sm8250-cpufreq-epss - qcom,sm8350-cpufreq-epss @@ -133,6 +134,7 @@ allOf: - qcom,sc8280xp-cpufreq-epss - qcom,sdm670-cpufreq-hw - qcom,sdm845-cpufreq-hw + - qcom,sm4450-cpufreq-epss - qcom,sm6115-cpufreq-hw - qcom,sm6350-cpufreq-hw - qcom,sm6375-cpufreq-epss From b37ef7210e51b1e996ca03b03227d93f7470784b Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 25 Apr 2024 16:07:51 +0800 Subject: [PATCH 45/68] cpufreq: amd-pstate: Document *_limit_* fields in struct amd_cpudata The four fields of struct amd_cpudata, namely min_limit_perf, max_limit_perf, min_limit_freq and max_limit_freq, introduced in commit febab20caeba ("cpufreq/amd-pstate: Fix scaling_min_freq and scaling_max_freq update"), are currently undocumented. Add comments describing these fields. Acked-by: Huang Rui Fixes: febab20caeba("cpufreq/amd-pstate: Fix scaling_min_freq and scaling_max_freq update") Reviewed-by: Li Meng Tested-by: Dhananjay Ugwekar Signed-off-by: Gautham R. Shenoy Signed-off-by: Perry Yuan Signed-off-by: Rafael J. 
Wysocki --- include/linux/amd-pstate.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index d21838835abd..83fb3fc647fc 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -49,6 +49,10 @@ struct amd_aperf_mperf { * @lowest_perf: the absolute lowest performance level of the processor * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher * priority. + * @min_limit_perf: Cached value of the performance corresponding to policy->min + * @max_limit_perf: Cached value of the performance corresponding to policy->max + * @min_limit_freq: Cached value of policy->min + * @max_limit_freq: Cached value of policy->max * @max_freq: the frequency that mapped to highest_perf * @min_freq: the frequency that mapped to lowest_perf * @nominal_freq: the frequency that mapped to nominal_perf From 4fcfd1954ad305e331b6b4b62de2874fbae61394 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 25 Apr 2024 16:07:52 +0800 Subject: [PATCH 46/68] cpufreq: amd-pstate: Document the units for freq variables in amd_cpudata The min_limit_freq, max_limit_freq, min_freq, max_freq, nominal_freq and the lowest_nominal_freq members of struct cpudata store the frequency value in khz to be consistent with the cpufreq core. Update the comment to document this. Reviewed-by: Li Meng Tested-by: Dhananjay Ugwekar Signed-off-by: Gautham R. Shenoy Signed-off-by: Perry Yuan Acked-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- include/linux/amd-pstate.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 83fb3fc647fc..ec0b0fa3e9bb 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -51,15 +51,15 @@ struct amd_aperf_mperf { * priority. 
* @min_limit_perf: Cached value of the performance corresponding to policy->min * @max_limit_perf: Cached value of the performance corresponding to policy->max - * @min_limit_freq: Cached value of policy->min - * @max_limit_freq: Cached value of policy->max - * @max_freq: the frequency that mapped to highest_perf - * @min_freq: the frequency that mapped to lowest_perf - * @nominal_freq: the frequency that mapped to nominal_perf - * @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf + * @min_limit_freq: Cached value of policy->min (in khz) + * @max_limit_freq: Cached value of policy->max (in khz) + * @max_freq: the frequency (in khz) that mapped to highest_perf + * @min_freq: the frequency (in khz) that mapped to lowest_perf + * @nominal_freq: the frequency (in khz) that mapped to nominal_perf + * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf * @cur: Difference of Aperf/Mperf/tsc count between last and current sample * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value + * @freq: current cpu frequency value (in khz) * @boost_supported: check whether the Processor or SBIOS supports boost mode * @hw_prefcore: check whether HW supports preferred core featue. * Only when hw_prefcore and early prefcore param are true, From 5547c0ebfc2efdab6ee93a7fd4d9c411ad87013e Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 25 Apr 2024 16:07:53 +0800 Subject: [PATCH 47/68] cpufreq: amd-pstate: Unify computation of {max,min,nominal,lowest_nonlinear}_freq Currently the amd_get_{min, max, nominal, lowest_nonlinear}_freq() helpers compute the values of min_freq, max_freq, nominal_freq and lowest_nonlinear_freq respectively afresh from cppc_get_perf_caps(). This is not necessary as there are fields in cpudata to cache these values. To simplify this, add a single helper function named amd_pstate_init_freq() which computes all these frequencies at once, and caches them in cpudata. 
Use the cached values everywhere else in the code. Acked-by: Huang Rui Reviewed-by: Li Meng Tested-by: Dhananjay Ugwekar Co-developed-by: Gautham R. Shenoy Signed-off-by: Gautham R. Shenoy Signed-off-by: Perry Yuan Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 126 ++++++++++++++++------------------- 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2015c9fcc3c9..891fad6f90e1 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -606,74 +606,22 @@ static void amd_pstate_adjust_perf(unsigned int cpu, static int amd_get_min_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; - - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - - /* Switch to khz */ - return cppc_perf.lowest_freq * 1000; + return READ_ONCE(cpudata->min_freq); } static int amd_get_max_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; - u32 max_perf, max_freq, nominal_freq, nominal_perf; - u64 boost_ratio; - - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - - nominal_freq = cppc_perf.nominal_freq; - nominal_perf = READ_ONCE(cpudata->nominal_perf); - max_perf = READ_ONCE(cpudata->highest_perf); - - boost_ratio = div_u64(max_perf << SCHED_CAPACITY_SHIFT, - nominal_perf); - - max_freq = nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT; - - /* Switch to khz */ - return max_freq * 1000; + return READ_ONCE(cpudata->max_freq); } static int amd_get_nominal_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; - - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - - /* Switch to khz */ - return cppc_perf.nominal_freq * 1000; + return READ_ONCE(cpudata->nominal_freq); } static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; - u32 lowest_nonlinear_freq, lowest_nonlinear_perf, - 
nominal_freq, nominal_perf; - u64 lowest_nonlinear_ratio; - - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - - nominal_freq = cppc_perf.nominal_freq; - nominal_perf = READ_ONCE(cpudata->nominal_perf); - - lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf; - - lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, - nominal_perf); - - lowest_nonlinear_freq = nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT; - - /* Switch to khz */ - return lowest_nonlinear_freq * 1000; + return READ_ONCE(cpudata->lowest_nonlinear_freq); } static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) @@ -828,6 +776,53 @@ free_cpufreq_put: mutex_unlock(&amd_pstate_driver_lock); } +/** + * amd_pstate_init_freq: Initialize the max_freq, min_freq, + * nominal_freq and lowest_nonlinear_freq for + * the @cpudata object. + * + * Requires: highest_perf, lowest_perf, nominal_perf and + * lowest_nonlinear_perf members of @cpudata to be + * initialized. + * + * Returns 0 on success, non-zero value on failure. 
+ */ +static int amd_pstate_init_freq(struct amd_cpudata *cpudata) +{ + int ret; + u32 min_freq; + u32 highest_perf, max_freq; + u32 nominal_perf, nominal_freq; + u32 lowest_nonlinear_perf, lowest_nonlinear_freq; + u32 boost_ratio, lowest_nonlinear_ratio; + struct cppc_perf_caps cppc_perf; + + + ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + + min_freq = cppc_perf.lowest_freq * 1000; + nominal_freq = cppc_perf.nominal_freq; + nominal_perf = READ_ONCE(cpudata->nominal_perf); + + highest_perf = READ_ONCE(cpudata->highest_perf); + boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); + max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); + lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, + nominal_perf); + lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + + WRITE_ONCE(cpudata->min_freq, min_freq); + WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); + WRITE_ONCE(cpudata->nominal_freq, nominal_freq); + WRITE_ONCE(cpudata->max_freq, max_freq); + + return 0; +} + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; @@ -855,6 +850,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; + ret = amd_pstate_init_freq(cpudata); + if (ret) + goto free_cpudata1; + min_freq = amd_get_min_freq(cpudata); max_freq = amd_get_max_freq(cpudata); nominal_freq = amd_get_nominal_freq(cpudata); @@ -896,13 +895,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) goto free_cpudata2; } - /* Initial processor data capability frequencies */ - cpudata->max_freq = max_freq; - cpudata->min_freq = min_freq; cpudata->max_limit_freq = max_freq; cpudata->min_limit_freq = min_freq; - cpudata->nominal_freq = nominal_freq; - 
cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; policy->driver_data = cpudata; @@ -1317,6 +1311,10 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; + ret = amd_pstate_init_freq(cpudata); + if (ret) + goto free_cpudata1; + min_freq = amd_get_min_freq(cpudata); max_freq = amd_get_max_freq(cpudata); nominal_freq = amd_get_nominal_freq(cpudata); @@ -1333,12 +1331,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; - /* Initial processor data capability frequencies */ - cpudata->max_freq = max_freq; - cpudata->min_freq = min_freq; - cpudata->nominal_freq = nominal_freq; - cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; - policy->driver_data = cpudata; cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); From 3cbbe8871a2fb8f454e740f3e04ff2e29b573abe Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 25 Apr 2024 16:07:54 +0800 Subject: [PATCH 48/68] cpufreq: amd-pstate: Remove amd_get_{min,max,nominal,lowest_nonlinear}_freq() amd_get_{min,max,nominal,lowest_nonlinear}_freq() functions merely return cpudata->{min,max,nominal,lowest_nonlinear}_freq values. There is no loss in readability in replacing their invocations by accesses to the corresponding members of cpudata. Do so and remove these helper functions. Acked-by: Huang Rui Reviewed-by: Li Meng Tested-by: Dhananjay Ugwekar Signed-off-by: Gautham R. Shenoy Signed-off-by: Perry Yuan Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/amd-pstate.c | 40 +++++++++--------------------------- 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 891fad6f90e1..3836d62b54ef 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -604,26 +604,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, cpufreq_cpu_put(policy); } -static int amd_get_min_freq(struct amd_cpudata *cpudata) -{ - return READ_ONCE(cpudata->min_freq); -} - -static int amd_get_max_freq(struct amd_cpudata *cpudata) -{ - return READ_ONCE(cpudata->max_freq); -} - -static int amd_get_nominal_freq(struct amd_cpudata *cpudata) -{ - return READ_ONCE(cpudata->nominal_freq); -} - -static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) -{ - return READ_ONCE(cpudata->lowest_nonlinear_freq); -} - static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) { struct amd_cpudata *cpudata = policy->driver_data; @@ -854,10 +834,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; - min_freq = amd_get_min_freq(cpudata); - max_freq = amd_get_max_freq(cpudata); - nominal_freq = amd_get_nominal_freq(cpudata); - lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); + min_freq = READ_ONCE(cpudata->min_freq); + max_freq = READ_ONCE(cpudata->max_freq); + nominal_freq = READ_ONCE(cpudata->nominal_freq); + lowest_nonlinear_freq = READ_ONCE(cpudata->lowest_nonlinear_freq); if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", @@ -960,7 +940,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, int max_freq; struct amd_cpudata *cpudata = policy->driver_data; - max_freq = amd_get_max_freq(cpudata); + max_freq = READ_ONCE(cpudata->max_freq); if (max_freq < 0) return max_freq; @@ -973,7 +953,7 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct 
cpufreq_policy *poli int freq; struct amd_cpudata *cpudata = policy->driver_data; - freq = amd_get_lowest_nonlinear_freq(cpudata); + freq = READ_ONCE(cpudata->lowest_nonlinear_freq); if (freq < 0) return freq; @@ -1315,10 +1295,10 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; - min_freq = amd_get_min_freq(cpudata); - max_freq = amd_get_max_freq(cpudata); - nominal_freq = amd_get_nominal_freq(cpudata); - lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); + min_freq = READ_ONCE(cpudata->min_freq); + max_freq = READ_ONCE(cpudata->max_freq); + nominal_freq = READ_ONCE(cpudata->nominal_freq); + lowest_nonlinear_freq = READ_ONCE(cpudata->lowest_nonlinear_freq); if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", min_freq, max_freq); From 2ddb8a3946d4d02115b2dca53c5493ff00536002 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 25 Apr 2024 16:07:55 +0800 Subject: [PATCH 49/68] cpufreq: amd-pstate: Bail out if min/max/nominal_freq is 0 The amd-pstate driver cannot work when the min_freq, nominal_freq or the max_freq is zero. When this happens it is prudent to error out early on rather than waiting failing at the time of the governor initialization. Acked-by: Huang Rui Reviewed-by: Gautham R. Shenoy Tested-by: Dhananjay Ugwekar Signed-off-by: Perry Yuan Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/amd-pstate.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 3836d62b54ef..960fead0b83e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -839,9 +839,11 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) nominal_freq = READ_ONCE(cpudata->nominal_freq); lowest_nonlinear_freq = READ_ONCE(cpudata->lowest_nonlinear_freq); - if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { - dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", - min_freq, max_freq); + if (min_freq <= 0 || max_freq <= 0 || + nominal_freq <= 0 || min_freq > max_freq) { + dev_err(dev, + "min_freq(%d) or max_freq(%d) or nominal_freq (%d) value is incorrect, check _CPC in ACPI tables\n", + min_freq, max_freq, nominal_freq); ret = -EINVAL; goto free_cpudata1; } @@ -1299,9 +1301,11 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) max_freq = READ_ONCE(cpudata->max_freq); nominal_freq = READ_ONCE(cpudata->nominal_freq); lowest_nonlinear_freq = READ_ONCE(cpudata->lowest_nonlinear_freq); - if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { - dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", - min_freq, max_freq); + if (min_freq <= 0 || max_freq <= 0 || + nominal_freq <= 0 || min_freq > max_freq) { + dev_err(dev, + "min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect, check _CPC in ACPI tables\n", + min_freq, max_freq, nominal_freq); ret = -EINVAL; goto free_cpudata1; } From 069a2bb8c48c43176f2f0e6cae5efe2f39f6bdf2 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 25 Apr 2024 16:07:56 +0800 Subject: [PATCH 50/68] cpufreq: amd-pstate: get transition delay and latency value from ACPI tables Make pstate driver initially retrieve the P-state transition delay and latency values from the BIOS ACPI tables which has more reasonable delay and latency values according 
to the platform design and requirements. Previously these values were hardcoded to specific values, which may have conflicted with the platform and might not reflect the most accurate or optimized setting for the processor. [054h 0084 8] Preserve Mask : FFFFFFFF00000000 [05Ch 0092 8] Write Mask : 0000000000000001 [064h 0100 4] Command Latency : 00000FA0 [068h 0104 4] Maximum Access Rate : 0000EA60 [06Ch 0108 2] Minimum Turnaround Time : 0000 Reviewed-by: Gautham R. Shenoy Reviewed-by: Mario Limonciello Tested-by: Dhananjay Ugwekar Signed-off-by: Perry Yuan Acked-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 960fead0b83e..510b5aec42ea 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -756,6 +756,36 @@ free_cpufreq_put: mutex_unlock(&amd_pstate_driver_lock); } +/** + * Get pstate transition delay time from ACPI tables that firmware set + * instead of using hardcode value directly. + */ +static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) +{ + u32 transition_delay_ns; + + transition_delay_ns = cppc_get_transition_latency(cpu); + if (transition_delay_ns == CPUFREQ_ETERNAL) + return AMD_PSTATE_TRANSITION_DELAY; + + return transition_delay_ns / NSEC_PER_USEC; +} + +/** + * Get pstate transition latency value from ACPI tables that firmware + * set instead of using hardcode value directly. 
+ */ +static u32 amd_pstate_get_transition_latency(unsigned int cpu) +{ + u32 transition_latency; + + transition_latency = cppc_get_transition_latency(cpu); + if (transition_latency == CPUFREQ_ETERNAL) + return AMD_PSTATE_TRANSITION_LATENCY; + + return transition_latency; +} + /** * amd_pstate_init_freq: Initialize the max_freq, min_freq, * nominal_freq and lowest_nonlinear_freq for @@ -848,8 +878,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) goto free_cpudata1; } - policy->cpuinfo.transition_latency = AMD_PSTATE_TRANSITION_LATENCY; - policy->transition_delay_us = AMD_PSTATE_TRANSITION_DELAY; + policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu); + policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu); policy->min = min_freq; policy->max = max_freq; From 5f8f9bc4d7bc8d44031b88b548c2572b746e2611 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 25 Apr 2024 16:07:57 +0800 Subject: [PATCH 51/68] cppc_acpi: print error message if CPPC is unsupported The amd-pstate driver can fail when _CPC objects are not supported by the CPU. However, the current error message is ambiguous (see below) and there is no clear way of attributing the failure of the amd-pstate driver to the lack of CPPC support. [ 0.477523] amd_pstate: the _CPC object is not present in SBIOS or ACPI disabled Fix this by adding a debug message to notify the user if the amd-pstate driver failed to load due to CPPC not being supported by the CPU. Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Tested-by: Dhananjay Ugwekar Signed-off-by: Perry Yuan Acked-by: Huang Rui Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/cppc_acpi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 4bfbe55553f4..3134101f31b6 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -686,8 +686,10 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) if (!osc_sb_cppc2_support_acked) { pr_debug("CPPC v2 _OSC not acked\n"); - if (!cpc_supported_by_cpu()) + if (!cpc_supported_by_cpu()) { + pr_debug("CPPC is not supported by the CPU\n"); return -ENODEV; + } } /* Parse the ACPI _CPC table for this CPU. */ From eb8b6c36820214df96e7e86d8614d93f6b028f28 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 25 Apr 2024 16:07:58 +0800 Subject: [PATCH 52/68] cpufreq: amd-pstate: Add quirk for the pstate CPPC capabilities missing Add a quirks table to fix CPPC capabilities issues by providing correct perf or frequency values while the driver is loading. If CPPC capabilities are not defined in the ACPI tables or are wrongly defined by platform firmware, a quirk is needed to fix those issues with correct workaround values so that the pstate driver can be loaded even though there are CPPC capabilities errors. The workaround will match the broken BIOS which lacks the CPPC capabilities nominal_freq and lowest_freq definitions in the ACPI table. $ cat /sys/devices/system/cpu/cpu0/acpi_cppc/lowest_freq 0 $ cat /sys/devices/system/cpu/cpu0/acpi_cppc/nominal_freq 0 Acked-by: Huang Rui Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Tested-by: Dhananjay Ugwekar Signed-off-by: Perry Yuan Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/amd-pstate.c | 53 ++++++++++++++++++++++++++++++++++-- include/linux/amd-pstate.h | 6 ++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 510b5aec42ea..83a29b257794 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -67,6 +67,7 @@ static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; static bool cppc_enabled; static bool amd_pstate_prefcore = true; +static struct quirk_entry *quirks; /* * AMD Energy Preference Performance (EPP) @@ -111,6 +112,41 @@ static unsigned int epp_values[] = { typedef int (*cppc_mode_transition_fn)(int); +static struct quirk_entry quirk_amd_7k62 = { + .nominal_freq = 2600, + .lowest_freq = 550, +}; + +static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) +{ + /** + * match the broken bios for family 17h processor support CPPC V2 + * broken BIOS lack of nominal_freq and lowest_freq capabilities + * definition in ACPI tables + */ + if (boot_cpu_has(X86_FEATURE_ZEN2)) { + quirks = dmi->driver_data; + pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident); + return 1; + } + + return 0; +} + +static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = { + { + .callback = dmi_matched_7k62_bios_bug, + .ident = "AMD EPYC 7K62", + .matches = { + DMI_MATCH(DMI_BIOS_VERSION, "5.14"), + DMI_MATCH(DMI_BIOS_RELEASE, "12/12/2019"), + }, + .driver_data = &quirk_amd_7k62, + }, + {} +}; +MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table); + static inline int get_mode_idx_from_str(const char *str, size_t size) { int i; @@ -812,8 +848,16 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) if (ret) return ret; - min_freq = cppc_perf.lowest_freq * 1000; - nominal_freq = cppc_perf.nominal_freq; + if (quirks && quirks->lowest_freq) + min_freq = quirks->lowest_freq * 1000; + else + min_freq = cppc_perf.lowest_freq * 
1000; + + if (quirks && quirks->nominal_freq) + nominal_freq = quirks->nominal_freq ; + else + nominal_freq = cppc_perf.nominal_freq; + nominal_perf = READ_ONCE(cpudata->nominal_perf); highest_perf = READ_ONCE(cpudata->highest_perf); @@ -1662,6 +1706,11 @@ static int __init amd_pstate_init(void) if (cpufreq_get_current_driver()) return -EEXIST; + quirks = NULL; + + /* check if this machine need CPPC quirks */ + dmi_check_system(amd_pstate_quirks_table); + switch (cppc_state) { case AMD_PSTATE_UNDEFINED: /* Disable on the following configs by default: diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index ec0b0fa3e9bb..d58fc022ec46 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -128,4 +128,10 @@ static const char * const amd_pstate_mode_string[] = { [AMD_PSTATE_GUIDED] = "guided", NULL, }; + +struct quirk_entry { + u32 nominal_freq; + u32 lowest_freq; +}; + #endif /* _LINUX_AMD_PSTATE_H */ From 5131a3ca3518d726cb535543441a5f195b8b0299 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Tue, 30 Apr 2024 15:48:56 +0800 Subject: [PATCH 53/68] cpufreq: amd-pstate: fix code format problems get some code format problems fixed in the amd-pstate driver. Changes Made: - Fixed incorrect comment format in the functions. - Removed unnecessary blank line. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404271148.HK9yHBlB-lkp@intel.com/ Signed-off-by: Perry Yuan Reviewed-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 83a29b257794..85656342a101 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -792,7 +792,7 @@ free_cpufreq_put: mutex_unlock(&amd_pstate_driver_lock); } -/** +/* * Get pstate transition delay time from ACPI tables that firmware set * instead of using hardcode value directly. 
*/ @@ -807,7 +807,7 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) return transition_delay_ns / NSEC_PER_USEC; } -/** +/* * Get pstate transition latency value from ACPI tables that firmware * set instead of using hardcode value directly. */ @@ -822,7 +822,7 @@ static u32 amd_pstate_get_transition_latency(unsigned int cpu) return transition_latency; } -/** +/* * amd_pstate_init_freq: Initialize the max_freq, min_freq, * nominal_freq and lowest_nonlinear_freq for * the @cpudata object. @@ -843,7 +843,6 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) u32 boost_ratio, lowest_nonlinear_ratio; struct cppc_perf_caps cppc_perf; - ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); if (ret) return ret; From 5c3fd1edaa8b4c093e877f6354b3745178015070 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Tue, 30 Apr 2024 15:48:57 +0800 Subject: [PATCH 54/68] cpufreq: amd-pstate: remove unused variable lowest_nonlinear_freq removed the unused variable `lowest_nonlinear_freq` for build warning. This variable was defined and assigned a value in the previous code, but it was not used in the subsequent code. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404271038.em6nJjzy-lkp@intel.com/ Signed-off-by: Perry Yuan Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/amd-pstate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 85656342a101..2db095867d03 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -878,7 +878,7 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { - int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; + int min_freq, max_freq, nominal_freq, ret; struct device *dev; struct amd_cpudata *cpudata; @@ -910,7 +910,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) min_freq = READ_ONCE(cpudata->min_freq); max_freq = READ_ONCE(cpudata->max_freq); nominal_freq = READ_ONCE(cpudata->nominal_freq); - lowest_nonlinear_freq = READ_ONCE(cpudata->lowest_nonlinear_freq); if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) { @@ -1339,7 +1338,7 @@ static bool amd_pstate_acpi_pm_profile_undefined(void) static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) { - int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; + int min_freq, max_freq, nominal_freq, ret; struct amd_cpudata *cpudata; struct device *dev; u64 value; @@ -1373,7 +1372,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) min_freq = READ_ONCE(cpudata->min_freq); max_freq = READ_ONCE(cpudata->max_freq); nominal_freq = READ_ONCE(cpudata->nominal_freq); - lowest_nonlinear_freq = READ_ONCE(cpudata->lowest_nonlinear_freq); if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) { dev_err(dev, From 70f83f525304079746e53027963cd732543e11c8 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Mon, 29 Apr 2024 19:33:56 +0800 Subject: [PATCH 55/68] MAINTAINERS: cpufreq: amd-pstate: Add co-maintainers and reviewer I'm happy to add Gautham and Mario as the co-maintainers, Perry as the reviewer for amd-pstate driver. 
Signed-off-by: Huang Rui Acked-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d..8c141c89eeaf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1062,6 +1062,9 @@ F: drivers/gpu/drm/amd/pm/ AMD PSTATE DRIVER M: Huang Rui +M: Gautham R. Shenoy +M: Mario Limonciello +R: Perry Yuan L: linux-pm@vger.kernel.org S: Supported F: Documentation/admin-guide/pm/amd-pstate.rst From a2bd1d268e5d6411ddf3a10cdd3d964aad621cab Mon Sep 17 00:00:00 2001 From: Joshua Yeong Date: Thu, 25 Apr 2024 19:00:17 +0800 Subject: [PATCH 56/68] cpufreq: Fix up printing large CPU numbers and frequency values A negative CPU number or frequency value may be printed if they are really large (which is unlikely, though). Signed-off-by: Joshua Yeong Reviewed-by: Thorsten Blum Acked-by: Viresh Kumar [ rjw: Subject and changelog edits. ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/freq_table.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 40e146942f3e..10e80d912b8d 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -194,7 +194,7 @@ int cpufreq_table_index_unsorted(struct cpufreq_policy *policy, } if (optimal.driver_data > i) { if (suboptimal.driver_data > i) { - WARN(1, "Invalid frequency table: %d\n", policy->cpu); + WARN(1, "Invalid frequency table: %u\n", policy->cpu); return 0; } @@ -254,7 +254,7 @@ static ssize_t show_available_freqs(struct cpufreq_policy *policy, char *buf, if (show_boost ^ (pos->flags & CPUFREQ_BOOST_FREQ)) continue; - count += sprintf(&buf[count], "%d ", pos->frequency); + count += sprintf(&buf[count], "%u ", pos->frequency); } count += sprintf(&buf[count], "\n"); From 7b831bd3cf322fdacd07f321d6d7297914ed79bc Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 29 Apr 2024 20:50:30 +0000 Subject: [PATCH 57/68] PM: hibernate: replace 
deprecated strncpy() with strscpy() strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. This kernel config option is simply assigned with the resume_file buffer. It should be NUL-terminated but not necessarily NUL-padded as per its further usage with other string apis: | static int __init find_resume_device(void) | { | if (!strlen(resume_file)) | return -ENOENT; | | pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); Use strscpy() [2] as it guarantees NUL-termination on the destination buffer. Specifically, use the new 2-argument version of strscpy() introduced in Commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2] Link: https://github.com/KSPP/linux/issues/90 Signed-off-by: Justin Stitt Reviewed-by: Kees Cook Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 43b1a82e800c..0a213f69a9e4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1361,7 +1361,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy(resume_file, str, 255); + strscpy(resume_file, str); return 1; } From 575024a8aa7cf1dff49b94092f774ed1c90586be Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 28 Apr 2024 17:24:26 +0800 Subject: [PATCH 58/68] powercap: intel_rapl: Introduce APIs for PMU support Introduce two new APIs rapl_package_add_pmu()/rapl_package_remove_pmu(). RAPL driver can invoke these APIs to expose its supported energy counters via perf PMU. 
The new RAPL PMU is fully compatible with current MSR RAPL PMU, including using the same PMU name and events name/id/unit/scale, etc. For example, use below command perf stat -e power/energy-pkg/ -e power/energy-ram/ FOO to get the energy consumption if power/energy-pkg/ and power/energy-ram/ events are available in the "perf list" output. This does not introduce any conflict because TPMI RAPL is the only user of these APIs currently, and it never co-exists with MSR RAPL. Note that RAPL Packages can be probed/removed dynamically, and the events supported by each TPMI RAPL device can be different. Thus the RAPL PMU support is done on demand, which means 1. PMU is registered only if it is needed by a RAPL Package. PMU events for unsupported counters are not exposed. 2. PMU is unregistered and registered when a new RAPL Package is probed and supports new counters that are not supported by current PMU. For example, on a dual-package system using TPMI RAPL, it is possible that Package 1 behaves as TPMI domain root and supports Psys domain. In this case, register PMU without Psys event when probing Package 0, and re-register the PMU with Psys event when probing Package 1. 3. PMU is unregistered when all registered RAPL Packages don't need PMU. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 582 +++++++++++++++++++++++++++ include/linux/intel_rapl.h | 32 ++ 2 files changed, 614 insertions(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index c4302caeb631..aac0744011a3 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -1507,6 +1509,586 @@ static int rapl_detect_domains(struct rapl_package *rp) return 0; } +#ifdef CONFIG_PERF_EVENTS + +/* + * Support for RAPL PMU + * + * Register a PMU if any of the registered RAPL Packages have the requirement + * of exposing its energy counters via Perf PMU. + * + * PMU Name: + * power + * + * Events: + * Name Event id RAPL Domain + * energy_cores 0x01 RAPL_DOMAIN_PP0 + * energy_pkg 0x02 RAPL_DOMAIN_PACKAGE + * energy_ram 0x03 RAPL_DOMAIN_DRAM + * energy_gpu 0x04 RAPL_DOMAIN_PP1 + * energy_psys 0x05 RAPL_DOMAIN_PLATFORM + * + * Unit: + * Joules + * + * Scale: + * 2.3283064365386962890625e-10 + * The same RAPL domain in different RAPL Packages may have different + * energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as + * the fixed unit for all energy counters, and convert each hardware + * counter increase to N times of PMU event counter increases. + * + * This is fully compatible with the current MSR RAPL PMU. This means that + * userspace programs like turbostat can use the same code to handle RAPL Perf + * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running + * underlying on the platform. + * + * Note that RAPL Packages can be probed/removed dynamically, and the events + * supported by each TPMI RAPL device can be different. Thus the RAPL PMU + * support is done on demand, which means + * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for + * unsupported counters are not exposed. + * 2.
PMU is unregistered and registered when a new RAPL Package is probed and + * supports new counters that are not supported by current PMU. + * 3. PMU is unregistered when all registered RAPL Packages don't need PMU. + */ + +struct rapl_pmu { + struct pmu pmu; /* Perf PMU structure */ + u64 timer_ms; /* Maximum expiration time to avoid counter overflow */ + unsigned long domain_map; /* Events supported by current registered PMU */ + bool registered; /* Whether the PMU has been registered or not */ +}; + +static struct rapl_pmu rapl_pmu; + +/* PMU helpers */ + +static int get_pmu_cpu(struct rapl_package *rp) +{ + int cpu; + + if (!rp->has_pmu) + return nr_cpu_ids; + + /* Only TPMI RAPL is supported for now */ + if (rp->priv->type != RAPL_IF_TPMI) + return nr_cpu_ids; + + /* TPMI RAPL uses any CPU in the package for PMU */ + for_each_online_cpu(cpu) + if (topology_physical_package_id(cpu) == rp->id) + return cpu; + + return nr_cpu_ids; +} + +static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu) +{ + if (!rp->has_pmu) + return false; + + /* Only TPMI RAPL is supported for now */ + if (rp->priv->type != RAPL_IF_TPMI) + return false; + + /* TPMI RAPL uses any CPU in the package for PMU */ + return topology_physical_package_id(cpu) == rp->id; +} + +static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event) +{ + struct rapl_package *rp = event->pmu_private; + + return &rp->pmu_data; +} + +/* PMU event callbacks */ + +static u64 event_read_counter(struct perf_event *event) +{ + struct rapl_package *rp = event->pmu_private; + u64 val; + int ret; + + /* Return 0 for unsupported events */ + if (event->hw.idx < 0) + return 0; + + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); + + /* Return 0 for failed read */ + if (ret) + return 0; + + return val; +} + +static void __rapl_pmu_event_start(struct perf_event *event) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + + if 
(WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &data->active_list); + + local64_set(&event->hw.prev_count, event_read_counter(event)); + if (++data->n_active == 1) + hrtimer_start(&data->hrtimer, data->timer_interval, + HRTIMER_MODE_REL_PINNED); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + __rapl_pmu_event_start(event); + raw_spin_unlock_irqrestore(&data->lock, flags); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + + /* + * Follow the generic code to drain hwc->prev_count. + * The loop is not expected to run for multiple times. + */ + prev_raw_count = local64_read(&hwc->prev_count); + do { + new_raw_count = event_read_counter(event); + } while (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)); + + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + */ + delta = new_raw_count - prev_raw_count; + + /* + * Scale delta to smallest unit (2^-32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). 
+ * Watts = Joules/Time delta + */ + sdelta = delta * data->scale[event->hw.flags]; + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + + /* Mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(data->n_active <= 0); + if (--data->n_active == 0) + hrtimer_cancel(&data->hrtimer); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* Check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + raw_spin_unlock_irqrestore(&data->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(event); + + raw_spin_unlock_irqrestore(&data->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +/* RAPL PMU event ids, same as shown in sysfs */ +enum perf_rapl_events { + PERF_RAPL_PP0 = 1, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + PERF_RAPL_MAX +}; +#define RAPL_EVENT_MASK GENMASK(7, 0) + +static const int event_to_domain[PERF_RAPL_MAX] = { + [PERF_RAPL_PP0] = RAPL_DOMAIN_PP0, + 
[PERF_RAPL_PKG] = RAPL_DOMAIN_PACKAGE, + [PERF_RAPL_RAM] = RAPL_DOMAIN_DRAM, + [PERF_RAPL_PP1] = RAPL_DOMAIN_PP1, + [PERF_RAPL_PSYS] = RAPL_DOMAIN_PLATFORM, +}; + +static int rapl_pmu_event_init(struct perf_event *event) +{ + struct rapl_package *pos, *rp = NULL; + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int domain, idx; + + /* Only look at RAPL events */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Check for supported events only */ + if (!cfg || cfg >= PERF_RAPL_MAX) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + /* Find out which Package the event belongs to */ + list_for_each_entry(pos, &rapl_packages, plist) { + if (is_rp_pmu_cpu(pos, event->cpu)) { + rp = pos; + break; + } + } + if (!rp) + return -ENODEV; + + /* Find out which RAPL Domain the event belongs to */ + domain = event_to_domain[cfg]; + + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->pmu_private = rp; /* Which package */ + event->hw.flags = domain; /* Which domain */ + + event->hw.idx = -1; + /* Find out the index in rp->domains[] to get domain pointer */ + for (idx = 0; idx < rp->nr_domains; idx++) { + if (rp->domains[idx].id == domain) { + event->hw.idx = idx; + break; + } + } + + return 0; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_package_pmu_data *data = + container_of(hrtimer, struct rapl_package_pmu_data, hrtimer); + struct perf_event *event; + unsigned long flags; + + if (!data->n_active) + return HRTIMER_NORESTART; + + raw_spin_lock_irqsave(&data->lock, flags); + + list_for_each_entry(event, &data->active_list, active_entry) + rapl_event_update(event); + + raw_spin_unlock_irqrestore(&data->lock, flags); + + hrtimer_forward_now(hrtimer, data->timer_interval); + + return HRTIMER_RESTART; +} + +/* PMU sysfs attributes */ + +/* + * There are no default events, but we need to create 
"events" group (with + * empty attrs) before updating it with detected events. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group pmu_events_group = { + .name = "events", + .attrs = attrs_empty, +}; + +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rapl_package *rp; + cpumask_var_t cpu_mask; + int cpu; + int ret; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + cpus_read_lock(); + + cpumask_clear(cpu_mask); + + /* Choose a cpu for each RAPL Package */ + list_for_each_entry(rp, &rapl_packages, plist) { + cpu = get_pmu_cpu(rp); + if (cpu < nr_cpu_ids) + cpumask_set_cpu(cpu, cpu_mask); + } + cpus_read_unlock(); + + ret = cpumap_print_to_pagebuf(true, buf, cpu_mask); + + free_cpumask_var(cpu_mask); + + return ret; +} + +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static struct attribute_group pmu_cpumask_group = { + .attrs = pmu_cpumask_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *pmu_format_attr[] = { + &format_attr_event.attr, + NULL +}; + +static struct attribute_group pmu_format_group = { + .name = "format", + .attrs = pmu_format_attr, +}; + +static const struct attribute_group *pmu_attr_groups[] = { + &pmu_events_group, + &pmu_cpumask_group, + &pmu_format_group, + NULL +}; + +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .event_str = str, \ +} + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04"); +RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); + +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules"); 
+RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules"); +RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules"); + +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10"); + +#define RAPL_EVENT_GROUP(_name, domain) \ +static struct attribute *pmu_attr_##_name[] = { \ + &event_attr_rapl_##_name.attr.attr, \ + &event_attr_rapl_unit_##_name.attr.attr, \ + &event_attr_rapl_scale_##_name.attr.attr, \ + NULL \ +}; \ +static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \ +{ \ + return rapl_pmu.domain_map & BIT(domain) ? 
attr->mode : 0; \ +} \ +static struct attribute_group pmu_group_##_name = { \ + .name = "events", \ + .attrs = pmu_attr_##_name, \ + .is_visible = is_visible_##_name, \ +} + +RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0); +RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE); +RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM); +RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1); +RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM); + +static const struct attribute_group *pmu_attr_update[] = { + &pmu_group_cores, + &pmu_group_pkg, + &pmu_group_ram, + &pmu_group_gpu, + &pmu_group_psys, + NULL +}; + +static int rapl_pmu_update(struct rapl_package *rp) +{ + int ret = 0; + + /* Return if PMU already covers all events supported by current RAPL Package */ + if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map))) + goto end; + + /* Unregister previous registered PMU */ + if (rapl_pmu.registered) + perf_pmu_unregister(&rapl_pmu.pmu); + + rapl_pmu.registered = false; + rapl_pmu.domain_map |= rp->domain_map; + + memset(&rapl_pmu.pmu, 0, sizeof(struct pmu)); + rapl_pmu.pmu.attr_groups = pmu_attr_groups; + rapl_pmu.pmu.attr_update = pmu_attr_update; + rapl_pmu.pmu.task_ctx_nr = perf_invalid_context; + rapl_pmu.pmu.event_init = rapl_pmu_event_init; + rapl_pmu.pmu.add = rapl_pmu_event_add; + rapl_pmu.pmu.del = rapl_pmu_event_del; + rapl_pmu.pmu.start = rapl_pmu_event_start; + rapl_pmu.pmu.stop = rapl_pmu_event_stop; + rapl_pmu.pmu.read = rapl_pmu_event_read; + rapl_pmu.pmu.module = THIS_MODULE; + rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT; + ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1); + if (ret) { + pr_info("Failed to register PMU\n"); + return ret; + } + + rapl_pmu.registered = true; +end: + rp->has_pmu = true; + return ret; +} + +int rapl_package_add_pmu(struct rapl_package *rp) +{ + struct rapl_package_pmu_data *data = &rp->pmu_data; + int idx; + + if (rp->has_pmu) + return -EEXIST; + + guard(cpus_read_lock)(); + + for (idx = 0; idx < rp->nr_domains; idx++) { 
+ struct rapl_domain *rd = &rp->domains[idx]; + int domain = rd->id; + u64 val; + + if (!test_bit(domain, &rp->domain_map)) + continue; + + /* + * The RAPL PMU granularity is 2^-32 Joules + * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase + */ + val = rd->energy_unit * (1ULL << 32); + do_div(val, ENERGY_UNIT_SCALE * 1000000); + data->scale[domain] = val; + + if (!rapl_pmu.timer_ms) { + struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER); + + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter + * overflows. + * + * max_count = rpi->mask >> rpi->shift + 1 + * max_energy_pj = max_count * rd->energy_unit + * max_time_sec = (max_energy_pj / 1000000000) / 200w + * + * rapl_pmu.timer_ms = max_time_sec * 1000 / 2 + */ + val = (rpi->mask >> rpi->shift) + 1; + val *= rd->energy_unit; + do_div(val, 1000000 * 200 * 2); + rapl_pmu.timer_ms = val; + + pr_debug("%llu ms overflow timer\n", rapl_pmu.timer_ms); + } + + pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]); + } + + /* Initialize per package PMU data */ + raw_spin_lock_init(&data->lock); + INIT_LIST_HEAD(&data->active_list); + data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms); + hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + data->hrtimer.function = rapl_hrtimer_handle; + + return rapl_pmu_update(rp); +} +EXPORT_SYMBOL_GPL(rapl_package_add_pmu); + +void rapl_package_remove_pmu(struct rapl_package *rp) +{ + struct rapl_package *pos; + + if (!rp->has_pmu) + return; + + guard(cpus_read_lock)(); + + list_for_each_entry(pos, &rapl_packages, plist) { + /* PMU is still needed */ + if (pos->has_pmu && pos != rp) + return; + } + + perf_pmu_unregister(&rapl_pmu.pmu); + memset(&rapl_pmu, 0, sizeof(struct rapl_pmu)); +} +EXPORT_SYMBOL_GPL(rapl_package_remove_pmu); +#endif + /* called from CPU hotplug notifier, hotplug lock held */ void rapl_remove_package_cpuslocked(struct rapl_package *rp) { 
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index f3196f82fd8a..c0397423d3a8 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -158,6 +158,26 @@ struct rapl_if_priv { void *rpi; }; +#ifdef CONFIG_PERF_EVENTS +/** + * struct rapl_package_pmu_data: Per package data for PMU support + * @scale: Scale of 2^-32 Joules for each energy counter increase. + * @lock: Lock to protect n_active and active_list. + * @n_active: Number of active events. + * @active_list: List of active events. + * @timer_interval: Maximum timer expiration time before counter overflow. + * @hrtimer: Periodically update the counter to prevent overflow. + */ +struct rapl_package_pmu_data { + u64 scale[RAPL_DOMAIN_MAX]; + raw_spinlock_t lock; + int n_active; + struct list_head active_list; + ktime_t timer_interval; + struct hrtimer hrtimer; +}; +#endif + /* maximum rapl package domain name: package-%d-die-%d */ #define PACKAGE_DOMAIN_NAME_LENGTH 30 @@ -176,6 +196,10 @@ struct rapl_package { struct cpumask cpumask; char name[PACKAGE_DOMAIN_NAME_LENGTH]; struct rapl_if_priv *priv; +#ifdef CONFIG_PERF_EVENTS + bool has_pmu; + struct rapl_package_pmu_data pmu_data; +#endif }; struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv, @@ -188,4 +212,12 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +#ifdef CONFIG_PERF_EVENTS +int rapl_package_add_pmu(struct rapl_package *rp); +void rapl_package_remove_pmu(struct rapl_package *rp); +#else +static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; } +static inline void rapl_package_remove_pmu(struct rapl_package *rp) { } +#endif + #endif /* __INTEL_RAPL_H__ */ From 963a9ad3c589dc0f922697faea53c69098083945 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 28 Apr 2024 17:24:27 +0800 
Subject: [PATCH 59/68] powercap: intel_rapl_tpmi: Enable PMU support Enable RAPL PMU support for TPMI RAPL driver. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_tpmi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index f6b7f085977c..947544e4d229 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -302,6 +302,8 @@ static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev, goto err; } + rapl_package_add_pmu(trp->rp); + auxiliary_set_drvdata(auxdev, trp); return 0; @@ -314,6 +316,7 @@ static void intel_rapl_tpmi_remove(struct auxiliary_device *auxdev) { struct tpmi_rapl_package *trp = auxiliary_get_drvdata(auxdev); + rapl_package_remove_pmu(trp->rp); rapl_remove_package(trp->rp); trp_release(trp); } From 774459238f80f914bb133099239f3a150892d342 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 5 May 2024 17:42:16 -0700 Subject: [PATCH 60/68] cpuidle: ladder: fix ladder_do_selection() kernel-doc make C=1 reports: warning: Function parameter or struct member 'dev' not described in 'ladder_do_selection' Document 'dev' for this function. Signed-off-by: Jeff Johnson Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/ladder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 8e9058c4ea63..6617eb494a11 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -44,6 +44,7 @@ static DEFINE_PER_CPU(struct ladder_device, ladder_devices); /** * ladder_do_selection - prepares private data for a state change + * @dev: the CPU * @ldev: the ladder device * @old_idx: the current state index * @new_idx: the new target state index From 0a206fe35d360a9ec1c8b1609ca394c2759a8962 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 5 May 2024 12:07:12 -0700 Subject: [PATCH 61/68] cpufreq: intel_pstate: fix struct cpudata::epp_cached kernel-doc make C=1 currently gives the following warning: drivers/cpufreq/intel_pstate.c:262: warning: Function parameter or struct member 'epp_cached' not described in 'cpudata' Add the missing ":" to fix the trivial kernel-doc syntax error. Signed-off-by: Jeff Johnson Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 5f19d3824a4b..4b986c044741 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -211,7 +211,7 @@ struct global_params { * @epp_policy: Last saved policy used to set EPP/EPB * @epp_default: Power on default HWP energy performance * preference/bias - * @epp_cached Cached HWP energy-performance preference value + * @epp_cached: Cached HWP energy-performance preference value * @hwp_req_cached: Cached value of the last HWP Request MSR * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR * @last_io_update: Last time when IO wake flag was set From bf202e654bfa57fb8cf9d93d4c6855890b70b9c4 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Wed, 8 May 2024 13:47:03 +0800 Subject: [PATCH 62/68] cpufreq: amd-pstate: fix the highest frequency issue which limits performance To address the performance drop issue, an optimization has been implemented. The incorrect highest performance value previously set by the low-level power firmware for AMD CPUs with Family ID 0x19 and Model ID ranging from 0x70 to 0x7F series has been identified as the cause. To resolve this, a check has been implemented to accurately determine the CPU family and model ID. The correct highest performance value is now set and the performance drop caused by the incorrect highest performance value is eliminated. Before the fix, the highest frequency was set to 4200MHz, now it is set to 4971MHz which is correct.
CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ 0 0 0 0 0:0:0:0 yes 4971.0000 400.0000 400.0000 1 0 0 0 0:0:0:0 yes 4971.0000 400.0000 400.0000 2 0 0 1 1:1:1:0 yes 4971.0000 400.0000 4865.8140 3 0 0 1 1:1:1:0 yes 4971.0000 400.0000 400.0000 Fixes: f3a052391822 ("cpufreq: amd-pstate: Enable amd-pstate preferred core support") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218759 Signed-off-by: Perry Yuan Co-developed-by: Mario Limonciello Signed-off-by: Mario Limonciello Tested-by: Gaha Bana Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2db095867d03..6a342b0c0140 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -50,7 +50,8 @@ #define AMD_PSTATE_TRANSITION_LATENCY 20000 #define AMD_PSTATE_TRANSITION_DELAY 1000 -#define AMD_PSTATE_PREFCORE_THRESHOLD 166 +#define CPPC_HIGHEST_PERF_PERFORMANCE 196 +#define CPPC_HIGHEST_PERF_DEFAULT 166 /* * TODO: We need more time to fine tune processors with shared memory solution @@ -326,6 +327,21 @@ static inline int amd_pstate_enable(bool enable) return static_call(amd_pstate_enable)(enable); } +static u32 amd_pstate_highest_perf_set(struct amd_cpudata *cpudata) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + /* + * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, + * the highest performance level is set to 196. + * https://bugzilla.kernel.org/show_bug.cgi?id=218759 + */ + if (c->x86 == 0x19 && (c->x86_model >= 0x70 && c->x86_model <= 0x7f)) + return CPPC_HIGHEST_PERF_PERFORMANCE; + + return CPPC_HIGHEST_PERF_DEFAULT; +} + static int pstate_init_perf(struct amd_cpudata *cpudata) { u64 cap1; @@ -342,7 +358,7 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) * the default max perf. 
*/ if (cpudata->hw_prefcore) - highest_perf = AMD_PSTATE_PREFCORE_THRESHOLD; + highest_perf = amd_pstate_highest_perf_set(cpudata); else highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); @@ -366,7 +382,7 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) return ret; if (cpudata->hw_prefcore) - highest_perf = AMD_PSTATE_PREFCORE_THRESHOLD; + highest_perf = amd_pstate_highest_perf_set(cpudata); else highest_perf = cppc_perf.highest_perf; From 45d8b572fac3aa8b49d53c946b3685eaf78a2824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Mar 2024 22:28:39 +0100 Subject: [PATCH 63/68] PM / devfreq: exynos-nocp: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Chanwoo Choi --- drivers/devfreq/event/exynos-nocp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/devfreq/event/exynos-nocp.c b/drivers/devfreq/event/exynos-nocp.c index c1cc23bcb995..5edc522f715c 100644 --- a/drivers/devfreq/event/exynos-nocp.c +++ b/drivers/devfreq/event/exynos-nocp.c @@ -275,18 +275,16 @@ static int exynos_nocp_probe(struct platform_device *pdev) return 0; } -static int exynos_nocp_remove(struct platform_device *pdev) +static void exynos_nocp_remove(struct platform_device *pdev) { struct exynos_nocp *nocp = platform_get_drvdata(pdev); clk_disable_unprepare(nocp->clk); - - return 0; } static struct platform_driver exynos_nocp_driver = { .probe = exynos_nocp_probe, - .remove = exynos_nocp_remove, + .remove_new = exynos_nocp_remove, .driver = { .name = "exynos-nocp", .of_match_table = exynos_nocp_id_match, From 177e15dfbcaa5b3944f4586e6d42e96920b81db9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Mar 2024 22:28:40 +0100 Subject: [PATCH 64/68] PM / devfreq: exynos-ppmu: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Chanwoo Choi --- drivers/devfreq/event/exynos-ppmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c index 56bac4702006..7002df20a49e 100644 --- a/drivers/devfreq/event/exynos-ppmu.c +++ b/drivers/devfreq/event/exynos-ppmu.c @@ -692,18 +692,16 @@ static int exynos_ppmu_probe(struct platform_device *pdev) return 0; } -static int exynos_ppmu_remove(struct platform_device *pdev) +static void exynos_ppmu_remove(struct platform_device *pdev) { struct exynos_ppmu *info = platform_get_drvdata(pdev); clk_disable_unprepare(info->ppmu.clk); - - return 0; } static struct platform_driver exynos_ppmu_driver = { .probe = exynos_ppmu_probe, - .remove = exynos_ppmu_remove, + .remove_new = exynos_ppmu_remove, .driver = { .name = "exynos-ppmu", .of_match_table = exynos_ppmu_id_match, From 0df0258600c61df406aa0ad0abd514941ce62218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Mar 2024 22:28:41 +0100 Subject: [PATCH 65/68] PM / devfreq: mtk-cci: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Chanwoo Choi --- drivers/devfreq/mtk-cci-devfreq.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/devfreq/mtk-cci-devfreq.c b/drivers/devfreq/mtk-cci-devfreq.c index 11bc3d03494c..7ad5225b0381 100644 --- a/drivers/devfreq/mtk-cci-devfreq.c +++ b/drivers/devfreq/mtk-cci-devfreq.c @@ -392,7 +392,7 @@ out_free_resources: return ret; } -static int mtk_ccifreq_remove(struct platform_device *pdev) +static void mtk_ccifreq_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct mtk_ccifreq_drv *drv; @@ -405,8 +405,6 @@ static int mtk_ccifreq_remove(struct platform_device *pdev) regulator_disable(drv->proc_reg); if (drv->sram_reg) regulator_disable(drv->sram_reg); - - return 0; } static const struct mtk_ccifreq_platform_data mt8183_platform_data = { @@ -432,7 +430,7 @@ MODULE_DEVICE_TABLE(of, mtk_ccifreq_machines); static struct platform_driver mtk_ccifreq_platdrv = { .probe = mtk_ccifreq_probe, - .remove = mtk_ccifreq_remove, + .remove_new = mtk_ccifreq_remove, .driver = { .name = "mtk-ccifreq", .of_match_table = mtk_ccifreq_machines, From 14532a01feb063d7dd08ca1749a68b6f70ef2a62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Mar 2024 22:28:43 +0100 Subject: [PATCH 66/68] PM / devfreq: sun8i-a33-mbus: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. 
Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Chanwoo Choi --- drivers/devfreq/sun8i-a33-mbus.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/devfreq/sun8i-a33-mbus.c b/drivers/devfreq/sun8i-a33-mbus.c index 13d32213139f..bcf654f4ff96 100644 --- a/drivers/devfreq/sun8i-a33-mbus.c +++ b/drivers/devfreq/sun8i-a33-mbus.c @@ -458,7 +458,7 @@ err_disable_bus: return dev_err_probe(dev, ret, err); } -static int sun8i_a33_mbus_remove(struct platform_device *pdev) +static void sun8i_a33_mbus_remove(struct platform_device *pdev) { struct sun8i_a33_mbus *priv = platform_get_drvdata(pdev); unsigned long initial_freq = priv->profile.initial_freq; @@ -475,8 +475,6 @@ static int sun8i_a33_mbus_remove(struct platform_device *pdev) clk_rate_exclusive_put(priv->clk_mbus); clk_rate_exclusive_put(priv->clk_dram); clk_disable_unprepare(priv->clk_bus); - - return 0; } static const struct sun8i_a33_mbus_variant sun50i_a64_mbus = { @@ -497,7 +495,7 @@ static SIMPLE_DEV_PM_OPS(sun8i_a33_mbus_pm_ops, static struct platform_driver sun8i_a33_mbus_driver = { .probe = sun8i_a33_mbus_probe, - .remove = sun8i_a33_mbus_remove, + .remove_new = sun8i_a33_mbus_remove, .driver = { .name = "sun8i-a33-mbus", .of_match_table = sun8i_a33_mbus_of_match, From 8eba5b693442fcfc7bfdd6402cd191250ce3e276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Mar 2024 22:28:42 +0100 Subject: [PATCH 67/68] PM / devfreq: rk3399_dmc: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. 
However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Chanwoo Choi --- drivers/devfreq/rk3399_dmc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c index fd2c5ffedf41..d405cee92c25 100644 --- a/drivers/devfreq/rk3399_dmc.c +++ b/drivers/devfreq/rk3399_dmc.c @@ -459,13 +459,11 @@ err_edev: return ret; } -static int rk3399_dmcfreq_remove(struct platform_device *pdev) +static void rk3399_dmcfreq_remove(struct platform_device *pdev) { struct rk3399_dmcfreq *dmcfreq = dev_get_drvdata(&pdev->dev); devfreq_event_disable_edev(dmcfreq->edev); - - return 0; } static const struct of_device_id rk3399dmc_devfreq_of_match[] = { @@ -476,7 +474,7 @@ MODULE_DEVICE_TABLE(of, rk3399dmc_devfreq_of_match); static struct platform_driver rk3399_dmcfreq_driver = { .probe = rk3399_dmcfreq_probe, - .remove = rk3399_dmcfreq_remove, + .remove_new = rk3399_dmcfreq_remove, .driver = { .name = "rk3399-dmc-freq", .pm = &rk3399_dmcfreq_pm, From ccad360a2d415447bd6f0de9e873eec05442d159 Mon Sep 17 00:00:00 2001 From: Anand Moon Date: Wed, 17 Apr 2024 10:14:48 +0530 Subject: [PATCH 68/68] PM / devfreq: exynos: Use DEFINE_SIMPLE_DEV_PM_OPS for PM functions This macro has the advantage over SET_SYSTEM_SLEEP_PM_OPS that we don't have to care about when the functions are actually used. Also make use of pm_sleep_ptr() to discard all PM_SLEEP related stuff if CONFIG_PM_SLEEP isn't enabled. 
Link: https://lore.kernel.org/lkml/20240417044459.1908-2-linux.amoon@gmail.com/ Signed-off-by: Anand Moon Signed-off-by: Chanwoo Choi --- drivers/devfreq/exynos-bus.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c index 245898f1a88e..00118580905a 100644 --- a/drivers/devfreq/exynos-bus.c +++ b/drivers/devfreq/exynos-bus.c @@ -467,7 +467,6 @@ static void exynos_bus_shutdown(struct platform_device *pdev) devfreq_suspend_device(bus->devfreq); } -#ifdef CONFIG_PM_SLEEP static int exynos_bus_resume(struct device *dev) { struct exynos_bus *bus = dev_get_drvdata(dev); @@ -495,11 +494,9 @@ static int exynos_bus_suspend(struct device *dev) return 0; } -#endif -static const struct dev_pm_ops exynos_bus_pm = { - SET_SYSTEM_SLEEP_PM_OPS(exynos_bus_suspend, exynos_bus_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(exynos_bus_pm, + exynos_bus_suspend, exynos_bus_resume); static const struct of_device_id exynos_bus_of_match[] = { { .compatible = "samsung,exynos-bus", }, @@ -512,7 +509,7 @@ static struct platform_driver exynos_bus_platdrv = { .shutdown = exynos_bus_shutdown, .driver = { .name = "exynos-bus", - .pm = &exynos_bus_pm, + .pm = pm_sleep_ptr(&exynos_bus_pm), .of_match_table = exynos_bus_of_match, }, };