From 4622ba923e55e12cb76081c8865b01fcb383a9d8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 5 Jun 2023 15:47:13 +0000
Subject: [PATCH 01/12] intel_idle: refactor state->enter manipulation into its own function

Since the 6.4 kernel, the logic for updating a state's enter method
based on "environmental conditions" (command line options, CPU
side-channel workarounds, etc.) has gotten pretty complex.

This patch refactors this into a separate, small, self-contained
function (no behavior changes) for improved readability and to make
future changes to this logic easier to do and understand.

Signed-off-by: Arjan van de Ven
Signed-off-by: Rafael J. Wysocki
---
 drivers/idle/intel_idle.c | 50 ++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index aa2d19db2b1d..c351b21c0875 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1839,6 +1839,32 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 	return true;
 }
 
+static void state_update_enter_method(struct cpuidle_state *state, int cstate)
+{
+	if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
+		/*
+		 * Combining XSTATE with IBRS or IRQ_ENABLE flags
+		 * is not currently supported by this driver.
+		 */
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
+		state->enter = intel_idle_xstate;
+	} else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
+		   state->flags & CPUIDLE_FLAG_IBRS) {
+		/*
+		 * IBRS mitigation requires that C-states are entered
+		 * with interrupts disabled.
+		 */
+		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
+		state->enter = intel_idle_ibrs;
+	} else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
+		state->enter = intel_idle_irq;
+	} else if (force_irq_on) {
+		pr_info("forced intel_idle_irq for state %d\n", cstate);
+		state->enter = intel_idle_irq;
+	}
+}
+
 static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 {
 	int cstate;
@@ -1894,28 +1920,8 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 		drv->states[drv->state_count] = cpuidle_state_table[cstate];
 		state = &drv->states[drv->state_count];
 
-		if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
-			/*
-			 * Combining with XSTATE with IBRS or IRQ_ENABLE flags
-			 * is not currently supported but this driver.
-			 */
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
-			state->enter = intel_idle_xstate;
-		} else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
-			   state->flags & CPUIDLE_FLAG_IBRS) {
-			/*
-			 * IBRS mitigation requires that C-states are entered
-			 * with interrupts disabled.
-			 */
-			WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
-			state->enter = intel_idle_ibrs;
-		} else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
-			state->enter = intel_idle_irq;
-		} else if (force_irq_on) {
-			pr_info("forced intel_idle_irq for state %d\n", cstate);
-			state->enter = intel_idle_irq;
-		}
+		state_update_enter_method(state, cstate);
+
 		if ((disabled_states_mask & BIT(drv->state_count)) ||
 		    ((icpu->use_acpi || force_use_acpi) &&
From 7826c069c8765969cfb7a364f8e0e663fc132b10 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 5 Jun 2023 15:47:14 +0000
Subject: [PATCH 02/12] intel_idle: clean up the (new) state_update_enter_method function

Now that the logic for state_update_enter_method() is in its own
function, the long if ... else if ... else if ... else if chain can be
simplified by just returning from the function at the various places.
This does not change functionality, but it makes the logic much simpler
to read or modify later.

Signed-off-by: Arjan van de Ven
Signed-off-by: Rafael J. Wysocki
---
 drivers/idle/intel_idle.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index c351b21c0875..256c2d42e350 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1849,7 +1849,10 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
 		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
 		state->enter = intel_idle_xstate;
-	} else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
+		return;
+	}
+
+	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
 		   state->flags & CPUIDLE_FLAG_IBRS) {
 		/*
 		 * IBRS mitigation requires that C-states are entered
@@ -1857,9 +1860,15 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 		 */
 		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
 		state->enter = intel_idle_ibrs;
-	} else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
+		return;
+	}
+
+	if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
 		state->enter = intel_idle_irq;
-	} else if (force_irq_on) {
+		return;
+	}
+
+	if (force_irq_on) {
 		pr_info("forced intel_idle_irq for state %d\n", cstate);
 		state->enter = intel_idle_irq;
 	}
From b4a11fa3331e163e177e76098fe1d8b12b87cf6b Mon Sep 17 00:00:00 2001
From: Wyes Karny
Date: Mon, 29 May 2023 14:25:51 +0000
Subject: [PATCH 03/12] cpufreq: Fail driver register if it has adjust_perf without fast_switch

If the fast_switch_possible flag is set by the scaling driver, the
governor is free to select the fast_switch function even if adjust_perf
is set. Some scaling drivers that use adjust_perf don't set fast_switch,
thinking that the governor would never fall back to fast_switch. But the
governor can fall back to fast_switch even at runtime if frequency
invariance is disabled for some reason. This could crash the kernel if
the driver didn't set the fast_switch function pointer.

Therefore, fail driver registration if it has adjust_perf without
fast_switch.

Suggested-by: Rafael J. Wysocki
Suggested-by: Viresh Kumar
Signed-off-by: Wyes Karny
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cpufreq.c | 3 ++-
 include/linux/cpufreq.h   | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6b52ebe5a890..50bbc969ffe5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2828,7 +2828,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	    (driver_data->setpolicy && (driver_data->target_index ||
 		    driver_data->target)) ||
 	     (!driver_data->get_intermediate != !driver_data->target_intermediate) ||
-	     (!driver_data->online != !driver_data->offline))
+	     (!driver_data->online != !driver_data->offline) ||
+	     (driver_data->adjust_perf && !driver_data->fast_switch))
 		return -EINVAL;
 
 	pr_debug("trying to register driver %s\n", driver_data->name);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 26e2eb399484..172ff51c1b2a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -340,7 +340,10 @@ struct cpufreq_driver {
 	/*
 	 * ->fast_switch() replacement for drivers that use an internal
 	 * representation of performance levels and can pass hints other than
-	 * the target performance level to the hardware.
+	 * the target performance level to the hardware. This can only be set
+	 * if ->fast_switch is set too, because in those cases (under specific
+	 * conditions) scale invariance can be disabled, which causes the
+	 * schedutil governor to fall back to the latter.
 	 */
 	void (*adjust_perf)(unsigned int cpu,
 			    unsigned long min_perf,
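A note on the check above: the sketch below (hypothetical driver, all
names invented for illustration) shows the registration pattern that
patch 03 now rejects. A driver implementing ->adjust_perf while leaving
->fast_switch NULL used to register fine and could crash once schedutil
fell back to fast_switch; with the check it fails up front with -EINVAL.
Only the adjust_perf/fast_switch pairing is the point here; ->init and
->verify are stubbed to satisfy the other registration checks.

/* Hypothetical driver, for illustration only. */
static int example_cpu_init(struct cpufreq_policy *policy)
{
	policy->fast_switch_possible = true;
	return 0;
}

static void example_adjust_perf(unsigned int cpu, unsigned long min_perf,
				unsigned long target_perf, unsigned long capacity)
{
	/* would program a hardware performance hint here */
}

static struct cpufreq_driver example_driver = {
	.name		= "example",
	.init		= example_cpu_init,
	.verify		= cpufreq_generic_frequency_table_verify,
	.adjust_perf	= example_adjust_perf,
	/* .fast_switch deliberately left unset */
};

/* cpufreq_register_driver(&example_driver) now returns -EINVAL. */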
From 2f3d08f074b02aa449de27238fda72496c789034 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Mon, 5 Jun 2023 15:47:15 +0000
Subject: [PATCH 04/12] intel_idle: Add support for using intel_idle in a VM guest using just hlt

In a typical VM guest, the mwait instruction is not available, leaving
only the 'hlt' instruction (which causes a VMEXIT to the host).

So for this common case, intel_idle will detect the lack of mwait and
fail to initialize (after which another idle method steps in and just
uses hlt always).

Other (non-common) cases exist; the table below shows the before/after
for these:

+------------+--------------------------+-------------------------+
| Hypervisor | Idle method before patch | Idle method after patch |
| exposes    |                          |                         |
+============+==========================+=========================+
| nothing    | default_idle fallback    | intel_idle VM table     |
| (common)   | (straight "hlt")         |                         |
+------------+--------------------------+-------------------------+
| mwait      | intel_idle mwait table   | intel_idle mwait table  |
+------------+--------------------------+-------------------------+
| ACPI       | ACPI C1 state ("hlt")    | intel_idle VM table     |
+------------+--------------------------+-------------------------+

This is only applicable to CPUs known by intel_idle.

For the bare metal case, unknown CPU models will use the ACPI tables
(when available) to get estimates for latency and break even point for
longer idle states. In guests, the common case is that ACPI tables are
not available, but even when they are available, they can't and don't
provide the latency information for the longer (mwait based) states.

For this scenario (unknown CPU model), the default_idle mode (no ACPI)
or ACPI C1 (ACPI available) will be used.

By providing the capability to do this with the intel_idle driver, we
can do better than the fallback or ACPI table methods. While this
current change only gets us to the existing behavior, later patches in
this series will add new capabilities such as optimized TLB flushing.

In order to do this, a simplified version of the initialization
function for VM guests is created, and this will be called if the CPU
is recognized, but mwait is not supported, and we're in a VM guest.

One thing to note is that the max latency (and break-even point) of
this C1 state is higher than for a typical bare metal C1 state, because
hlt causes a VMEXIT, and the cost of VMEXIT + hypervisor overhead +
VMENTER is typically on the order of up to 5 microseconds... even if
the hypervisor does not actually go into a hardware power-saving state.

Signed-off-by: Arjan van de Ven
[ rjw: Dropped redundant checks from should_verify_mwait() ]
Signed-off-by: Rafael J. Wysocki
---
 drivers/idle/intel_idle.c | 117 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 116 insertions(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 256c2d42e350..a80e1f520293 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -199,6 +199,43 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
 	return __intel_idle(dev, drv, index);
 }
 
+static __always_inline int __intel_idle_hlt(struct cpuidle_device *dev,
+					    struct cpuidle_driver *drv, int index)
+{
+	raw_safe_halt();
+	raw_local_irq_disable();
+	return index;
+}
+
+/**
+ * intel_idle_hlt - Ask the processor to enter the given idle state using hlt.
+ * @dev: cpuidle device of the target CPU.
+ * @drv: cpuidle driver (assumed to point to intel_idle_driver).
+ * @index: Target idle state index.
+ *
+ * Use the HLT instruction to notify the processor that the CPU represented by
+ * @dev is idle and it can try to enter the idle state corresponding to @index.
+ *
+ * Must be called under local_irq_disable().
+ */
+static __cpuidle int intel_idle_hlt(struct cpuidle_device *dev,
+				    struct cpuidle_driver *drv, int index)
+{
+	return __intel_idle_hlt(dev, drv, index);
+}
+
+static __cpuidle int intel_idle_hlt_irq_on(struct cpuidle_device *dev,
+					   struct cpuidle_driver *drv, int index)
+{
+	int ret;
+
+	raw_local_irq_enable();
+	ret = __intel_idle_hlt(dev, drv, index);
+	raw_local_irq_disable();
+
+	return ret;
+}
+
 /**
  * intel_idle_s2idle - Ask the processor to enter the given idle state.
  * @dev: cpuidle device of the target CPU.
@@ -1242,6 +1279,18 @@ static struct cpuidle_state snr_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+static struct cpuidle_state vmguest_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "HLT",
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
+		.exit_latency = 5,
+		.target_residency = 10,
+		.enter = &intel_idle_hlt, },
+	{
+		.enter = NULL }
+};
+
 static const struct idle_cpu idle_cpu_nehalem __initconst = {
 	.state_table = nehalem_cstates,
 	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
@@ -1841,6 +1890,16 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 
 static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 {
+	if (state->enter == intel_idle_hlt) {
+		if (force_irq_on) {
+			pr_info("forced intel_idle_irq for state %d\n", cstate);
+			state->enter = intel_idle_hlt_irq_on;
+		}
+		return;
+	}
+	if (state->enter == intel_idle_hlt_irq_on)
+		return; /* no update scenarios */
+
 	if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
 		/*
 		 * Combining XSTATE with IBRS or IRQ_ENABLE flags
@@ -1874,6 +1933,21 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 	}
 }
 
+/*
+ * For mwait based states, we want to verify the cpuid data to see if the state
+ * is actually supported by this specific CPU.
+ * For non-mwait based states, this check should be skipped.
+ */
+static bool should_verify_mwait(struct cpuidle_state *state)
+{
+	if (state->enter == intel_idle_hlt)
+		return false;
+	if (state->enter == intel_idle_hlt_irq_on)
+		return false;
+
+	return true;
+}
+
 static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 {
 	int cstate;
@@ -1922,7 +1996,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 		}
 
 		mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
-		if (!intel_idle_verify_cstate(mwait_hint))
+		if (should_verify_mwait(&cpuidle_state_table[cstate]) && !intel_idle_verify_cstate(mwait_hint))
 			continue;
 
 		/* Structure copy. */
@@ -2056,6 +2130,45 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
 		cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
 }
 
+static int __init intel_idle_vminit(const struct x86_cpu_id *id)
+{
+	int retval;
+
+	cpuidle_state_table = vmguest_cstates;
+
+	icpu = (const struct idle_cpu *)id->driver_data;
+
+	pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
+		 boot_cpu_data.x86_model);
+
+	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+	if (!intel_idle_cpuidle_devices)
+		return -ENOMEM;
+
+	intel_idle_cpuidle_driver_init(&intel_idle_driver);
+
+	retval = cpuidle_register_driver(&intel_idle_driver);
+	if (retval) {
+		struct cpuidle_driver *drv = cpuidle_get_driver();
+		printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
+		       drv ? drv->name : "none");
+		goto init_driver_fail;
+	}
+
+	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
+				   intel_idle_cpu_online, NULL);
+	if (retval < 0)
+		goto hp_setup_fail;
+
+	return 0;
+hp_setup_fail:
+	intel_idle_cpuidle_devices_uninit();
+	cpuidle_unregister_driver(&intel_idle_driver);
+init_driver_fail:
+	free_percpu(intel_idle_cpuidle_devices);
+	return retval;
+}
+
 static int __init intel_idle_init(void)
 {
 	const struct x86_cpu_id *id;
@@ -2074,6 +2187,8 @@ static int __init intel_idle_init(void)
 	id = x86_match_cpu(intel_idle_ids);
 	if (id) {
 		if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
+			if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+				return intel_idle_vminit(id);
 			pr_debug("Please enable MWAIT in BIOS SETUP\n");
 			return -ENODEV;
 		}
From 217e67784eab30cd0704fab4109647ea68a4d850 Mon Sep 17 00:00:00 2001
From: Wyes Karny
Date: Tue, 30 May 2023 13:13:48 +0000
Subject: [PATCH 05/12] cpufreq: amd-pstate: Write CPPC enable bit per-socket

Currently, amd_pstate sets the CPPC enable bit in MSR_AMD_CPPC_ENABLE
only for the CPU where module_init happened. But MSR_AMD_CPPC_ENABLE is
per-socket, so this causes the CPPC enable bit to be set for only one
socket on servers with more than one physical package. To fix this,
write MSR_AMD_CPPC_ENABLE per socket.

Also, handle duplicate calls for cppc_enable, because it is called from
per-policy/per-core callbacks and can result in duplicate MSR writes.

Before the fix:

amd@amd:~$ sudo rdmsr -a 0xc00102b1 | uniq --count
    192 0
    192 1

After the fix:

amd@amd:~$ sudo rdmsr -a 0xc00102b1 | uniq --count
    384 1

Suggested-by: Gautham R. Shenoy
Signed-off-by: Wyes Karny
Acked-by: Huang Rui
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/amd-pstate.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index ddd346a239e0..50722bfbb34a 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -63,6 +63,7 @@
 static struct cpufreq_driver *current_pstate_driver;
 static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
 static int cppc_state = AMD_PSTATE_DISABLE;
+static bool cppc_enabled;
 
 /*
  * AMD Energy Preference Performance (EPP)
@@ -228,7 +229,28 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
 
 static inline int pstate_enable(bool enable)
 {
-	return wrmsrl_safe(MSR_AMD_CPPC_ENABLE, enable);
+	int ret, cpu;
+	unsigned long logical_proc_id_mask = 0;
+
+	if (enable == cppc_enabled)
+		return 0;
+
+	for_each_present_cpu(cpu) {
+		unsigned long logical_id = topology_logical_die_id(cpu);
+
+		if (test_bit(logical_id, &logical_proc_id_mask))
+			continue;
+
+		set_bit(logical_id, &logical_proc_id_mask);
+
+		ret = wrmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_ENABLE,
+					 enable);
+		if (ret)
+			return ret;
+	}
+
+	cppc_enabled = enable;
+	return 0;
 }
 
 static int cppc_enable(bool enable)
@@ -236,6 +258,9 @@ static int cppc_enable(bool enable)
 	int cpu, ret = 0;
 	struct cppc_perf_ctrls perf_ctrls;
 
+	if (enable == cppc_enabled)
+		return 0;
+
 	for_each_present_cpu(cpu) {
 		ret = cppc_set_enable(cpu, enable);
 		if (ret)
@@ -251,6 +276,7 @@ static int cppc_enable(bool enable)
 		}
 	}
 
+	cppc_enabled = enable;
 	return ret;
 }
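The dedup pattern in pstate_enable() above generalizes: iterate over
all CPUs, but act only once per die, remembering visited dies in a
bitmask. A minimal standalone sketch of the same idea follows, written
as plain userspace C with an assumed, hard-coded topology, purely for
illustration.

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 8

/* Assumed topology for the example: 4 CPUs per die, 2 dies. */
static int die_of_cpu(int cpu)
{
	return cpu / 4;
}

static void write_enable_once_per_die(bool enable)
{
	unsigned long seen = 0;	/* bit N set => die N already written */
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		int die = die_of_cpu(cpu);

		if (seen & (1UL << die))
			continue;	/* this die was already handled */
		seen |= 1UL << die;
		printf("cpu %d: write CPPC enable=%d for die %d\n",
		       cpu, enable, die);
	}
}

int main(void)
{
	write_enable_once_per_die(true);
	return 0;
}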
From f4aad639302a07454dcb23b408dcadf8a9efb031 Mon Sep 17 00:00:00 2001
From: Wyes Karny
Date: Mon, 12 Jun 2023 11:36:10 +0000
Subject: [PATCH 06/12] cpufreq: amd-pstate: Make amd-pstate EPP driver name hyphenated

The amd-pstate passive-mode driver name is hyphenated, so make the
amd-pstate active-mode driver consistent with it by renaming
"amd_pstate_epp" to "amd-pstate-epp".

Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors")
Cc: All applicable
Reviewed-by: Gautham R. Shenoy
Signed-off-by: Wyes Karny
Acked-by: Huang Rui
Reviewed-by: Perry Yuan
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/amd-pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 50722bfbb34a..d8269994322e 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1382,7 +1382,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
 	.online		= amd_pstate_epp_cpu_online,
 	.suspend	= amd_pstate_epp_suspend,
 	.resume		= amd_pstate_epp_resume,
-	.name		= "amd_pstate_epp",
+	.name		= "amd-pstate-epp",
 	.attr		= amd_pstate_epp_attr,
 };
From a4ba10bf6855bf9381fe2365ec9c3af84c1fa7db Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Mon, 12 Jun 2023 09:26:48 -0400
Subject: [PATCH 07/12] cpufreq: amd-pstate: Set default governor to schedutil

The Kconfig currently defaults the governor to schedutil on x86_64 only
when intel-pstate and SMP have been selected. If the kernel is built
only with amd-pstate, the default governor should also be schedutil.

Signed-off-by: Mario Limonciello
Reviewed-by: Leo Li
Acked-by: Huang Rui
Tested-by: Perry Yuan
Acked-by: Viresh Kumar
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 2c839bd2b051..a1c51abddbc5 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -38,7 +38,7 @@ choice
 	prompt "Default CPUFreq governor"
 	default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1110_CPUFREQ
 	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if ARM64 || ARM
-	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if X86_INTEL_PSTATE && SMP
+	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if (X86_INTEL_PSTATE || X86_AMD_PSTATE) && SMP
 	default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 	help
 	  This option sets which CPUFreq governor shall be loaded at
From 965262ef71c475857d0984a5eee57694b207a113 Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Tue, 20 Jun 2023 12:24:31 -0500
Subject: [PATCH 08/12] ACPI: CPPC: Add definition for undefined FADT preferred PM profile value

In the event that a new preferred PM profile value is introduced, it's
best for the code to be able to defensively guard against it, so that
the wrong settings don't get applied on a new system that uses this
profile while running an ancient kernel.

Acked-by: Huang Rui
Suggested-by: Gautham Ranjal Shenoy
Link: https://uefi.org/htmlspecs/ACPI_Spec_6_4_html/05_ACPI_Software_Programming_Model/ACPI_Software_Programming_Model.html#fixed-acpi-description-table-fadt
Signed-off-by: Mario Limonciello
Reviewed-by: Perry Yuan
Signed-off-by: Rafael J. Wysocki
---
 include/acpi/actbl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h
index e5dfb6f4de52..451f6276da49 100644
--- a/include/acpi/actbl.h
+++ b/include/acpi/actbl.h
@@ -307,7 +307,8 @@ enum acpi_preferred_pm_profiles {
 	PM_SOHO_SERVER = 5,
 	PM_APPLIANCE_PC = 6,
 	PM_PERFORMANCE_SERVER = 7,
-	PM_TABLET = 8
+	PM_TABLET = 8,
+	NR_PM_PROFILES = 9
 };
 
 /* Values for sleep_status and sleep_control registers (V5+ FADT) */
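The point of NR_PM_PROFILES is to let consumers range-check profile
values newer than the kernel. A minimal sketch of such a defensive
check (hypothetical helper; the next patch adds the real amd-pstate
equivalent):

/* Treat values the kernel does not know about as "not identified". */
static bool pm_profile_is_known(unsigned int profile)
{
	return profile != PM_UNSPECIFIED && profile < NR_PM_PROFILES;
}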
From 32f80b9adfdb43f8af248596724f59dde938a190 Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Tue, 20 Jun 2023 12:24:32 -0500
Subject: [PATCH 09/12] cpufreq: amd-pstate: Set a fallback policy based on preferred_profile

If a user's configuration doesn't explicitly specify the cpufreq
scaling governor, then the code currently explicitly falls back to
'powersave'. This default is fine for notebooks and desktops, but
servers and undefined machines should default to 'performance'.

Look at the 'preferred_profile' field from the FADT to set this policy
accordingly.

Link: https://uefi.org/htmlspecs/ACPI_Spec_6_4_html/05_ACPI_Software_Programming_Model/ACPI_Software_Programming_Model.html#fixed-acpi-description-table-fadt
Acked-by: Huang Rui
Suggested-by: Wyes Karny
Reviewed-by: Gautham R. Shenoy
Signed-off-by: Mario Limonciello
Reviewed-by: Perry Yuan
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/amd-pstate.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index d8269994322e..3546d7db614d 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1071,6 +1071,26 @@ static const struct attribute_group amd_pstate_global_attr_group = {
 	.attrs = pstate_global_attributes,
 };
 
+static bool amd_pstate_acpi_pm_profile_server(void)
+{
+	switch (acpi_gbl_FADT.preferred_profile) {
+	case PM_ENTERPRISE_SERVER:
+	case PM_SOHO_SERVER:
+	case PM_PERFORMANCE_SERVER:
+		return true;
+	}
+	return false;
+}
+
+static bool amd_pstate_acpi_pm_profile_undefined(void)
+{
+	if (acpi_gbl_FADT.preferred_profile == PM_UNSPECIFIED)
+		return true;
+	if (acpi_gbl_FADT.preferred_profile >= NR_PM_PROFILES)
+		return true;
+	return false;
+}
+
 static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 {
 	int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
@@ -1128,10 +1148,14 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 	policy->max = policy->cpuinfo.max_freq;
 
 	/*
-	 * Set the policy to powersave to provide a valid fallback value in case
+	 * Set the policy to provide a valid fallback value in case
 	 * the default cpufreq governor is neither powersave nor performance.
 	 */
-	policy->policy = CPUFREQ_POLICY_POWERSAVE;
+	if (amd_pstate_acpi_pm_profile_server() ||
+	    amd_pstate_acpi_pm_profile_undefined())
+		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
+	else
+		policy->policy = CPUFREQ_POLICY_POWERSAVE;
 
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
From c88ad30e3f861c7be4e3b4995554e2b0754059b7 Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Tue, 20 Jun 2023 12:24:33 -0500
Subject: [PATCH 10/12] cpufreq: amd-pstate: Add a kernel config option to set default mode

Users are having more success with amd-pstate since the introduction of
EPP and Guided modes. To expose the driver to more users by default,
introduce a kernel configuration option for setting the default mode.

Users can use an integer to map out which default mode they want to use
in lieu of a kernel command line option.

This will default to EPP, but only if:
1) The CPU supports an MSR.
2) The system profile is identified, and
3) the system profile is identified as a non-server by the FADT.

Link: https://gitlab.freedesktop.org/hadess/power-profiles-daemon/-/merge_requests/121
Acked-by: Huang Rui
Reviewed-by: Gautham R. Shenoy
Co-developed-by: Perry Yuan
Signed-off-by: Perry Yuan
Signed-off-by: Mario Limonciello
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/Kconfig.x86  | 17 +++++++++
 drivers/cpufreq/amd-pstate.c | 73 ++++++++++++++++++++++++------------
 include/linux/amd-pstate.h   |  4 +-
 3 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 00476e94db90..438c9e75a04d 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -51,6 +51,23 @@ config X86_AMD_PSTATE
 
 	  If in doubt, say N.
 
+config X86_AMD_PSTATE_DEFAULT_MODE
+	int "AMD Processor P-State default mode"
+	depends on X86_AMD_PSTATE
+	default 3 if X86_AMD_PSTATE
+	range 1 4
+	help
+	  Select the default mode the amd-pstate driver will use on
+	  supported hardware.
+	  The value set has the following meanings:
+		1 -> Disabled
+		2 -> Passive
+		3 -> Active (EPP)
+		4 -> Guided
+
+	  For details, take a look at: .
+
 config X86_AMD_PSTATE_UT
 	tristate "selftest for AMD Processor P-State driver"
 	depends on X86 && ACPI_PROCESSOR
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 3546d7db614d..81fba0dcbee9 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -62,7 +62,7 @@
 static struct cpufreq_driver *current_pstate_driver;
 static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
-static int cppc_state = AMD_PSTATE_DISABLE;
+static int cppc_state = AMD_PSTATE_UNDEFINED;
 static bool cppc_enabled;
 
 /*
@@ -1410,6 +1410,25 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
 	.attr		= amd_pstate_epp_attr,
 };
 
+static int __init amd_pstate_set_driver(int mode_idx)
+{
+	if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
+		cppc_state = mode_idx;
+		if (cppc_state == AMD_PSTATE_DISABLE)
+			pr_info("driver is explicitly disabled\n");
+
+		if (cppc_state == AMD_PSTATE_ACTIVE)
+			current_pstate_driver = &amd_pstate_epp_driver;
+
+		if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
+			current_pstate_driver = &amd_pstate_driver;
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static int __init amd_pstate_init(void)
 {
 	struct device *dev_root;
@@ -1417,15 +1436,6 @@ static int __init amd_pstate_init(void)
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
 		return -ENODEV;
 
-	/*
-	 * by default the pstate driver is disabled to load
-	 * enable the amd_pstate passive mode driver explicitly
-	 * with amd_pstate=passive or other modes in kernel command line
-	 */
-	if (cppc_state == AMD_PSTATE_DISABLE) {
-		pr_info("driver load is disabled, boot with specific mode to enable this\n");
-		return -ENODEV;
-	}
 
 	if (!acpi_cpc_valid()) {
 		pr_warn_once("the _CPC object is not present in SBIOS or ACPI disabled\n");
@@ -1436,6 +1446,33 @@ static int __init amd_pstate_init(void)
 	if (cpufreq_get_current_driver())
 		return -EEXIST;
 
+	switch (cppc_state) {
+	case AMD_PSTATE_UNDEFINED:
+		/* Disable on the following configs by default:
+		 * 1. Undefined platforms
+		 * 2. Server platforms
+		 * 3. Shared memory designs
+		 */
+		if (amd_pstate_acpi_pm_profile_undefined() ||
+		    amd_pstate_acpi_pm_profile_server() ||
+		    !boot_cpu_has(X86_FEATURE_CPPC)) {
+			pr_info("driver load is disabled, boot with specific mode to enable this\n");
+			return -ENODEV;
+		}
+		ret = amd_pstate_set_driver(CONFIG_X86_AMD_PSTATE_DEFAULT_MODE);
+		if (ret)
+			return ret;
+		break;
+	case AMD_PSTATE_DISABLE:
+		return -ENODEV;
+	case AMD_PSTATE_PASSIVE:
+	case AMD_PSTATE_ACTIVE:
+	case AMD_PSTATE_GUIDED:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	/* capability check */
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		pr_debug("AMD CPPC MSR based functionality is supported\n");
@@ -1488,21 +1525,7 @@ static int __init amd_pstate_param(char *str)
 	size = strlen(str);
 	mode_idx = get_mode_idx_from_str(str, size);
 
-	if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
-		cppc_state = mode_idx;
-		if (cppc_state == AMD_PSTATE_DISABLE)
-			pr_info("driver is explicitly disabled\n");
-
-		if (cppc_state == AMD_PSTATE_ACTIVE)
-			current_pstate_driver = &amd_pstate_epp_driver;
-
-		if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
-			current_pstate_driver = &amd_pstate_driver;
-
-		return 0;
-	}
-
-	return -EINVAL;
+	return amd_pstate_set_driver(mode_idx);
 }
 
 early_param("amd_pstate", amd_pstate_param);
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index c10ebf8c42e6..446394f84606 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -94,7 +94,8 @@ struct amd_cpudata {
  * enum amd_pstate_mode - driver working mode of amd pstate
  */
 enum amd_pstate_mode {
-	AMD_PSTATE_DISABLE = 0,
+	AMD_PSTATE_UNDEFINED = 0,
+	AMD_PSTATE_DISABLE,
 	AMD_PSTATE_PASSIVE,
 	AMD_PSTATE_ACTIVE,
 	AMD_PSTATE_GUIDED,
@@ -102,6 +103,7 @@ enum amd_pstate_mode {
 };
 
 static const char * const amd_pstate_mode_string[] = {
+	[AMD_PSTATE_UNDEFINED] = "undefined",
 	[AMD_PSTATE_DISABLE] = "disable",
 	[AMD_PSTATE_PASSIVE] = "passive",
 	[AMD_PSTATE_ACTIVE] = "active",
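For reference, the Kconfig value is passed straight into
amd_pstate_set_driver() and therefore into the enum above, so the
mapping between the build-time default and the boot parameter is (a
summary of the diff, not new behavior):

/*
 * CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=1 -> AMD_PSTATE_DISABLE (amd_pstate=disable)
 * CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=2 -> AMD_PSTATE_PASSIVE (amd_pstate=passive)
 * CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=3 -> AMD_PSTATE_ACTIVE  (amd_pstate=active, the default)
 * CONFIG_X86_AMD_PSTATE_DEFAULT_MODE=4 -> AMD_PSTATE_GUIDED  (amd_pstate=guided)
 */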
From 03f44ffb3d5be2fceda375d92c70ab6de4df7081 Mon Sep 17 00:00:00 2001
From: Tero Kristo
Date: Wed, 21 Jun 2023 09:58:39 +0300
Subject: [PATCH 11/12] cpufreq: intel_pstate: Fix energy_performance_preference for passive

If the intel_pstate driver is set to passive mode, then writing the
same value to the energy_performance_preference sysfs attribute twice
fails. This is caused by using the wrong return value (the index of the
matched energy_perf_string) instead of the length of the passed-in
parameter.

Fix this by forcing the internal return value to zero when the same
preference is passed in by the user. This issue is not present when the
driver is in active mode.

Fixes: f6ebbcf08f37 ("cpufreq: intel_pstate: Implement passive mode with HWP enabled")
Reported-by: Niklas Neronin
Signed-off-by: Tero Kristo
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 2548ec92faa2..f29182512b98 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -824,6 +824,8 @@ static ssize_t store_energy_performance_preference(
 			err = cpufreq_start_governor(policy);
 			if (!ret)
 				ret = err;
+		} else {
+			ret = 0;
 		}
 	}
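The underlying rule the fix restores: a sysfs store callback must
return either a negative errno or the number of bytes consumed
(normally the full count). Any other positive value looks like a short
write to userspace, which is why writing the same preference twice
failed. A sketch with a hypothetical attribute (names invented for
illustration):

static const char * const example_strings[] = { "low", "high" };
static int example_setting;

static ssize_t example_store(struct kobject *kobj, struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	int idx = sysfs_match_string(example_strings, buf);

	if (idx < 0)
		return idx;	/* propagate -EINVAL */

	example_setting = idx;
	return count;		/* not idx: report all bytes consumed */
}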
From 0fac214bb75ee64d7b9e6521d53f8251a4d324ea Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Tue, 20 Jun 2023 18:03:54 +0000
Subject: [PATCH 12/12] intel_idle: Add a "Long HLT" C1 state for the VM guest mode

intel_idle will, for the bare metal case, usually have one or more deep
power states that have the CPUIDLE_FLAG_TLB_FLUSHED flag set. When a
state with this flag is selected by the cpuidle framework, it will also
flush the TLBs as part of entering this state. The benefit of doing
this is that the kernel does not need to wake the CPU out of this deep
power state just to flush the TLBs... for which the latency can be very
high due to the exit latency of deep power states.

In a VM guest currently, this benefit of avoiding the wakeup does not
exist, while the problem (long exit latency) is even more severe. Linux
will need to wake up a vCPU (causing the host to either come out of a
deep C-state, or the VMM to have to deschedule something else to
schedule the vCPU), which can take a very long time, adding a lot of
latency to TLB flush operations (including munmap and others).

To solve this, add a "Long HLT" C-state to the state table for the VM
guest case that has the CPUIDLE_FLAG_TLB_FLUSHED flag set. The result
is that for long idle periods (where the VMM is likely to do things
that cause large latency), the cpuidle framework will flush the TLBs
(and avoid the wakeups), while for short/quick idle durations the
existing behavior is retained.

Now, there is still only "hlt" available in the guest, but for long
idle the host can go to a deeper state (say C6). There is a reasonable
debate one can have about what to set for the exit_latency and
break-even point for this "Long HLT" state. The good news is that
intel_idle has these values available for the underlying CPU (even when
mwait is not exposed). The solution thus is to just use the latency and
break-even point of the deepest state from the bare metal CPU, under
the assumption that this is a pretty reasonable estimate of what the
VMM would do to cause latency.

Signed-off-by: Arjan van de Ven
Signed-off-by: Rafael J. Wysocki
---
 drivers/idle/intel_idle.c | 55 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index a80e1f520293..34201d7ef33e 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1287,6 +1287,13 @@ static struct cpuidle_state vmguest_cstates[] __initdata = {
 		.exit_latency = 5,
 		.target_residency = 10,
 		.enter = &intel_idle_hlt, },
+	{
+		.name = "C1L",
+		.desc = "Long HLT",
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 5,
+		.target_residency = 200,
+		.enter = &intel_idle_hlt, },
 	{
 		.enter = NULL }
 };
@@ -2130,6 +2137,45 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
 		cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
 }
 
+/*
+ * Match up the latency and break even point of the bare metal (cpu based)
+ * states with the deepest VM available state.
+ *
+ * We only want to do this for the deepest state, the one that has the
+ * TLB_FLUSHED flag set on it.
+ *
+ * All our short idle states are dominated by vmexit/vmenter latencies,
+ * not the underlying hardware latencies, so we keep our values for these.
+ */
+static void matchup_vm_state_with_baremetal(void)
+{
+	int cstate;
+
+	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
+		int matching_cstate;
+
+		if (intel_idle_max_cstate_reached(cstate))
+			break;
+
+		if (!cpuidle_state_table[cstate].enter)
+			break;
+
+		if (!(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_TLB_FLUSHED))
+			continue;
+
+		for (matching_cstate = 0; matching_cstate < CPUIDLE_STATE_MAX; ++matching_cstate) {
+			if (!icpu->state_table[matching_cstate].enter)
+				break;
+			if (icpu->state_table[matching_cstate].exit_latency > cpuidle_state_table[cstate].exit_latency) {
+				cpuidle_state_table[cstate].exit_latency = icpu->state_table[matching_cstate].exit_latency;
+				cpuidle_state_table[cstate].target_residency = icpu->state_table[matching_cstate].target_residency;
+			}
+		}
+
+	}
+}
+
+
 static int __init intel_idle_vminit(const struct x86_cpu_id *id)
 {
 	int retval;
@@ -2145,6 +2191,15 @@ static int __init intel_idle_vminit(const struct x86_cpu_id *id)
 	if (!intel_idle_cpuidle_devices)
 		return -ENOMEM;
 
+	/*
+	 * We don't know exactly what the host will do when we go idle, but as a
+	 * worst-case estimate we can assume that the exit latency of the deepest
+	 * host state will be hit for our deep (long duration) guest idle state.
+	 * The same logic applies to the break even point for the long duration
+	 * guest idle state. So let's copy these two properties from the table we
+	 * found for the host CPU type.
+	 */
+	matchup_vm_state_with_baremetal();
+
 	intel_idle_cpuidle_driver_init(&intel_idle_driver);
 
 	retval = cpuidle_register_driver(&intel_idle_driver);
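A worked example of the matchup, with assumed numbers purely for
illustration (not taken from the series):

/*
 * Suppose the matched bare-metal table's deepest state is a C6 with
 * exit_latency = 170 us and target_residency = 600 us. After
 * matchup_vm_state_with_baremetal() runs, the "Long HLT" state's
 * placeholder values (5 us / 200 us) become 170 us / 600 us, so the
 * governor selects it, and flushes the TLBs, only for long idle
 * periods; short idles keep using the plain C1 HLT state.
 */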