From e132b9b3bc7f19e9b158e42b323881d5dee5ecf3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 16 Mar 2016 12:14:00 -0400 Subject: [PATCH 01/14] cpuidle: menu: use high confidence factors only when considering polling The menu governor uses five different factors to pick the idle state: - the user configured latency_req - the time until the next timer (next_timer_us) - the typical sleep interval, as measured recently - an estimate of sleep time by dividing next_timer_us by an observed factor - a load corrected version of the above, divided again by load Only the first three items are known with enough confidence that we can use them to consider polling, instead of an actual CPU idle state, because the cost of being wrong about polling can be excessive power use. The latter two are used in the menu governor's main selection loop, and can result in choosing a shallower idle state when the system is expected to be busy again soon. This pushes a busy system in the "performance" direction of the performance<>power tradeoff, when choosing between idle states, but stays more strictly on the "power" state when deciding between polling and C1. Signed-off-by: Rik van Riel Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 42 ++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 27fc733cb5b9..00c5f891e352 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -196,7 +196,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); * of points is below a threshold. If it is... then use the * average of these 8 points as the estimated value. */ -static void get_typical_interval(struct menu_device *data) +static unsigned int get_typical_interval(struct menu_device *data) { int i, divisor; unsigned int max, thresh, avg; @@ -253,9 +253,7 @@ again: if (likely(variance <= U64_MAX/36)) { if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) || variance <= 400) { - if (data->next_timer_us > avg) - data->predicted_us = avg; - return; + return avg; } } @@ -269,7 +267,7 @@ again: * with sporadic activity with a bunch of short pauses. */ if ((divisor * 4) <= INTERVALS * 3) - return; + return UINT_MAX; thresh = max - 1; goto again; @@ -286,6 +284,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); int i; unsigned int interactivity_req; + unsigned int expected_interval; unsigned long nr_iowaiters, cpu_load; if (data->needs_update) { @@ -312,31 +311,38 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->correction_factor[data->bucket], RESOLUTION * DECAY); - get_typical_interval(data); - - /* - * Performance multiplier defines a minimum predicted idle - * duration / latency ratio. Adjust the latency limit if - * necessary. - */ - interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); - if (latency_req > interactivity_req) - latency_req = interactivity_req; + expected_interval = get_typical_interval(data); + expected_interval = min(expected_interval, data->next_timer_us); if (CPUIDLE_DRIVER_STATE_START > 0) { data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; /* * We want to default to C1 (hlt), not to busy polling - * unless the timer is happening really really soon. 
+ * unless the timer is happening really really soon, or + * C1's exit latency exceeds the user configured limit. */ - if (interactivity_req > 20 && + if (expected_interval > drv->states[CPUIDLE_DRIVER_STATE_START].target_residency && + latency_req > drv->states[CPUIDLE_DRIVER_STATE_START].exit_latency && !drv->states[CPUIDLE_DRIVER_STATE_START].disabled && - dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0) + !dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; } else { data->last_state_idx = CPUIDLE_DRIVER_STATE_START; } + /* + * Use the lowest expected idle interval to pick the idle state. + */ + data->predicted_us = min(data->predicted_us, expected_interval); + + /* + * Use the performance multiplier and the user-configurable + * latency_req to determine the maximum exit latency. + */ + interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); + if (latency_req > interactivity_req) + latency_req = interactivity_req; + /* * Find the idle state with the lowest power while satisfying * our constraints. From c75361c0b08a59d3c9863a5a673ae039d5118c35 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 11 Mar 2016 09:43:07 +0100 Subject: [PATCH 02/14] cpufreq: Make cpufreq_quick_get() safe to call The function, cpufreq_quick_get, accesses the global 'cpufreq_driver' and its fields without taking the associated lock, cpufreq_driver_lock. Without the locking, nothing guarantees that 'cpufreq_driver' remains consistent during the call. This patch fixes the issue by taking the lock before accessing the data structure. Signed-off-by: Richard Cochran Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 4c7825856eab..f870399f00b1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1401,9 +1401,17 @@ unsigned int cpufreq_quick_get(unsigned int cpu) { struct cpufreq_policy *policy; unsigned int ret_freq = 0; + unsigned long flags; - if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) - return cpufreq_driver->get(cpu); + read_lock_irqsave(&cpufreq_driver_lock, flags); + + if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) { + ret_freq = cpufreq_driver->get(cpu); + read_unlock_irqrestore(&cpufreq_driver_lock, flags); + return ret_freq; + } + + read_unlock_irqrestore(&cpufreq_driver_lock, flags); policy = cpufreq_cpu_get(cpu); if (policy) { From fdfdb2b1301670a69195ba1e5666df4a7f02eb46 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 18 Mar 2016 23:20:02 +0100 Subject: [PATCH 03/14] intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. 
However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 73 ++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cb5607495816..4b644526fd59 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -134,7 +134,7 @@ struct pstate_funcs { int (*get_min)(void); int (*get_turbo)(void); int (*get_scaling)(void); - void (*set)(struct cpudata*, int pstate); + u64 (*get_val)(struct cpudata*, int pstate); void (*get_vid)(struct cpudata *); int32_t (*get_target_pstate)(struct cpudata *); }; @@ -565,7 +565,7 @@ static int atom_get_turbo_pstate(void) return value & 0x7F; } -static void atom_set_pstate(struct cpudata *cpudata, int pstate) +static u64 atom_get_val(struct cpudata *cpudata, int pstate) { u64 val; int32_t vid_fp; @@ -585,9 +585,7 @@ static void atom_set_pstate(struct cpudata *cpudata, int pstate) if (pstate > cpudata->pstate.max_pstate) vid = cpudata->vid.turbo; - val |= vid; - - wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val); + return val | vid; } static int silvermont_get_scaling(void) @@ -711,7 +709,7 @@ static inline int core_get_scaling(void) return 100000; } -static void core_set_pstate(struct cpudata *cpudata, int pstate) +static u64 core_get_val(struct cpudata *cpudata, int pstate) { u64 val; @@ -719,7 +717,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate) if (limits->no_turbo && !limits->turbo_disabled) val |= (u64)1 << 32; - wrmsrl(MSR_IA32_PERF_CTL, val); + return val; } static int knl_get_turbo_pstate(void) @@ -750,7 +748,7 @@ static struct cpu_defaults core_params = { .get_min = core_get_min_pstate, .get_turbo = core_get_turbo_pstate, .get_scaling = core_get_scaling, - .set = core_set_pstate, + .get_val = core_get_val, .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -769,7 +767,7 @@ static struct cpu_defaults silvermont_params = { 
.get_max_physical = atom_get_max_pstate, .get_min = atom_get_min_pstate, .get_turbo = atom_get_turbo_pstate, - .set = atom_set_pstate, + .get_val = atom_get_val, .get_scaling = silvermont_get_scaling, .get_vid = atom_get_vid, .get_target_pstate = get_target_pstate_use_cpu_load, @@ -790,7 +788,7 @@ static struct cpu_defaults airmont_params = { .get_max_physical = atom_get_max_pstate, .get_min = atom_get_min_pstate, .get_turbo = atom_get_turbo_pstate, - .set = atom_set_pstate, + .get_val = atom_get_val, .get_scaling = airmont_get_scaling, .get_vid = atom_get_vid, .get_target_pstate = get_target_pstate_use_cpu_load, @@ -812,7 +810,7 @@ static struct cpu_defaults knl_params = { .get_min = core_get_min_pstate, .get_turbo = knl_get_turbo_pstate, .get_scaling = core_get_scaling, - .set = core_set_pstate, + .get_val = core_get_val, .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -839,25 +837,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); } -static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate, bool force) +static inline void intel_pstate_record_pstate(struct cpudata *cpu, int pstate) { - int max_perf, min_perf; - - if (force) { - update_turbo_state(); - - intel_pstate_get_min_max(cpu, &min_perf, &max_perf); - - pstate = clamp_t(int, pstate, min_perf, max_perf); - - if (pstate == cpu->pstate.current_pstate) - return; - } trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); - cpu->pstate.current_pstate = pstate; +} - pstate_funcs.set(cpu, pstate); +static void intel_pstate_set_min_pstate(struct cpudata *cpu) +{ + int pstate = cpu->pstate.min_pstate; + + intel_pstate_record_pstate(cpu, pstate); + /* + * Generally, there is no guarantee that this code will always run on + * the CPU being updated, so force the register update to run on the + * right CPU. 
+ */ + wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, + pstate_funcs.get_val(cpu, pstate)); } static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) @@ -870,7 +867,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) if (pstate_funcs.get_vid) pstate_funcs.get_vid(cpu); - intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false); + + intel_pstate_set_min_pstate(cpu); } static inline void intel_pstate_calc_busy(struct cpudata *cpu) @@ -997,6 +995,21 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy); } +static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) +{ + int max_perf, min_perf; + + update_turbo_state(); + + intel_pstate_get_min_max(cpu, &min_perf, &max_perf); + pstate = clamp_t(int, pstate, min_perf, max_perf); + if (pstate == cpu->pstate.current_pstate) + return; + + intel_pstate_record_pstate(cpu, pstate); + wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); +} + static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) { int from, target_pstate; @@ -1006,7 +1019,7 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) target_pstate = pstate_funcs.get_target_pstate(cpu); - intel_pstate_set_pstate(cpu, target_pstate, true); + intel_pstate_update_pstate(cpu, target_pstate); sample = &cpu->sample; trace_pstate_sample(fp_toint(sample->core_pct_busy), @@ -1180,7 +1193,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) if (hwp_active) return; - intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false); + intel_pstate_set_min_pstate(cpu); } static int intel_pstate_cpu_init(struct cpufreq_policy *policy) @@ -1255,7 +1268,7 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_min = funcs->get_min; pstate_funcs.get_turbo = funcs->get_turbo; pstate_funcs.get_scaling = funcs->get_scaling; - pstate_funcs.set = funcs->set; + pstate_funcs.get_val = funcs->get_val; pstate_funcs.get_vid = funcs->get_vid; pstate_funcs.get_target_pstate = funcs->get_target_pstate; From ed72662a84771fec1337dd8b187e070af6fd3890 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 18 Mar 2016 22:26:11 +0100 Subject: [PATCH 04/14] cpufreq: acpi-cpufreq: Clean up hot plug notifier callback This driver has two issues. First, it tries to fiddle with the hot plugged CPU's MSR on the UP_PREPARE event, at a time when the CPU is not yet online. Second, the driver sets the "boost-disable" bit for a CPU when going down, but does not clear the bit again if the CPU comes up again due to DOWN_FAILED. This patch fixes the issues by changing the driver to react to the ONLINE/DOWN_FAILED events instead of UP_PREPARE. As an added benefit, the driver also becomes symmetric with respect to the hot plug mechanism. Signed-off-by: Richard Cochran Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 59a7b380fbe2..e7801e0112bb 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -514,8 +514,10 @@ static int boost_notify(struct notifier_block *nb, unsigned long action, */ switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask); break; From 0c313cb207326f759a58f486214288411b25d4cf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 20 Mar 2016 01:33:35 +0100 Subject: [PATCH 05/14] cpuidle: menu: Fall back to polling if next timer event is near Commit a9ceb78bc75c (cpuidle,menu: use interactivity_req to disable polling) changed the behavior of the fallback state selection part of menu_select() so it looks at interactivity_req instead of data->next_timer_us when it makes its decision. That effectively caused polling to be used more often as fallback idle which led to significant increases of energy consumption in some cases. Commit e132b9b3bc7f (cpuidle: menu: use high confidence factors only when considering polling) changed that logic again to be more predictable, but that didn't help with the increased energy consumption problem. For this reason, go back to making decisions on which state to fall back to based on data->next_timer_us which is the time we know for sure something will happen rather than a prediction (which may be inaccurate and turns out to be so often enough to be problematic). However, take the target residency of the first proper idle state (C1) into account, so that state is not used as the fallback one if its target residency is greater than data->next_timer_us. Fixes: a9ceb78bc75c (cpuidle,menu: use interactivity_req to disable polling) Signed-off-by: Rafael J. Wysocki Reported-and-tested-by: Doug Smythies --- drivers/cpuidle/governors/menu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 00c5f891e352..03d38c291de6 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -315,17 +315,21 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) expected_interval = min(expected_interval, data->next_timer_us); if (CPUIDLE_DRIVER_STATE_START > 0) { - data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; + struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START]; + unsigned int polling_threshold; + /* * We want to default to C1 (hlt), not to busy polling * unless the timer is happening really really soon, or * C1's exit latency exceeds the user configured limit. 
*/ - if (expected_interval > drv->states[CPUIDLE_DRIVER_STATE_START].target_residency && - latency_req > drv->states[CPUIDLE_DRIVER_STATE_START].exit_latency && - !drv->states[CPUIDLE_DRIVER_STATE_START].disabled && + polling_threshold = max_t(unsigned int, 20, s->target_residency); + if (data->next_timer_us > polling_threshold && + latency_req > s->exit_latency && !s->disabled && !dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; + else + data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; } else { data->last_state_idx = CPUIDLE_DRIVER_STATE_START; } From 3e5963bc343b3fb4ca045e9d1c14cb9ce89234b8 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 21 Mar 2016 22:24:52 +0530 Subject: [PATCH 06/14] cpufreq: powernv: Define per_cpu chip pointer to optimize hot-path Commit 96c4726f01cd "cpufreq: powernv: Remove cpu_to_chip_id() from hot-path" introduced a 'core_to_chip_map' array to cache the chip-ids of all cores. Replace this with a per-CPU variable that stores the pointer to the chip-array. This removes the linear lookup and provides a neater and simpler solution. Signed-off-by: Michael Neuling Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/powernv-cpufreq.c | 50 +++++++++++-------------------- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 50bf12033bbc..a00bcc2cef09 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -44,7 +44,6 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; -static unsigned int *core_to_chip_map; static const char * const throttle_reason[] = { "No throttling", @@ -65,6 +64,7 @@ static struct chip { } *chips; static int nr_chips; +static DEFINE_PER_CPU(struct chip *, chip_info); /* * Note: The set of pstates consists of contiguous integers, the @@ -324,34 +324,31 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { + struct chip *chip; unsigned int cpu = smp_processor_id(); - unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; - int pmsr_pmax, i; + int pmsr_pmax; pmsr = get_pmspr(SPRN_PMSR); - - for (i = 0; i < nr_chips; i++) - if (chips[i].id == chip_id) - break; + chip = this_cpu_read(chip_info); /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - if (chips[i].throttled) + if (chip->throttled) goto next; - chips[i].throttled = true; + chip->throttled = true; if (pmsr_pmax < powernv_pstate_info.nominal) pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, + cpu, chip->id, pmsr_pmax, powernv_pstate_info.nominal); - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); - } else if (chips[i].throttled) { - chips[i].throttled = false; - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + } else if (chip->throttled) { + chip->throttled = false; + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); } @@ -558,47 +555,34 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; - cpumask_t cpu_mask; - 
int ret = -ENOMEM; - core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), - GFP_KERNEL); - if (!core_to_chip_map) - goto out; - - cpumask_copy(&cpu_mask, cpu_possible_mask); - for_each_cpu(cpu, &cpu_mask) { + for_each_possible_cpu(cpu) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } - core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; - cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - goto free_chip_map; + return -ENOMEM; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); + for_each_cpu(cpu, &chips[i].mask) + per_cpu(chip_info, cpu) = &chips[i]; } return 0; -free_chip_map: - kfree(core_to_chip_map); -out: - return ret; } static inline void clean_chip_info(void) { kfree(chips); - kfree(core_to_chip_map); } static inline void unregister_all_notifiers(void) From ac13b9967de7eeb7746f6533673e4dec94933bc9 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 22 Mar 2016 22:34:30 +0800 Subject: [PATCH 07/14] cpufreq: acpi-cpufreq: make Intel/AMD MSR access, io port access static These frequency register read/write operations' implementations for the given processor (Intel/AMD MSR access or I/O port access) are only used internally in acpi-cpufreq, so make them static. Signed-off-by: Jisheng Zhang Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index e7801e0112bb..fb5712141040 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -245,7 +245,7 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) } } -u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) +static u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) { u32 val, dummy; @@ -253,7 +253,7 @@ u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) return val; } -void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) +static void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) { u32 lo, hi; @@ -262,7 +262,7 @@ void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) wrmsr(MSR_IA32_PERF_CTL, lo, hi); } -u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) +static u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) { u32 val, dummy; @@ -270,12 +270,12 @@ u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) return val; } -void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val) +static void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val) { wrmsr(MSR_AMD_PERF_CTL, val, 0); } -u32 cpu_freq_read_io(struct acpi_pct_register *reg) +static u32 cpu_freq_read_io(struct acpi_pct_register *reg) { u32 val; @@ -283,7 +283,7 @@ u32 cpu_freq_read_io(struct acpi_pct_register *reg) return val; } -void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val) +static void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val) { acpi_os_write_port(reg->address, val, reg->bit_width); } From 1b0289848d5dcea74a6e5115d6c9892b0dbe9c8f Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Tue, 22 Mar 2016 18:57:09 +0530 Subject: [PATCH 08/14] cpufreq: powernv: Add sysfs attributes to show throttle stats Create sysfs attributes to export throttle 
information in /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat 2)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub-turbo_stat 3)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle 4)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap 5)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp 6)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault 7)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent 8)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset Detailed explanation of each attribute is added to Documentation/ABI/testing/sysfs-devices-system-cpu Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- .../ABI/testing/sysfs-devices-system-cpu | 69 +++++++++++++++++ drivers/cpufreq/powernv-cpufreq.c | 74 ++++++++++++++++++- 2 files changed, 141 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b683e8ee69ec..16501334b99f 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes - WriteBack: data is written only to the cache line and the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency + throttle stat attributes for the chip. The throttle stats of a cpu + is common across all the cpus belonging to a chip. Below are the + throttle attributes exported in the 'throttle_stats' directory: + + - turbo_stat : This file gives the total number of times the max + frequency is throttled to lower frequency in turbo (at and above + nominal frequency) range of frequencies. + + - sub_turbo_stat : This file gives the total number of times the + max frequency is throttled to lower frequency in sub-turbo(below + nominal frequency) range of frequencies. + + - unthrottle : This file gives the total number of times the max + frequency is unthrottled after being throttled. + + - powercap : This file gives the total number of times the max + frequency is throttled due to 'Power Capping'. + + - overtemp : This file gives the total number of times the max + frequency is throttled due to 'CPU Over Temperature'. + + - supply_fault : This file gives the total number of times the + max frequency is throttled due to 'Power Supply Failure'. + + - overcurrent : This file gives the total number of times the + max frequency is throttled due to 'Overcurrent'. 
+ + - occ_reset : This file gives the total number of times the max + frequency is throttled due to 'OCC Reset'. + + The sysfs attributes representing different throttle reasons like + powercap, overtemp, supply_fault, overcurrent and occ_reset map to + the reasons provided by OCC firmware for throttling the frequency. + +What: /sys/devices/system/cpu/cpufreq/policyX/throttle_stats + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/unthrottle + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/powercap + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overtemp + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/supply_fault + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overcurrent + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'policyX/throttle_stats' directory and all the attributes are same as + the /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory and + attributes which give the frequency throttle information of the chip. diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index a00bcc2cef09..39ac78c94be0 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -54,6 +54,16 @@ static const char * const throttle_reason[] = { "OCC Reset" }; +enum throttle_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE, + OCC_MAX_REASON +}; + static struct chip { unsigned int id; bool throttled; @@ -61,6 +71,9 @@ static struct chip { u8 throttle_reason; cpumask_t mask; struct work_struct throttle; + int throttle_turbo; + int throttle_sub_turbo; + int reason[OCC_MAX_REASON]; } *chips; static int nr_chips; @@ -196,6 +209,42 @@ static struct freq_attr *powernv_cpu_freq_attr[] = { NULL, }; +#define throttle_attr(name, member) \ +static ssize_t name##_show(struct cpufreq_policy *policy, char *buf) \ +{ \ + struct chip *chip = per_cpu(chip_info, policy->cpu); \ + \ + return sprintf(buf, "%u\n", chip->member); \ +} \ + \ +static struct freq_attr throttle_attr_##name = __ATTR_RO(name) \ + +throttle_attr(unthrottle, reason[NO_THROTTLE]); +throttle_attr(powercap, reason[POWERCAP]); +throttle_attr(overtemp, reason[CPU_OVERTEMP]); +throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]); +throttle_attr(overcurrent, reason[OVERCURRENT]); +throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]); +throttle_attr(turbo_stat, throttle_turbo); +throttle_attr(sub_turbo_stat, throttle_sub_turbo); + +static struct attribute *throttle_attrs[] = { + &throttle_attr_unthrottle.attr, + &throttle_attr_powercap.attr, + &throttle_attr_overtemp.attr, + &throttle_attr_supply_fault.attr, + &throttle_attr_overcurrent.attr, + &throttle_attr_occ_reset.attr, + &throttle_attr_turbo_stat.attr, + &throttle_attr_sub_turbo_stat.attr, + NULL, +}; + +static const struct attribute_group throttle_attr_grp = { + .name = "throttle_stats", + .attrs = throttle_attrs, +}; + /* Helper routines */ /* Access helpers to power mgt SPR */ @@ -338,10 +387,14 @@ static void powernv_cpufreq_throttle_check(void *data) if (chip->throttled) goto next; chip->throttled = true; - if (pmsr_pmax < powernv_pstate_info.nominal) + if (pmsr_pmax < 
powernv_pstate_info.nominal) { pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", cpu, chip->id, pmsr_pmax, powernv_pstate_info.nominal); + chip->throttle_sub_turbo++; + } else { + chip->throttle_turbo++; + } trace_powernv_throttle(chip->id, throttle_reason[chip->throttle_reason], pmsr_pmax); @@ -408,6 +461,21 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) for (i = 0; i < threads_per_core; i++) cpumask_set_cpu(base + i, policy->cpus); + if (!policy->driver_data) { + int ret; + + ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp); + if (ret) { + pr_info("Failed to create throttle stats directory for cpu %d\n", + policy->cpu); + return ret; + } + /* + * policy->driver_data is used as a flag for one-time + * creation of throttle sysfs files. + */ + policy->driver_data = policy; + } return cpufreq_table_validate_and_show(policy, powernv_freqs); } @@ -514,8 +582,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, break; if (omsg.throttle_status >= 0 && - omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) + omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) { chips[i].throttle_reason = omsg.throttle_status; + chips[i].reason[omsg.throttle_status]++; + } if (!omsg.throttle_status) chips[i].restore = true; From 0a300767e5882ad5b687966c80f9620aa0871be5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Mar 2016 15:45:24 +0100 Subject: [PATCH 09/14] cpufreq: Introduce cpufreq_start_governor() Starting a governor in cpufreq always follows the same pattern involving two calls to cpufreq_governor(), one with the event argument set to CPUFREQ_GOV_START and one with that argument set to CPUFREQ_GOV_LIMITS. Introduce cpufreq_start_governor() that will carry out those two operations and make all places where governors are started use it. That slightly modifies the behavior of cpufreq_set_policy() which now also will go back to the old governor if the second call to cpufreq_governor() (the one with event equal to CPUFREQ_GOV_LIMITS) fails, but that really is how it should work in the first place. Also cpufreq_resume() will now print an error message if the CPUFREQ_GOV_LIMITS call to cpufreq_governor() fails, but that makes it follow cpufreq_add_policy_cpu() and cpufreq_offline() in that respect. Signed-off-by: Rafael J.
Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f870399f00b1..43f3912a2ac8 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -76,6 +76,7 @@ static inline bool has_target(void) /* internal prototypes */ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); static unsigned int __cpufreq_get(struct cpufreq_policy *policy); +static int cpufreq_start_governor(struct cpufreq_policy *policy); /** * Two notifier lists: the "policy" list is involved in the @@ -964,10 +965,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp cpumask_set_cpu(cpu, policy->cpus); if (has_target()) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); - + ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } @@ -1308,10 +1306,7 @@ static void cpufreq_offline(unsigned int cpu) /* Start governor again for active policy */ if (!policy_is_inactive(policy)) { if (has_target()) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); - + ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } @@ -1591,9 +1586,7 @@ void cpufreq_resume(void) policy); } else { down_write(&policy->rwsem); - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + ret = cpufreq_start_governor(policy); up_write(&policy->rwsem); if (ret) @@ -1935,6 +1928,14 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) return ret; } +static int cpufreq_start_governor(struct cpufreq_policy *policy) +{ + int ret; + + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); + return ret ? ret : cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); +} + int cpufreq_register_governor(struct cpufreq_governor *governor) { int err; @@ -2071,8 +2072,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, return cpufreq_driver->setpolicy(new_policy); } - if (new_policy->governor == policy->governor) - goto out; + if (new_policy->governor == policy->governor) { + pr_debug("cpufreq: governor limits update\n"); + return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + } pr_debug("governor switch\n"); @@ -2100,10 +2103,11 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, policy->governor = new_policy->governor; ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT); if (!ret) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - goto out; - + ret = cpufreq_start_governor(policy); + if (!ret) { + pr_debug("cpufreq: governor change\n"); + return 0; + } cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); } @@ -2114,14 +2118,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, if (cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) policy->governor = NULL; else - cpufreq_governor(policy, CPUFREQ_GOV_START); + cpufreq_start_governor(policy); } return ret; - - out: - pr_debug("governor: change or update limits\n"); - return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); } /** From 999f572983de5e1a4ee9f42daa51b5448f12dd64 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 21 Mar 2016 15:46:25 +0100 Subject: [PATCH 10/14] cpufreq: Introduce cpufreq_update_current_freq() Move the part of cpufreq_update_policy() that obtains the current frequency from the driver and updates policy->cur if necessary to a separate function, cpufreq_get_current_freq(). That should not introduce functional changes and subsequent change set will need it. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 43f3912a2ac8..ffdb7fca5bb7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1487,6 +1487,24 @@ unsigned int cpufreq_get(unsigned int cpu) } EXPORT_SYMBOL(cpufreq_get); +static unsigned int cpufreq_update_current_freq(struct cpufreq_policy *policy) +{ + unsigned int new_freq; + + new_freq = cpufreq_driver->get(policy->cpu); + if (!new_freq) + return 0; + + if (!policy->cur) { + pr_debug("cpufreq: Driver did not initialize current freq\n"); + policy->cur = new_freq; + } else if (policy->cur != new_freq && has_target()) { + cpufreq_out_of_sync(policy, new_freq); + } + + return new_freq; +} + static struct subsys_interface cpufreq_interface = { .name = "cpufreq", .subsys = &cpu_subsys, @@ -2152,19 +2170,11 @@ int cpufreq_update_policy(unsigned int cpu) * -> ask driver for current freq and notify governors about a change */ if (cpufreq_driver->get && !cpufreq_driver->setpolicy) { - new_policy.cur = cpufreq_driver->get(cpu); + new_policy.cur = cpufreq_update_current_freq(policy); if (WARN_ON(!new_policy.cur)) { ret = -EIO; goto unlock; } - - if (!policy->cur) { - pr_debug("Driver did not initialize current freq\n"); - policy->cur = new_policy.cur; - } else { - if (policy->cur != new_policy.cur && has_target()) - cpufreq_out_of_sync(policy, new_policy.cur); - } } ret = cpufreq_set_policy(policy, &new_policy); From 3bbf8fe3ae0845af9192118753c2399efe531832 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Mar 2016 15:47:48 +0100 Subject: [PATCH 11/14] cpufreq: Always update current frequency before startig governor Make policy->cur match the current frequency returned by the driver's ->get() callback before starting the governor in case they went out of sync in the meantime and drop the piece of code attempting to resync policy->cur with the real frequency of the boot CPU from cpufreq_resume() as it serves no purpose any more (and it's racy and super-ugly anyway). Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ffdb7fca5bb7..b87596b591b3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1612,17 +1612,6 @@ void cpufreq_resume(void) __func__, policy); } } - - /* - * schedule call cpufreq_update_policy() for first-online CPU, as that - * wouldn't be hotplugged-out on suspend. It will verify that the - * current freq is in sync with what we believe it to be. 
- */ - policy = cpufreq_cpu_get_raw(cpumask_first(cpu_online_mask)); - if (WARN_ON(!policy)) - return; - - schedule_work(&policy->update); } /** @@ -1950,6 +1939,9 @@ static int cpufreq_start_governor(struct cpufreq_policy *policy) { int ret; + if (cpufreq_driver->get && !cpufreq_driver->setpolicy) + cpufreq_update_current_freq(policy); + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); return ret ? ret : cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); } From 539a4c4247c2697d291a8fb02c485ccd7cf34f05 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 22 Mar 2016 01:17:43 +0100 Subject: [PATCH 12/14] cpufreq: governor: Always schedule work on the CPU running update Modify dbs_irq_work() to always schedule the process-context work on the current CPU which also ran the dbs_update_util_handler() that the irq_work being handled came from. This causes the entire frequency update handling (involving the "ondemand" or "conservative" governors) to be carried out by the CPU whose frequency is to be updated and reduces the overall amount of inter-CPU noise related to cpufreq. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 1c25ef405616..10a5cfeae8c5 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -329,7 +329,7 @@ static void dbs_irq_work(struct irq_work *irq_work) struct policy_dbs_info *policy_dbs; policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work); - schedule_work(&policy_dbs->work); + schedule_work_on(smp_processor_id(), &policy_dbs->work); } static void dbs_update_util_handler(struct update_util_data *data, u64 time, From d70e28f57e14a481977436695b0c9ba165472431 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 13 Mar 2016 00:33:48 -0500 Subject: [PATCH 13/14] intel_idle: prevent SKL-H boot failure when C8+C9+C10 enabled Some SKL-H configurations require "intel_idle.max_cstate=7" to boot. While that is an effective workaround, it disables C10. This patch detects the problematic configuration, and disables C8 and C9, keeping C10 enabled. Note that enabling SGX in BIOS SETUP can also prevent this issue, if the system BIOS provides that option. 
https://bugzilla.kernel.org/show_bug.cgi?id=109081 "Freezes with Intel i7 6700HQ (Skylake), unless intel_idle.max_cstate=7" Signed-off-by: Len Brown Cc: stable@vger.kernel.org --- drivers/idle/intel_idle.c | 112 ++++++++++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 24 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cd4510a63375..146eed70bdf4 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -65,7 +65,7 @@ #include #include -#define INTEL_IDLE_VERSION "0.4" +#define INTEL_IDLE_VERSION "0.4.1" #define PREFIX "intel_idle: " static struct cpuidle_driver intel_idle_driver = { @@ -993,37 +993,93 @@ static void intel_idle_cpuidle_devices_uninit(void) return; } +/* + * ivt_idle_state_table_update(void) + * + * Tune IVT multi-socket targets + * Assumption: num_sockets == (max_package_num + 1) + */ +static void ivt_idle_state_table_update(void) +{ + /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ + int cpu, package_num, num_sockets = 1; + + for_each_online_cpu(cpu) { + package_num = topology_physical_package_id(cpu); + if (package_num + 1 > num_sockets) { + num_sockets = package_num + 1; + + if (num_sockets > 4) { + cpuidle_state_table = ivt_cstates_8s; + return; + } + } + } + + if (num_sockets > 2) + cpuidle_state_table = ivt_cstates_4s; + + /* else, 1 and 2 socket systems use default ivt_cstates */ +} +/* + * sklh_idle_state_table_update(void) + * + * On SKL-H (model 0x5e) disable C8 and C9 if: + * C10 is enabled and SGX disabled + */ +static void sklh_idle_state_table_update(void) +{ + unsigned long long msr; + unsigned int eax, ebx, ecx, edx; + + + /* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */ + if (max_cstate <= 7) + return; + + /* if PC10 not present in CPUID.MWAIT.EDX */ + if ((mwait_substates & (0xF << 28)) == 0) + return; + + rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr); + + /* PC10 is not enabled in PKG C-state limit */ + if ((msr & 0xF) != 8) + return; + + ecx = 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + + /* if SGX is present */ + if (ebx & (1 << 2)) { + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + + /* if SGX is enabled */ + if (msr & (1 << 18)) + return; + } + + skl_cstates[5].disabled = 1; /* C8-SKL */ + skl_cstates[6].disabled = 1; /* C9-SKL */ +} /* * intel_idle_state_table_update() * * Update the default state_table for this CPU-id - * - * Currently used to access tuned IVT multi-socket targets - * Assumption: num_sockets == (max_package_num + 1) */ -void intel_idle_state_table_update(void) + +static void intel_idle_state_table_update(void) { - /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ - if (boot_cpu_data.x86_model == 0x3e) { /* IVT */ - int cpu, package_num, num_sockets = 1; + switch (boot_cpu_data.x86_model) { - for_each_online_cpu(cpu) { - package_num = topology_physical_package_id(cpu); - if (package_num + 1 > num_sockets) { - num_sockets = package_num + 1; - - if (num_sockets > 4) { - cpuidle_state_table = ivt_cstates_8s; - return; - } - } - } - - if (num_sockets > 2) - cpuidle_state_table = ivt_cstates_4s; - /* else, 1 and 2 socket systems use default ivt_cstates */ + case 0x3e: /* IVT */ + ivt_idle_state_table_update(); + break; + case 0x5e: /* SKL-H */ + sklh_idle_state_table_update(); + break; } - return; } /* @@ -1063,6 +1119,14 @@ static int __init intel_idle_cpuidle_driver_init(void) if (num_substates == 0) continue; + /* if state marked as disabled, skip it */ + if (cpuidle_state_table[cstate].disabled != 0) { + pr_debug(PREFIX "state 
%s is disabled", + cpuidle_state_table[cstate].name); + continue; + } + + if (((mwait_cstate + 1) > 2) && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halts in idle" From 281baf7a702693deaa45c98ef0c5161006b48257 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 4 Sep 2014 17:22:54 -0700 Subject: [PATCH 14/14] intel_idle: Support for Intel Xeon Phi Processor x200 Product Family Enables "Intel(R) Xeon Phi(TM) Processor x200 Product Family" support, formerly code-named KNL. It is based on modified Intel Atom Silvermont microarchitecture. Signed-off-by: Dasaratharaman Chandramouli [micah.barany@intel.com: adjusted values of residency and latency] Signed-off-by: Micah Barany [hubert.chrzaniuk@intel.com: removed deprecated CPUIDLE_FLAG_TIME_VALID flag] Signed-off-by: Hubert Chrzaniuk Signed-off-by: Pawel Karczewski Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 146eed70bdf4..ba947df5a8c7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -716,6 +716,26 @@ static struct cpuidle_state avn_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state knl_cstates[] = { + { + .name = "C1-KNL", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze }, + { + .name = "C6-KNL", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 120, + .target_residency = 500, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze }, + { + .enter = NULL } +}; /** * intel_idle @@ -890,6 +910,10 @@ static const struct idle_cpu idle_cpu_avn = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_knl = { + .state_table = knl_cstates, +}; + #define ICPU(model, cpu) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } @@ -921,6 +945,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x56, idle_cpu_bdw), ICPU(0x4e, idle_cpu_skl), ICPU(0x5e, idle_cpu_skl), + ICPU(0x57, idle_cpu_knl), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids);