From e128c864070055e062f6c90c64c03aad18452ac3 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:49 +0530
Subject: [PATCH 01/30] cpufreq: ondemand: Update sampling rate only for
 concerned policies

We are comparing policy->governor against cpufreq_gov_ondemand to make
sure that we update sampling rate only for the concerned CPUs. But that
isn't enough.

In case of governor_per_policy, there can be multiple instances of
ondemand governor and we will always end up updating all of them with
current code. What we rather need to do, is to compare dbs_data with
poilcy->governor_data, which will match only for the policies governed
by dbs_data.

This code is also racy as the governor might be getting stopped at that
time and we may end up scheduling work for a policy, which we have just
disabled.

Fix that by protecting the entire function with &od_dbs_cdata.mutex,
which will prevent against races with policy START/STOP/etc.

After these locks are in place, we can safely get the policy via per-cpu
dbs_info.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_ondemand.c | 37 +++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 03ac6ce54042..089ca6a6ca02 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -252,20 +252,39 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 	od_tuners->sampling_rate = new_rate = max(new_rate,
 			dbs_data->min_sampling_rate);
 
+	/*
+	 * Lock governor so that governor start/stop can't execute in parallel.
+	 */
+	mutex_lock(&od_dbs_cdata.mutex);
+
 	for_each_online_cpu(cpu) {
 		struct cpufreq_policy *policy;
 		struct od_cpu_dbs_info_s *dbs_info;
+		struct cpu_dbs_info *cdbs;
+		struct cpu_common_dbs_info *shared;
 		unsigned long next_sampling, appointed_at;
 
-		policy = cpufreq_cpu_get(cpu);
-		if (!policy)
-			continue;
-		if (policy->governor != &cpufreq_gov_ondemand) {
-			cpufreq_cpu_put(policy);
-			continue;
-		}
 		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
-		cpufreq_cpu_put(policy);
+		cdbs = &dbs_info->cdbs;
+		shared = cdbs->shared;
+
+		/*
+		 * A valid shared and shared->policy means governor hasn't
+		 * stopped or exited yet.
+		 */
+		if (!shared || !shared->policy)
+			continue;
+
+		policy = shared->policy;
+
+		/*
+		 * Update sampling rate for CPUs whose policy is governed by
+		 * dbs_data. In case of governor_per_policy, only a single
+		 * policy will be governed by dbs_data, otherwise there can be
+		 * multiple policies that are governed by the same dbs_data.
+		 */
+		if (dbs_data != policy->governor_data)
+			continue;
 
 		if (!delayed_work_pending(&dbs_info->cdbs.dwork))
 			continue;
@@ -281,6 +300,8 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 
 		}
 	}
+
+	mutex_unlock(&od_dbs_cdata.mutex);
 }
 
 static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,

From e68fe18c5b5442baca162ccf3b273326e6132a51 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:50 +0530
Subject: [PATCH 02/30] cpufreq: ondemand: Work is guaranteed to be pending

We are guaranteed to have works scheduled for policy->cpus, as the
policy isn't stopped yet. And so there is no need to check that again.
Drop it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_ondemand.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 089ca6a6ca02..08f2aa602f9e 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -286,9 +286,6 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 		if (dbs_data != policy->governor_data)
 			continue;
 
-		if (!delayed_work_pending(&dbs_info->cdbs.dwork))
-			continue;
-
 		next_sampling = jiffies + usecs_to_jiffies(new_rate);
 		appointed_at = dbs_info->cdbs.dwork.timer.expires;
 

From affde5d06af1e39c2929e36a063e3912f02fc58f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:51 +0530
Subject: [PATCH 03/30] cpufreq: governor: Pass policy as argument to
 ->gov_dbs_timer()

Pass 'policy' as argument to ->gov_dbs_timer() instead of cdbs and
dbs_data.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_conservative.c | 6 +++---
 drivers/cpufreq/cpufreq_governor.c     | 2 +-
 drivers/cpufreq/cpufreq_governor.h     | 3 +--
 drivers/cpufreq/cpufreq_ondemand.c     | 5 ++---
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 1fa1deb6e91f..606ad74abe6e 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -115,13 +115,13 @@ static void cs_check_cpu(int cpu, unsigned int load)
 	}
 }
 
-static unsigned int cs_dbs_timer(struct cpu_dbs_info *cdbs,
-				 struct dbs_data *dbs_data, bool modify_all)
+static unsigned int cs_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
 {
+	struct dbs_data *dbs_data = policy->governor_data;
 	struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
 
 	if (modify_all)
-		dbs_check_cpu(dbs_data, cdbs->shared->policy->cpu);
+		dbs_check_cpu(dbs_data, policy->cpu);
 
 	return delay_for_sampling_rate(cs_tuners->sampling_rate);
 }
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index b260576ddb12..cdcb56a49b28 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -253,7 +253,7 @@ static void dbs_timer(struct work_struct *work)
 	if (!need_load_eval(cdbs->shared, sampling_rate))
 		modify_all = false;
 
-	delay = dbs_data->cdata->gov_dbs_timer(cdbs, dbs_data, modify_all);
+	delay = dbs_data->cdata->gov_dbs_timer(policy, modify_all);
 	gov_queue_work(dbs_data, policy, delay, modify_all);
 
 unlock:
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 5621bb03e874..0c7589016b6c 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -209,8 +209,7 @@ struct common_dbs_data {
 
 	struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu);
 	void *(*get_cpu_dbs_info_s)(int cpu);
-	unsigned int (*gov_dbs_timer)(struct cpu_dbs_info *cdbs,
-				      struct dbs_data *dbs_data,
+	unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy,
 				      bool modify_all);
 	void (*gov_check_cpu)(int cpu, unsigned int load);
 	int (*init)(struct dbs_data *dbs_data, bool notify);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 08f2aa602f9e..fc0384b4d02d 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -191,10 +191,9 @@ static void od_check_cpu(int cpu, unsigned int load)
 	}
 }
 
-static unsigned int od_dbs_timer(struct cpu_dbs_info *cdbs,
-				 struct dbs_data *dbs_data, bool modify_all)
+static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
 {
-	struct cpufreq_policy *policy = cdbs->shared->policy;
+	struct dbs_data *dbs_data = policy->governor_data;
 	unsigned int cpu = policy->cpu;
 	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
 			cpu);

From 5e4500d8dba16d88b528cf037566b84747ec23f0 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:52 +0530
Subject: [PATCH 04/30] cpufreq: governor: initialize/destroy timer_mutex with
 'shared'

timer_mutex is required to be initialized only while memory for 'shared'
is allocated and in a similar way it is required to be destroyed only
when memory for 'shared' is freed.

There is no need to do the same every time we start/stop the governor.
Move code to initialize/destroy timer_mutex to the relevant places.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_governor.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index cdcb56a49b28..999e1f6addf9 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -287,6 +287,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy,
 	for_each_cpu(j, policy->related_cpus)
 		cdata->get_cpu_cdbs(j)->shared = shared;
 
+	mutex_init(&shared->timer_mutex);
 	return 0;
 }
 
@@ -297,6 +298,8 @@ static void free_common_dbs_info(struct cpufreq_policy *policy,
 	struct cpu_common_dbs_info *shared = cdbs->shared;
 	int j;
 
+	mutex_destroy(&shared->timer_mutex);
+
 	for_each_cpu(j, policy->cpus)
 		cdata->get_cpu_cdbs(j)->shared = NULL;
 
@@ -433,7 +436,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 
 	shared->policy = policy;
 	shared->time_stamp = ktime_get();
-	mutex_init(&shared->timer_mutex);
 
 	for_each_cpu(j, policy->cpus) {
 		struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j);
@@ -493,8 +495,6 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy,
 	mutex_unlock(&shared->timer_mutex);
 
 	gov_cancel_work(dbs_data, policy);
-
-	mutex_destroy(&shared->timer_mutex);
 	return 0;
 }
 

From 70f43e5e798c8818d97d8d6a9bd4cd3235af9686 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 9 Dec 2015 07:34:42 +0530
Subject: [PATCH 05/30] cpufreq: governor: replace per-CPU delayed work with
 timers

cpufreq governors evaluate load at sampling rate and based on that they
update frequency for a group of CPUs belonging to the same cpufreq
policy.

This is required to be done in a single thread for all policy->cpus, but
because we don't want to wakeup idle CPUs to do just that, we use
deferrable work for this. If we would have used a single delayed
deferrable work for the entire policy, there were chances that the CPU
required to run the handler can be in idle and we might end up not
changing the frequency for the entire group with load variations.

And so we were forced to keep per-cpu works, and only the one that
expires first need to do the real work and others are rescheduled for
next sampling time.

We have been using the more complex solution until now, where we used a
delayed deferrable work for this, which is a combination of a timer and
a work.

This could be made lightweight by keeping per-cpu deferred timers with a
single work item, which is scheduled by the first timer that expires.

This patch does just that and here are important changes:
- The timer handler will run in irq context and so we need to use a
  spin_lock instead of the timer_mutex. And so a separate timer_lock is
  created. This also makes the use of the mutex and lock quite clear, as
  we know what exactly they are protecting.
- A new field 'skip_work' is added to track when the timer handlers can
  queue a work. More comments present in code.

Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Ashwin Chaugule <ashwin.chaugule@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_governor.c | 142 +++++++++++++++++------------
 drivers/cpufreq/cpufreq_governor.h |  20 ++--
 drivers/cpufreq/cpufreq_ondemand.c |   8 +-
 3 files changed, 100 insertions(+), 70 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 999e1f6addf9..2d61eae5cc5d 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -158,47 +158,53 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 }
 EXPORT_SYMBOL_GPL(dbs_check_cpu);
 
-static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data,
-		unsigned int delay)
+void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay)
 {
-	struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
+	struct dbs_data *dbs_data = policy->governor_data;
+	struct cpu_dbs_info *cdbs;
+	int cpu;
 
-	mod_delayed_work_on(cpu, system_wq, &cdbs->dwork, delay);
-}
-
-void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
-		unsigned int delay, bool all_cpus)
-{
-	int i;
-
-	if (!all_cpus) {
-		/*
-		 * Use raw_smp_processor_id() to avoid preemptible warnings.
-		 * We know that this is only called with all_cpus == false from
-		 * works that have been queued with *_work_on() functions and
-		 * those works are canceled during CPU_DOWN_PREPARE so they
-		 * can't possibly run on any other CPU.
-		 */
-		__gov_queue_work(raw_smp_processor_id(), dbs_data, delay);
-	} else {
-		for_each_cpu(i, policy->cpus)
-			__gov_queue_work(i, dbs_data, delay);
+	for_each_cpu(cpu, policy->cpus) {
+		cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
+		cdbs->timer.expires = jiffies + delay;
+		add_timer_on(&cdbs->timer, cpu);
 	}
 }
-EXPORT_SYMBOL_GPL(gov_queue_work);
+EXPORT_SYMBOL_GPL(gov_add_timers);
 
-static inline void gov_cancel_work(struct dbs_data *dbs_data,
-		struct cpufreq_policy *policy)
+static inline void gov_cancel_timers(struct cpufreq_policy *policy)
 {
+	struct dbs_data *dbs_data = policy->governor_data;
 	struct cpu_dbs_info *cdbs;
 	int i;
 
 	for_each_cpu(i, policy->cpus) {
 		cdbs = dbs_data->cdata->get_cpu_cdbs(i);
-		cancel_delayed_work_sync(&cdbs->dwork);
+		del_timer_sync(&cdbs->timer);
 	}
 }
 
+void gov_cancel_work(struct cpu_common_dbs_info *shared)
+{
+	unsigned long flags;
+
+	/*
+	 * No work will be queued from timer handlers after skip_work is
+	 * updated. And so we can safely cancel the work first and then the
+	 * timers.
+	 */
+	spin_lock_irqsave(&shared->timer_lock, flags);
+	shared->skip_work++;
+	spin_unlock_irqrestore(&shared->timer_lock, flags);
+
+	cancel_work_sync(&shared->work);
+
+	gov_cancel_timers(shared->policy);
+
+	shared->skip_work = 0;
+}
+EXPORT_SYMBOL_GPL(gov_cancel_work);
+
 /* Will return if we need to evaluate cpu load again or not */
 static bool need_load_eval(struct cpu_common_dbs_info *shared,
 			   unsigned int sampling_rate)
@@ -217,29 +223,22 @@ static bool need_load_eval(struct cpu_common_dbs_info *shared,
 	return true;
 }
 
-static void dbs_timer(struct work_struct *work)
+static void dbs_work_handler(struct work_struct *work)
 {
-	struct cpu_dbs_info *cdbs = container_of(work, struct cpu_dbs_info,
-						 dwork.work);
-	struct cpu_common_dbs_info *shared = cdbs->shared;
+	struct cpu_common_dbs_info *shared = container_of(work, struct
+					cpu_common_dbs_info, work);
 	struct cpufreq_policy *policy;
 	struct dbs_data *dbs_data;
 	unsigned int sampling_rate, delay;
-	bool modify_all = true;
-
-	mutex_lock(&shared->timer_mutex);
+	unsigned long flags;
+	bool eval_load;
 
 	policy = shared->policy;
-
-	/*
-	 * Governor might already be disabled and there is no point continuing
-	 * with the work-handler.
-	 */
-	if (!policy)
-		goto unlock;
-
 	dbs_data = policy->governor_data;
 
+	/* Kill all timers */
+	gov_cancel_timers(policy);
+
 	if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
 		struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
 
@@ -250,14 +249,43 @@ static void dbs_timer(struct work_struct *work)
 		sampling_rate = od_tuners->sampling_rate;
 	}
 
-	if (!need_load_eval(cdbs->shared, sampling_rate))
-		modify_all = false;
+	eval_load = need_load_eval(shared, sampling_rate);
 
-	delay = dbs_data->cdata->gov_dbs_timer(policy, modify_all);
-	gov_queue_work(dbs_data, policy, delay, modify_all);
-
-unlock:
+	/*
+	 * Make sure cpufreq_governor_limits() isn't evaluating load in
+	 * parallel.
+	 */
+	mutex_lock(&shared->timer_mutex);
+	delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load);
 	mutex_unlock(&shared->timer_mutex);
+
+	spin_lock_irqsave(&shared->timer_lock, flags);
+	shared->skip_work--;
+	spin_unlock_irqrestore(&shared->timer_lock, flags);
+
+	gov_add_timers(policy, delay);
+}
+
+static void dbs_timer_handler(unsigned long data)
+{
+	struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data;
+	struct cpu_common_dbs_info *shared = cdbs->shared;
+	unsigned long flags;
+
+	spin_lock_irqsave(&shared->timer_lock, flags);
+
+	/*
+	 * Timer handler isn't allowed to queue work at the moment, because:
+	 * - Another timer handler has done that
+	 * - We are stopping the governor
+	 * - Or we are updating the sampling rate of ondemand governor
+	 */
+	if (!shared->skip_work) {
+		shared->skip_work++;
+		queue_work(system_wq, &shared->work);
+	}
+
+	spin_unlock_irqrestore(&shared->timer_lock, flags);
 }
 
 static void set_sampling_rate(struct dbs_data *dbs_data,
@@ -288,6 +316,8 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy,
 		cdata->get_cpu_cdbs(j)->shared = shared;
 
 	mutex_init(&shared->timer_mutex);
+	spin_lock_init(&shared->timer_lock);
+	INIT_WORK(&shared->work, dbs_work_handler);
 	return 0;
 }
 
@@ -452,7 +482,9 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 		if (ignore_nice)
 			j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 
-		INIT_DEFERRABLE_WORK(&j_cdbs->dwork, dbs_timer);
+		__setup_timer(&j_cdbs->timer, dbs_timer_handler,
+			      (unsigned long)j_cdbs,
+			      TIMER_DEFERRABLE | TIMER_IRQSAFE);
 	}
 
 	if (cdata->governor == GOV_CONSERVATIVE) {
@@ -470,8 +502,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 		od_ops->powersave_bias_init_cpu(cpu);
 	}
 
-	gov_queue_work(dbs_data, policy, delay_for_sampling_rate(sampling_rate),
-		       true);
+	gov_add_timers(policy, delay_for_sampling_rate(sampling_rate));
 	return 0;
 }
 
@@ -485,16 +516,9 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy,
 	if (!shared || !shared->policy)
 		return -EBUSY;
 
-	/*
-	 * Work-handler must see this updated, as it should not proceed any
-	 * further after governor is disabled. And so timer_mutex is taken while
-	 * updating this value.
-	 */
-	mutex_lock(&shared->timer_mutex);
+	gov_cancel_work(shared);
 	shared->policy = NULL;
-	mutex_unlock(&shared->timer_mutex);
 
-	gov_cancel_work(dbs_data, policy);
 	return 0;
 }
 
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 0c7589016b6c..76742902491e 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -132,12 +132,20 @@ static void *get_cpu_dbs_info_s(int cpu)				\
 struct cpu_common_dbs_info {
 	struct cpufreq_policy *policy;
 	/*
-	 * percpu mutex that serializes governor limit change with dbs_timer
-	 * invocation. We do not want dbs_timer to run when user is changing
-	 * the governor or limits.
+	 * Per policy mutex that serializes load evaluation from limit-change
+	 * and work-handler.
 	 */
 	struct mutex timer_mutex;
+
+	/*
+	 * Per policy lock that serializes access to queuing work from timer
+	 * handlers.
+	 */
+	spinlock_t timer_lock;
+
 	ktime_t time_stamp;
+	unsigned int skip_work;
+	struct work_struct work;
 };
 
 /* Per cpu structures */
@@ -152,7 +160,7 @@ struct cpu_dbs_info {
 	 * wake-up from idle.
 	 */
 	unsigned int prev_load;
-	struct delayed_work dwork;
+	struct timer_list timer;
 	struct cpu_common_dbs_info *shared;
 };
 
@@ -268,11 +276,11 @@ static ssize_t show_sampling_rate_min_gov_pol				\
 
 extern struct mutex cpufreq_governor_lock;
 
+void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay);
+void gov_cancel_work(struct cpu_common_dbs_info *shared);
 void dbs_check_cpu(struct dbs_data *dbs_data, int cpu);
 int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		struct common_dbs_data *cdata, unsigned int event);
-void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
-		unsigned int delay, bool all_cpus);
 void od_register_powersave_bias_handler(unsigned int (*f)
 		(struct cpufreq_policy *, unsigned int, unsigned int),
 		unsigned int powersave_bias);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index fc0384b4d02d..f879012cf849 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -286,13 +286,11 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 			continue;
 
 		next_sampling = jiffies + usecs_to_jiffies(new_rate);
-		appointed_at = dbs_info->cdbs.dwork.timer.expires;
+		appointed_at = dbs_info->cdbs.timer.expires;
 
 		if (time_before(next_sampling, appointed_at)) {
-			cancel_delayed_work_sync(&dbs_info->cdbs.dwork);
-
-			gov_queue_work(dbs_data, policy,
-				       usecs_to_jiffies(new_rate), true);
+			gov_cancel_work(shared);
+			gov_add_timers(policy, usecs_to_jiffies(new_rate));
 
 		}
 	}

From f08f638b9c7f1bf3cb9006d3d26bf568d807ede0 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 3 Dec 2015 09:37:54 +0530
Subject: [PATCH 06/30] cpufreq: ondemand: update update_sampling_rate() to
 make it more efficient

Currently update_sampling_rate() runs over each online CPU and
cancels/queues timers on all policy->cpus every time. This should be
done just once for any cpu belonging to a policy.

Create a cpumask and keep on clearing it as and when we process
policies, so that we don't have to traverse through all CPUs of the same
policy.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_ondemand.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index f879012cf849..eae51070c034 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -246,6 +246,7 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 		unsigned int new_rate)
 {
 	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
+	struct cpumask cpumask;
 	int cpu;
 
 	od_tuners->sampling_rate = new_rate = max(new_rate,
@@ -256,7 +257,9 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 	 */
 	mutex_lock(&od_dbs_cdata.mutex);
 
-	for_each_online_cpu(cpu) {
+	cpumask_copy(&cpumask, cpu_online_mask);
+
+	for_each_cpu(cpu, &cpumask) {
 		struct cpufreq_policy *policy;
 		struct od_cpu_dbs_info_s *dbs_info;
 		struct cpu_dbs_info *cdbs;
@@ -276,6 +279,9 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 
 		policy = shared->policy;
 
+		/* clear all CPUs of this policy */
+		cpumask_andnot(&cpumask, &cpumask, policy->cpus);
+
 		/*
 		 * Update sampling rate for CPUs whose policy is governed by
 		 * dbs_data. In case of governor_per_policy, only a single
@@ -285,6 +291,10 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 		if (dbs_data != policy->governor_data)
 			continue;
 
+		/*
+		 * Checking this for any CPU should be fine, timers for all of
+		 * them are scheduled together.
+		 */
 		next_sampling = jiffies + usecs_to_jiffies(new_rate);
 		appointed_at = dbs_info->cdbs.timer.expires;
 

From 2dd3e724b4e2237cfaaf155cab72af02c1c420cc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 8 Dec 2015 21:44:05 +0100
Subject: [PATCH 07/30] cpufreq: governor: Use lockless timer function

It is possible to get rid of the timer_lock spinlock used by the
governor timer function for synchronization, but a couple of races
need to be avoided.

The first race is between multiple dbs_timer_handler() instances
that may be running in parallel with each other on different
CPUs.  Namely, one of them has to queue up the work item, but it
cannot be queued up more than once.  To achieve that,
atomic_inc_return() can be used on the skip_work field of
struct cpu_common_dbs_info.

The second race is between an already running dbs_timer_handler()
and gov_cancel_work().  In that case the dbs_timer_handler() might
not notice the skip_work incrementation in gov_cancel_work() and
it might queue up its work item after gov_cancel_work() had
returned (and that work item would corrupt skip_work going
forward).  To prevent that from happening, gov_cancel_work()
can be made wait for the timer function to complete (on all CPUs)
right after skip_work has been incremented.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq_governor.c | 51 ++++++++++++++----------------
 drivers/cpufreq/cpufreq_governor.h |  9 ++----
 2 files changed, 25 insertions(+), 35 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 2d61eae5cc5d..4de12fd35b1f 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -186,22 +186,24 @@ static inline void gov_cancel_timers(struct cpufreq_policy *policy)
 
 void gov_cancel_work(struct cpu_common_dbs_info *shared)
 {
-	unsigned long flags;
-
+	/* Tell dbs_timer_handler() to skip queuing up work items. */
+	atomic_inc(&shared->skip_work);
 	/*
-	 * No work will be queued from timer handlers after skip_work is
-	 * updated. And so we can safely cancel the work first and then the
-	 * timers.
+	 * If dbs_timer_handler() is already running, it may not notice the
+	 * incremented skip_work, so wait for it to complete to prevent its work
+	 * item from being queued up after the cancel_work_sync() below.
 	 */
-	spin_lock_irqsave(&shared->timer_lock, flags);
-	shared->skip_work++;
-	spin_unlock_irqrestore(&shared->timer_lock, flags);
-
-	cancel_work_sync(&shared->work);
-
 	gov_cancel_timers(shared->policy);
-
-	shared->skip_work = 0;
+	/*
+	 * In case dbs_timer_handler() managed to run and spawn a work item
+	 * before the timers have been canceled, wait for that work item to
+	 * complete and then cancel all of the timers set up by it.  If
+	 * dbs_timer_handler() runs again at that point, it will see the
+	 * positive value of skip_work and won't spawn any more work items.
+	 */
+	cancel_work_sync(&shared->work);
+	gov_cancel_timers(shared->policy);
+	atomic_set(&shared->skip_work, 0);
 }
 EXPORT_SYMBOL_GPL(gov_cancel_work);
 
@@ -230,7 +232,6 @@ static void dbs_work_handler(struct work_struct *work)
 	struct cpufreq_policy *policy;
 	struct dbs_data *dbs_data;
 	unsigned int sampling_rate, delay;
-	unsigned long flags;
 	bool eval_load;
 
 	policy = shared->policy;
@@ -259,9 +260,7 @@ static void dbs_work_handler(struct work_struct *work)
 	delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load);
 	mutex_unlock(&shared->timer_mutex);
 
-	spin_lock_irqsave(&shared->timer_lock, flags);
-	shared->skip_work--;
-	spin_unlock_irqrestore(&shared->timer_lock, flags);
+	atomic_dec(&shared->skip_work);
 
 	gov_add_timers(policy, delay);
 }
@@ -270,22 +269,18 @@ static void dbs_timer_handler(unsigned long data)
 {
 	struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data;
 	struct cpu_common_dbs_info *shared = cdbs->shared;
-	unsigned long flags;
-
-	spin_lock_irqsave(&shared->timer_lock, flags);
 
 	/*
-	 * Timer handler isn't allowed to queue work at the moment, because:
+	 * Timer handler may not be allowed to queue the work at the moment,
+	 * because:
 	 * - Another timer handler has done that
 	 * - We are stopping the governor
-	 * - Or we are updating the sampling rate of ondemand governor
+	 * - Or we are updating the sampling rate of the ondemand governor
 	 */
-	if (!shared->skip_work) {
-		shared->skip_work++;
+	if (atomic_inc_return(&shared->skip_work) > 1)
+		atomic_dec(&shared->skip_work);
+	else
 		queue_work(system_wq, &shared->work);
-	}
-
-	spin_unlock_irqrestore(&shared->timer_lock, flags);
 }
 
 static void set_sampling_rate(struct dbs_data *dbs_data,
@@ -316,7 +311,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy,
 		cdata->get_cpu_cdbs(j)->shared = shared;
 
 	mutex_init(&shared->timer_mutex);
-	spin_lock_init(&shared->timer_lock);
+	atomic_set(&shared->skip_work, 0);
 	INIT_WORK(&shared->work, dbs_work_handler);
 	return 0;
 }
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 76742902491e..91e767a058a7 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -17,6 +17,7 @@
 #ifndef _CPUFREQ_GOVERNOR_H
 #define _CPUFREQ_GOVERNOR_H
 
+#include <linux/atomic.h>
 #include <linux/cpufreq.h>
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
@@ -137,14 +138,8 @@ struct cpu_common_dbs_info {
 	 */
 	struct mutex timer_mutex;
 
-	/*
-	 * Per policy lock that serializes access to queuing work from timer
-	 * handlers.
-	 */
-	spinlock_t timer_lock;
-
 	ktime_t time_stamp;
-	unsigned int skip_work;
+	atomic_t skip_work;
 	struct work_struct work;
 };
 

From 3be3f8f36e7349006f19c8c8f0d686e98462a993 Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@arm.com>
Date: Tue, 17 Nov 2015 12:06:21 +0000
Subject: [PATCH 08/30] devicetree: bindings: Add optional
 dynamic-power-coefficient property

The dynamic power consumption of a device is proportional to the
square of voltage (V) and the clock frequency (f). It can be expressed as

Pdyn = dynamic-power-coefficient * V^2 * f.

The coefficient represents the running time dynamic power consumption in
units of mw/MHz/uVolt^2 and can be used in the above formula to
calculate the dynamic power in mW.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/devicetree/bindings/arm/cpus.txt | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/cpus.txt b/Documentation/devicetree/bindings/arm/cpus.txt
index 3a07a87fef20..6aca64f289b6 100644
--- a/Documentation/devicetree/bindings/arm/cpus.txt
+++ b/Documentation/devicetree/bindings/arm/cpus.txt
@@ -242,6 +242,23 @@ nodes to be present and contain the properties described below.
 		Definition: Specifies the syscon node controlling the cpu core
 			    power domains.
 
+	- dynamic-power-coefficient
+		Usage: optional
+		Value type: <prop-encoded-array>
+		Definition: A u32 value that represents the running time dynamic
+			    power coefficient in units of mW/MHz/uVolt^2. The
+			    coefficient can either be calculated from power
+			    measurements or derived by analysis.
+
+			    The dynamic power consumption of the CPU  is
+			    proportional to the square of the Voltage (V) and
+			    the clock frequency (f). The coefficient is used to
+			    calculate the dynamic power as below -
+
+			    Pdyn = dynamic-power-coefficient * V^2 * f
+
+			    where voltage is in uV, frequency is in MHz.
+
 Example 1 (dual-cluster big.LITTLE system 32-bit):
 
 	cpus {

From f8fa8ae06b8c2c25d81c99766f9226adc5c3e073 Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@arm.com>
Date: Tue, 17 Nov 2015 12:06:22 +0000
Subject: [PATCH 09/30] cpufreq-dt: Supply power coefficient when registering
 cooling devices

Support registering cooling devices with dynamic power coefficient
where provided by the device tree. This allows OF registered cooling
devices driver to be used with the power_allocator thermal governor.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 90d64081ddb3..1ceece9d6711 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -407,8 +407,13 @@ static void cpufreq_ready(struct cpufreq_policy *policy)
 	 * thermal DT code takes care of matching them.
 	 */
 	if (of_find_property(np, "#cooling-cells", NULL)) {
-		priv->cdev = of_cpufreq_cooling_register(np,
-							 policy->related_cpus);
+		u32 power_coefficient = 0;
+
+		of_property_read_u32(np, "dynamic-power-coefficient",
+				     &power_coefficient);
+
+		priv->cdev = of_cpufreq_power_cooling_register(np,
+				policy->related_cpus, power_coefficient, NULL);
 		if (IS_ERR(priv->cdev)) {
 			dev_err(priv->cpu_dev,
 				"running cpufreq without cooling device: %ld\n",

From 2f7e8a175db72bdaf377235962fd85796edb3fbc Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@arm.com>
Date: Tue, 17 Nov 2015 12:06:23 +0000
Subject: [PATCH 10/30] cpufreq: arm_big_little: Add support to register a
 cpufreq cooling device

Register passive cooling devices when initialising cpufreq on
big.LITTLE systems. If the device tree provides a dynamic power
coefficient for the CPUs then the bound cooling device will support
the extensions that allow it to be used with all the existing thermal
governors including the power allocator governor.

A cooling device will be created per individual frequency domain and
can be bound to thermal zones via the thermal DT bindings.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm      |  2 ++
 drivers/cpufreq/arm_big_little.c | 41 ++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 235a1ba73d92..80fbfb32b5a9 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -6,6 +6,8 @@
 config ARM_BIG_LITTLE_CPUFREQ
 	tristate "Generic ARM big LITTLE CPUfreq driver"
 	depends on (ARM_CPU_TOPOLOGY || ARM64) && HAVE_CLK
+	# if CPU_THERMAL is on and THERMAL=m, ARM_BIT_LITTLE_CPUFREQ cannot be =y
+	depends on !CPU_THERMAL || THERMAL
 	select PM_OPP
 	help
 	  This enables the Generic CPUfreq driver for ARM big.LITTLE platforms.
diff --git a/drivers/cpufreq/arm_big_little.c b/drivers/cpufreq/arm_big_little.c
index c5d256caa664..c251247ae661 100644
--- a/drivers/cpufreq/arm_big_little.c
+++ b/drivers/cpufreq/arm_big_little.c
@@ -23,6 +23,7 @@
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
+#include <linux/cpu_cooling.h>
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
@@ -55,6 +56,7 @@ static bool bL_switching_enabled;
 #define ACTUAL_FREQ(cluster, freq)  ((cluster == A7_CLUSTER) ? freq << 1 : freq)
 #define VIRT_FREQ(cluster, freq)    ((cluster == A7_CLUSTER) ? freq >> 1 : freq)
 
+static struct thermal_cooling_device *cdev[MAX_CLUSTERS];
 static struct cpufreq_arm_bL_ops *arm_bL_ops;
 static struct clk *clk[MAX_CLUSTERS];
 static struct cpufreq_frequency_table *freq_table[MAX_CLUSTERS + 1];
@@ -493,6 +495,12 @@ static int bL_cpufreq_init(struct cpufreq_policy *policy)
 static int bL_cpufreq_exit(struct cpufreq_policy *policy)
 {
 	struct device *cpu_dev;
+	int cur_cluster = cpu_to_cluster(policy->cpu);
+
+	if (cur_cluster < MAX_CLUSTERS) {
+		cpufreq_cooling_unregister(cdev[cur_cluster]);
+		cdev[cur_cluster] = NULL;
+	}
 
 	cpu_dev = get_cpu_device(policy->cpu);
 	if (!cpu_dev) {
@@ -507,6 +515,38 @@ static int bL_cpufreq_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
+static void bL_cpufreq_ready(struct cpufreq_policy *policy)
+{
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
+	int cur_cluster = cpu_to_cluster(policy->cpu);
+	struct device_node *np;
+
+	/* Do not register a cpu_cooling device if we are in IKS mode */
+	if (cur_cluster >= MAX_CLUSTERS)
+		return;
+
+	np = of_node_get(cpu_dev->of_node);
+	if (WARN_ON(!np))
+		return;
+
+	if (of_find_property(np, "#cooling-cells", NULL)) {
+		u32 power_coefficient = 0;
+
+		of_property_read_u32(np, "dynamic-power-coefficient",
+				     &power_coefficient);
+
+		cdev[cur_cluster] = of_cpufreq_power_cooling_register(np,
+				policy->related_cpus, power_coefficient, NULL);
+		if (IS_ERR(cdev[cur_cluster])) {
+			dev_err(cpu_dev,
+				"running cpufreq without cooling device: %ld\n",
+				PTR_ERR(cdev[cur_cluster]));
+			cdev[cur_cluster] = NULL;
+		}
+	}
+	of_node_put(np);
+}
+
 static struct cpufreq_driver bL_cpufreq_driver = {
 	.name			= "arm-big-little",
 	.flags			= CPUFREQ_STICKY |
@@ -517,6 +557,7 @@ static struct cpufreq_driver bL_cpufreq_driver = {
 	.get			= bL_cpufreq_get_rate,
 	.init			= bL_cpufreq_init,
 	.exit			= bL_cpufreq_exit,
+	.ready			= bL_cpufreq_ready,
 	.attr			= cpufreq_generic_attr,
 };
 

From 790d849bf811a8ab5d4cd2cce0f6fda92f6aebf2 Mon Sep 17 00:00:00 2001
From: Jacob Tanenbaum <jtanenba@redhat.com>
Date: Thu, 19 Nov 2015 10:29:01 -0500
Subject: [PATCH 11/30] cpufreq: pcc-cpufreq: update default value of
 cpuinfo_transition_latency

The cpufreq documentation specifies

policy->cpuinfo.transition_latency   the time it takes on this CPU to
                                switch between two frequencies in
                                nanoseconds (if appropriate, else
                                specify CPUFREQ_ETERNAL)

currently pcc-cpufreq does not expose the value and sets it to zero. I
changed the pcc-cpufreq driver and it's documentation to conform to the
default value specified in Documentation/cpu-freq/cpu-drivers.txt

Signed-off-by: Jacob Tanenbaum <jtanenba@redhat.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/pcc-cpufreq.txt | 4 ++--
 drivers/cpufreq/pcc-cpufreq.c          | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Documentation/cpu-freq/pcc-cpufreq.txt b/Documentation/cpu-freq/pcc-cpufreq.txt
index 9e3c3b33514c..0a94224ad296 100644
--- a/Documentation/cpu-freq/pcc-cpufreq.txt
+++ b/Documentation/cpu-freq/pcc-cpufreq.txt
@@ -159,8 +159,8 @@ to be strictly associated with a P-state.
 
 2.2 cpuinfo_transition_latency:
 -------------------------------
-The cpuinfo_transition_latency field is 0. The PCC specification does
-not include a field to expose this value currently.
+The cpuinfo_transition_latency field is CPUFREQ_ETERNAL. The PCC specification
+does not include a field to expose this value currently.
 
 2.3 cpuinfo_cur_freq:
 ---------------------
diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c
index 2a0d58959acf..808a320e9d5d 100644
--- a/drivers/cpufreq/pcc-cpufreq.c
+++ b/drivers/cpufreq/pcc-cpufreq.c
@@ -555,6 +555,8 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	policy->min = policy->cpuinfo.min_freq =
 		ioread32(&pcch_hdr->minimum_frequency) * 1000;
 
+	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+
 	pr_debug("init: policy->max is %d, policy->min is %d\n",
 		policy->max, policy->min);
 out:

From 8ae1702a0df5e0730607b97fd9fd1f8066870832 Mon Sep 17 00:00:00 2001
From: Hongtao Jia <hongtao.jia@freescale.com>
Date: Thu, 26 Nov 2015 17:21:11 +0800
Subject: [PATCH 12/30] cpufreq: qoriq: Register cooling device based on device
 tree

Register the qoriq cpufreq driver as a cooling device, based on the
thermal device tree framework. When temperature crosses the passive trip
point cpufreq is used to throttle CPUs.

Signed-off-by: Jia Hongtao <hongtao.jia@freescale.com>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/qoriq-cpufreq.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/cpufreq/qoriq-cpufreq.c b/drivers/cpufreq/qoriq-cpufreq.c
index 358f0752c31e..b23e525a7af3 100644
--- a/drivers/cpufreq/qoriq-cpufreq.c
+++ b/drivers/cpufreq/qoriq-cpufreq.c
@@ -12,6 +12,7 @@
 
 #include <linux/clk.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_cooling.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -33,6 +34,7 @@
 struct cpu_data {
 	struct clk **pclk;
 	struct cpufreq_frequency_table *table;
+	struct thermal_cooling_device *cdev;
 };
 
 /**
@@ -321,6 +323,27 @@ static int qoriq_cpufreq_target(struct cpufreq_policy *policy,
 	return clk_set_parent(policy->clk, parent);
 }
 
+
+static void qoriq_cpufreq_ready(struct cpufreq_policy *policy)
+{
+	struct cpu_data *cpud = policy->driver_data;
+	struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
+
+	if (of_find_property(np, "#cooling-cells", NULL)) {
+		cpud->cdev = of_cpufreq_cooling_register(np,
+							 policy->related_cpus);
+
+		if (IS_ERR(cpud->cdev)) {
+			pr_err("Failed to register cooling device cpu%d: %ld\n",
+					policy->cpu, PTR_ERR(cpud->cdev));
+
+			cpud->cdev = NULL;
+		}
+	}
+
+	of_node_put(np);
+}
+
 static struct cpufreq_driver qoriq_cpufreq_driver = {
 	.name		= "qoriq_cpufreq",
 	.flags		= CPUFREQ_CONST_LOOPS,
@@ -329,6 +352,7 @@ static struct cpufreq_driver qoriq_cpufreq_driver = {
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= qoriq_cpufreq_target,
 	.get		= cpufreq_generic_get,
+	.ready		= qoriq_cpufreq_ready,
 	.attr		= cpufreq_generic_attr,
 };
 

From 9bb46b87d662ab704bd852db9916f0e51db3e94b Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Sun, 29 Nov 2015 16:31:35 +0800
Subject: [PATCH 13/30] cpufreq: mt8173: add CPUFREQ_HAVE_GOVERNOR_PER_POLICY
 flag

Add CPUFREQ_HAVE_GOVERNOR_PER_POLICY to have individual set of tunables
for each cluster of MT8173.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mt8173-cpufreq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index 83001dc5b646..c43810946446 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -469,7 +469,8 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver mt8173_cpufreq_driver = {
-	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+		 CPUFREQ_HAVE_GOVERNOR_PER_POLICY,
 	.verify = cpufreq_generic_frequency_table_verify,
 	.target_index = mtk_cpufreq_set_target,
 	.get = cpufreq_generic_get,

From 93625d52e7a74492416f77fed945ba34e0ae0c18 Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Sun, 29 Nov 2015 16:31:36 +0800
Subject: [PATCH 14/30] cpufreq: mt8173: remove redundant
 regulator_get_voltage() call

Remove redundant regulator_get_voltage() call to get Vsram value
since it will be obtained later at the beginning of voltage tracking
loop.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mt8173-cpufreq.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index c43810946446..750cda7876ac 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -59,7 +59,6 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 	int old_vproc, old_vsram, new_vsram, vsram, vproc, ret;
 
 	old_vproc = regulator_get_voltage(proc_reg);
-	old_vsram = regulator_get_voltage(sram_reg);
 	/* Vsram should not exceed the maximum allowed voltage of SoC. */
 	new_vsram = min(new_vproc + MIN_VOLT_SHIFT, MAX_VOLT_LIMIT);
 

From 40be4c3ccbf4078e2f8426a7962879b7a447cde4 Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Sun, 29 Nov 2015 16:31:37 +0800
Subject: [PATCH 15/30] cpufreq: mt8173: check return value of
 regulator_get_voltage() call

Sometimes regulator_get_voltage() call returns negative values for
reasons(e.g. underlying I2C bus timeout). Add check for the return
values and fail out early.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mt8173-cpufreq.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index 750cda7876ac..9d0fe37b4c3e 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -59,6 +59,10 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 	int old_vproc, old_vsram, new_vsram, vsram, vproc, ret;
 
 	old_vproc = regulator_get_voltage(proc_reg);
+	if (old_vproc < 0) {
+		pr_err("%s: invalid Vproc value: %d\n", __func__, old_vproc);
+		return old_vproc;
+	}
 	/* Vsram should not exceed the maximum allowed voltage of SoC. */
 	new_vsram = min(new_vproc + MIN_VOLT_SHIFT, MAX_VOLT_LIMIT);
 
@@ -71,7 +75,17 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 		 */
 		do {
 			old_vsram = regulator_get_voltage(sram_reg);
+			if (old_vsram < 0) {
+				pr_err("%s: invalid Vsram value: %d\n",
+				       __func__, old_vsram);
+				return old_vsram;
+			}
 			old_vproc = regulator_get_voltage(proc_reg);
+			if (old_vproc < 0) {
+				pr_err("%s: invalid Vproc value: %d\n",
+				       __func__, old_vproc);
+				return old_vproc;
+			}
 
 			vsram = min(new_vsram, old_vproc + MAX_VOLT_SHIFT);
 
@@ -116,7 +130,17 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 		 */
 		do {
 			old_vproc = regulator_get_voltage(proc_reg);
+			if (old_vproc < 0) {
+				pr_err("%s: invalid Vproc value: %d\n",
+				       __func__, old_vproc);
+				return old_vproc;
+			}
 			old_vsram = regulator_get_voltage(sram_reg);
+			if (old_vsram < 0) {
+				pr_err("%s: invalid Vsram value: %d\n",
+				       __func__, old_vsram);
+				return old_vsram;
+			}
 
 			vproc = max(new_vproc, old_vsram - MAX_VOLT_SHIFT);
 			ret = regulator_set_voltage(proc_reg, vproc,
@@ -184,6 +208,10 @@ static int mtk_cpufreq_set_target(struct cpufreq_policy *policy,
 
 	old_freq_hz = clk_get_rate(cpu_clk);
 	old_vproc = regulator_get_voltage(info->proc_reg);
+	if (old_vproc < 0) {
+		pr_err("%s: invalid Vproc value: %d\n", __func__, old_vproc);
+		return old_vproc;
+	}
 
 	freq_hz = freq_table[index].frequency * 1000;
 

From 157386b6fc1465f292b66c4133409033650ad335 Mon Sep 17 00:00:00 2001
From: Philippe Longepe <philippe.longepe@intel.com>
Date: Fri, 4 Dec 2015 17:40:30 +0100
Subject: [PATCH 16/30] cpufreq: intel_pstate: Configurable algorithm to get
 target pstate

Target systems using different cpus have different power and performance
requirements. They may use different algorithms to get the next P-state
based on their power or performance preference.

For example, power-constrained systems may not want to use
high-performance P-states as aggressively as a full-size desktop or a
server platform. A server platform may want to run close to the max to
achieve better performance, while laptop-like systems may prefer
sacrificing performance for longer battery lifes.

For the above reasons, modify intel_pstate to allow the target P-state
selection algorithm to be depend on the CPU ID.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Philippe Longepe <philippe.longepe@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4d07cbd2b23c..ff58029a56e2 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -66,6 +66,7 @@ static inline int ceiling_fp(int32_t x)
 
 struct sample {
 	int32_t core_pct_busy;
+	int32_t busy_scaled;
 	u64 aperf;
 	u64 mperf;
 	u64 tsc;
@@ -133,6 +134,7 @@ struct pstate_funcs {
 	int (*get_scaling)(void);
 	void (*set)(struct cpudata*, int pstate);
 	void (*get_vid)(struct cpudata *);
+	int32_t (*get_target_pstate)(struct cpudata *);
 };
 
 struct cpu_defaults {
@@ -140,6 +142,8 @@ struct cpu_defaults {
 	struct pstate_funcs funcs;
 };
 
+static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu);
+
 static struct pstate_adjust_policy pid_params;
 static struct pstate_funcs pstate_funcs;
 static int hwp_active;
@@ -738,6 +742,7 @@ static struct cpu_defaults core_params = {
 		.get_turbo = core_get_turbo_pstate,
 		.get_scaling = core_get_scaling,
 		.set = core_set_pstate,
+		.get_target_pstate = get_target_pstate_use_performance,
 	},
 };
 
@@ -758,6 +763,7 @@ static struct cpu_defaults silvermont_params = {
 		.set = atom_set_pstate,
 		.get_scaling = silvermont_get_scaling,
 		.get_vid = atom_get_vid,
+		.get_target_pstate = get_target_pstate_use_performance,
 	},
 };
 
@@ -778,6 +784,7 @@ static struct cpu_defaults airmont_params = {
 		.set = atom_set_pstate,
 		.get_scaling = airmont_get_scaling,
 		.get_vid = atom_get_vid,
+		.get_target_pstate = get_target_pstate_use_performance,
 	},
 };
 
@@ -797,6 +804,7 @@ static struct cpu_defaults knl_params = {
 		.get_turbo = knl_get_turbo_pstate,
 		.get_scaling = core_get_scaling,
 		.set = core_set_pstate,
+		.get_target_pstate = get_target_pstate_use_performance,
 	},
 };
 
@@ -922,7 +930,7 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
 	mod_timer_pinned(&cpu->timer, jiffies + delay);
 }
 
-static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
+static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 {
 	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
 	s64 duration_us;
@@ -960,30 +968,24 @@ static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
 		core_busy = mul_fp(core_busy, sample_ratio);
 	}
 
-	return core_busy;
+	cpu->sample.busy_scaled = core_busy;
+	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy);
 }
 
 static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
 {
-	int32_t busy_scaled;
-	struct _pid *pid;
-	signed int ctl;
-	int from;
+	int from, target_pstate;
 	struct sample *sample;
 
 	from = cpu->pstate.current_pstate;
 
-	pid = &cpu->pid;
-	busy_scaled = intel_pstate_get_scaled_busy(cpu);
+	target_pstate = pstate_funcs.get_target_pstate(cpu);
 
-	ctl = pid_calc(pid, busy_scaled);
-
-	/* Negative values of ctl increase the pstate and vice versa */
-	intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl, true);
+	intel_pstate_set_pstate(cpu, target_pstate, true);
 
 	sample = &cpu->sample;
 	trace_pstate_sample(fp_toint(sample->core_pct_busy),
-		fp_toint(busy_scaled),
+		fp_toint(sample->busy_scaled),
 		from,
 		cpu->pstate.current_pstate,
 		sample->mperf,
@@ -1237,6 +1239,8 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs)
 	pstate_funcs.get_scaling = funcs->get_scaling;
 	pstate_funcs.set       = funcs->set;
 	pstate_funcs.get_vid   = funcs->get_vid;
+	pstate_funcs.get_target_pstate = funcs->get_target_pstate;
+
 }
 
 #if IS_ENABLED(CONFIG_ACPI)

From e70eed2b64545ab5c9d2f4d43372d79762f1b985 Mon Sep 17 00:00:00 2001
From: Philippe Longepe <philippe.longepe@intel.com>
Date: Fri, 4 Dec 2015 17:40:32 +0100
Subject: [PATCH 17/30] cpufreq: intel_pstate: Account for non C0 time

The current function to calculate cpu utilization uses the average P-state
ratio (APerf/Mperf) scaled by the ratio of the current P-state to the
max available non-turbo one. This leads to an overestimation of
utilization which causes higher-performance P-states to be selected more
often and that leads to increased energy consumption.

This is a problem for low-power systems, so it is better to use a
different utilization calculation algorithm for them.

Namely, the Percent Busy value (or load) can be estimated as the ratio of the
MPERF counter that runs at a constant rate only during active periods (C0) to
the time stamp counter (TSC) that also runs (at the same rate) during idle.
That is:

Percent Busy = 100 * (delta_mperf / delta_tsc)

Use this algorithm for platforms with SoCs based on the Airmont and Silvermont
Atom cores.

Signed-off-by: Philippe Longepe <philippe.longepe@intel.com>
Signed-off-by: Stephane Gasparini <stephane.gasparini@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index ff58029a56e2..8bfebaeda2dd 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -143,6 +143,7 @@ struct cpu_defaults {
 };
 
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu);
+static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
 
 static struct pstate_adjust_policy pid_params;
 static struct pstate_funcs pstate_funcs;
@@ -763,7 +764,7 @@ static struct cpu_defaults silvermont_params = {
 		.set = atom_set_pstate,
 		.get_scaling = silvermont_get_scaling,
 		.get_vid = atom_get_vid,
-		.get_target_pstate = get_target_pstate_use_performance,
+		.get_target_pstate = get_target_pstate_use_cpu_load,
 	},
 };
 
@@ -784,7 +785,7 @@ static struct cpu_defaults airmont_params = {
 		.set = atom_set_pstate,
 		.get_scaling = airmont_get_scaling,
 		.get_vid = atom_get_vid,
-		.get_target_pstate = get_target_pstate_use_performance,
+		.get_target_pstate = get_target_pstate_use_cpu_load,
 	},
 };
 
@@ -890,12 +891,11 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
 	local_irq_save(flags);
 	rdmsrl(MSR_IA32_APERF, aperf);
 	rdmsrl(MSR_IA32_MPERF, mperf);
-	if (cpu->prev_mperf == mperf) {
+	tsc = rdtsc();
+	if ((cpu->prev_mperf == mperf) || (cpu->prev_tsc == tsc)) {
 		local_irq_restore(flags);
 		return;
 	}
-
-	tsc = rdtsc();
 	local_irq_restore(flags);
 
 	cpu->last_sample_time = cpu->sample.time;
@@ -930,6 +930,25 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
 	mod_timer_pinned(&cpu->timer, jiffies + delay);
 }
 
+static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
+{
+	struct sample *sample = &cpu->sample;
+	int32_t cpu_load;
+
+	/*
+	 * The load can be estimated as the ratio of the mperf counter
+	 * running at a constant frequency during active periods
+	 * (C0) and the time stamp counter running at the same frequency
+	 * also during C-states.
+	 */
+	cpu_load = div64_u64(int_tofp(100) * sample->mperf, sample->tsc);
+
+	cpu->sample.busy_scaled = cpu_load;
+
+	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load);
+}
+
+
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 {
 	int32_t core_busy, max_pstate, current_pstate, sample_ratio;

From 63d1d656a5232f2f189b217b50542eadcf9d74ae Mon Sep 17 00:00:00 2001
From: Philippe Longepe <philippe.longepe@intel.com>
Date: Fri, 4 Dec 2015 17:40:35 +0100
Subject: [PATCH 18/30] cpufreq: intel_pstate: Account for IO wait time

In cases where we have many IOs, the global load becomes low and the
load algorithm will decrease the requested P-State. Because of that,
the IOs overheads will increase and impact the IO performances.

To improve IO bound work, we can count the io-wait time as busy time
in calculating CPU busy.

This change uses get_cpu_iowait_time_us() to obtain the IO wait time value
and converts time into number of cycles spent waiting on IO at the TSC
rate. At the moment, this trick is only used for Atom.

Signed-off-by: Philippe Longepe <philippe.longepe@intel.com>
Signed-off-by: Stephane Gasparini <stephane.gasparini@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 8bfebaeda2dd..efc581392bd1 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -113,6 +113,7 @@ struct cpudata {
 	u64	prev_aperf;
 	u64	prev_mperf;
 	u64	prev_tsc;
+	u64	prev_cummulative_iowait;
 	struct sample sample;
 };
 
@@ -933,22 +934,39 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
 static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
 {
 	struct sample *sample = &cpu->sample;
+	u64 cummulative_iowait, delta_iowait_us;
+	u64 delta_iowait_mperf;
+	u64 mperf, now;
 	int32_t cpu_load;
 
+	cummulative_iowait = get_cpu_iowait_time_us(cpu->cpu, &now);
+
+	/*
+	 * Convert iowait time into number of IO cycles spent at max_freq.
+	 * IO is considered as busy only for the cpu_load algorithm. For
+	 * performance this is not needed since we always try to reach the
+	 * maximum P-State, so we are already boosting the IOs.
+	 */
+	delta_iowait_us = cummulative_iowait - cpu->prev_cummulative_iowait;
+	delta_iowait_mperf = div64_u64(delta_iowait_us * cpu->pstate.scaling *
+		cpu->pstate.max_pstate, MSEC_PER_SEC);
+
+	mperf = cpu->sample.mperf + delta_iowait_mperf;
+	cpu->prev_cummulative_iowait = cummulative_iowait;
+
+
 	/*
 	 * The load can be estimated as the ratio of the mperf counter
 	 * running at a constant frequency during active periods
 	 * (C0) and the time stamp counter running at the same frequency
 	 * also during C-states.
 	 */
-	cpu_load = div64_u64(int_tofp(100) * sample->mperf, sample->tsc);
-
+	cpu_load = div64_u64(int_tofp(100) * mperf, sample->tsc);
 	cpu->sample.busy_scaled = cpu_load;
 
 	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load);
 }
 
-
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 {
 	int32_t core_busy, max_pstate, current_pstate, sample_ratio;

From 89b56047f6f9b15fa3e9df3e34fa391835972ab7 Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Thu, 10 Dec 2015 11:48:13 +0800
Subject: [PATCH 19/30] cpufreq: mt8173: Move resources allocation into
 ->probe()

Since the return value of ->init() of cpufreq driver is not propagated
to the device driver model now, move resources allocation into
->probe() to handle -EPROBE_DEFER properly.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mt8173-cpufreq.c | 92 +++++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 24 deletions(-)

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index 9d0fe37b4c3e..fd601b92f5ec 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -41,16 +41,35 @@
  * the original PLL becomes stable at target frequency.
  */
 struct mtk_cpu_dvfs_info {
+	struct cpumask cpus;
 	struct device *cpu_dev;
 	struct regulator *proc_reg;
 	struct regulator *sram_reg;
 	struct clk *cpu_clk;
 	struct clk *inter_clk;
 	struct thermal_cooling_device *cdev;
+	struct list_head list_head;
 	int intermediate_voltage;
 	bool need_voltage_tracking;
 };
 
+static LIST_HEAD(dvfs_info_list);
+
+static struct mtk_cpu_dvfs_info *mtk_cpu_dvfs_info_lookup(int cpu)
+{
+	struct mtk_cpu_dvfs_info *info;
+	struct list_head *list;
+
+	list_for_each(list, &dvfs_info_list) {
+		info = list_entry(list, struct mtk_cpu_dvfs_info, list_head);
+
+		if (cpumask_test_cpu(cpu, &info->cpus))
+			return info;
+	}
+
+	return NULL;
+}
+
 static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info,
 					int new_vproc)
 {
@@ -402,6 +421,9 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu)
 	 */
 	info->need_voltage_tracking = !IS_ERR(sram_reg);
 
+	/* CPUs in the same cluster share a clock and power domain. */
+	cpumask_copy(&info->cpus, &cpu_topology[cpu].core_sibling);
+
 	return 0;
 
 out_free_opp_table:
@@ -440,22 +462,18 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy)
 	struct cpufreq_frequency_table *freq_table;
 	int ret;
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	ret = mtk_cpu_dvfs_info_init(info, policy->cpu);
-	if (ret) {
-		pr_err("%s failed to initialize dvfs info for cpu%d\n",
-		       __func__, policy->cpu);
-		goto out_free_dvfs_info;
+	info = mtk_cpu_dvfs_info_lookup(policy->cpu);
+	if (!info) {
+		pr_err("dvfs info for cpu%d is not initialized.\n",
+		       policy->cpu);
+		return -EINVAL;
 	}
 
 	ret = dev_pm_opp_init_cpufreq_table(info->cpu_dev, &freq_table);
 	if (ret) {
 		pr_err("failed to init cpufreq table for cpu%d: %d\n",
 		       policy->cpu, ret);
-		goto out_release_dvfs_info;
+		return ret;
 	}
 
 	ret = cpufreq_table_validate_and_show(policy, freq_table);
@@ -464,8 +482,7 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy)
 		goto out_free_cpufreq_table;
 	}
 
-	/* CPUs in the same cluster share a clock and power domain. */
-	cpumask_copy(policy->cpus, &cpu_topology[policy->cpu].core_sibling);
+	cpumask_copy(policy->cpus, &info->cpus);
 	policy->driver_data = info;
 	policy->clk = info->cpu_clk;
 
@@ -473,13 +490,6 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy)
 
 out_free_cpufreq_table:
 	dev_pm_opp_free_cpufreq_table(info->cpu_dev, &freq_table);
-
-out_release_dvfs_info:
-	mtk_cpu_dvfs_info_release(info);
-
-out_free_dvfs_info:
-	kfree(info);
-
 	return ret;
 }
 
@@ -489,8 +499,6 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy)
 
 	cpufreq_cooling_unregister(info->cdev);
 	dev_pm_opp_free_cpufreq_table(info->cpu_dev, &policy->freq_table);
-	mtk_cpu_dvfs_info_release(info);
-	kfree(info);
 
 	return 0;
 }
@@ -510,11 +518,47 @@ static struct cpufreq_driver mt8173_cpufreq_driver = {
 
 static int mt8173_cpufreq_probe(struct platform_device *pdev)
 {
-	int ret;
+	struct mtk_cpu_dvfs_info *info;
+	struct list_head *list, *tmp;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu) {
+		info = mtk_cpu_dvfs_info_lookup(cpu);
+		if (info)
+			continue;
+
+		info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+		if (!info) {
+			ret = -ENOMEM;
+			goto release_dvfs_info_list;
+		}
+
+		ret = mtk_cpu_dvfs_info_init(info, cpu);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"failed to initialize dvfs info for cpu%d\n",
+				cpu);
+			goto release_dvfs_info_list;
+		}
+
+		list_add(&info->list_head, &dvfs_info_list);
+	}
 
 	ret = cpufreq_register_driver(&mt8173_cpufreq_driver);
-	if (ret)
-		pr_err("failed to register mtk cpufreq driver\n");
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register mtk cpufreq driver\n");
+		goto release_dvfs_info_list;
+	}
+
+	return 0;
+
+release_dvfs_info_list:
+	list_for_each_safe(list, tmp, &dvfs_info_list) {
+		info = list_entry(list, struct mtk_cpu_dvfs_info, list_head);
+
+		mtk_cpu_dvfs_info_release(info);
+		list_del(list);
+	}
 
 	return ret;
 }

From ab0ea257fc58d8742f73f50fba3797dfe001aa3c Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 10 Dec 2015 09:42:16 +0000
Subject: [PATCH 20/30] cpufreq: st: Provide runtime initialised driver for
 ST's platforms

The bootloader is charged with the responsibility to provide platform
specific Dynamic Voltage and Frequency Scaling (DVFS) information via
Device Tree.  This driver takes the supplied configuration and
registers it with the new generic OPP framework, to then be used with
CPUFreq.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm   |  10 ++
 drivers/cpufreq/Makefile      |   1 +
 drivers/cpufreq/sti-cpufreq.c | 294 ++++++++++++++++++++++++++++++++++
 3 files changed, 305 insertions(+)
 create mode 100644 drivers/cpufreq/sti-cpufreq.c

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 80fbfb32b5a9..ff9be3661480 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -219,6 +219,16 @@ config ARM_SPEAR_CPUFREQ
 	help
 	  This adds the CPUFreq driver support for SPEAr SOCs.
 
+config ARM_STI_CPUFREQ
+	tristate "STi CPUFreq support"
+	depends on SOC_STIH407
+	help
+	  This driver uses the generic OPP framework to match the running
+	  platform with a predefined set of suitable values.  If not provided
+	  we will fall-back so safe-values contained in Device Tree.  Enable
+	  this config option if you wish to add CPUFreq support for STi based
+	  SoCs.
+
 config ARM_TEGRA20_CPUFREQ
 	bool "Tegra20 CPUFreq support"
 	depends on ARCH_TEGRA
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index c0af1a1281c8..9e63fb1b09f8 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -73,6 +73,7 @@ obj-$(CONFIG_ARM_SA1100_CPUFREQ)	+= sa1100-cpufreq.o
 obj-$(CONFIG_ARM_SA1110_CPUFREQ)	+= sa1110-cpufreq.o
 obj-$(CONFIG_ARM_SCPI_CPUFREQ)		+= scpi-cpufreq.o
 obj-$(CONFIG_ARM_SPEAR_CPUFREQ)		+= spear-cpufreq.o
+obj-$(CONFIG_ARM_STI_CPUFREQ)		+= sti-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA20_CPUFREQ)	+= tegra20-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA124_CPUFREQ)	+= tegra124-cpufreq.o
 obj-$(CONFIG_ARM_VEXPRESS_SPC_CPUFREQ)	+= vexpress-spc-cpufreq.o
diff --git a/drivers/cpufreq/sti-cpufreq.c b/drivers/cpufreq/sti-cpufreq.c
new file mode 100644
index 000000000000..a9c659f58974
--- /dev/null
+++ b/drivers/cpufreq/sti-cpufreq.c
@@ -0,0 +1,294 @@
+/*
+ * Match running platform with pre-defined OPP values for CPUFreq
+ *
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *         Lee Jones <lee.jones@linaro.org>
+ *
+ * Copyright (C) 2015 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License as
+ * published by the Free Software Foundation
+ */
+
+#include <linux/cpu.h>
+#include <linux/io.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/pm_opp.h>
+#include <linux/regmap.h>
+
+#define VERSION_ELEMENTS	3
+#define MAX_PCODE_NAME_LEN	7
+
+#define VERSION_SHIFT		28
+#define HW_INFO_INDEX		1
+#define MAJOR_ID_INDEX		1
+#define MINOR_ID_INDEX		2
+
+/*
+ * Only match on "suitable for ALL versions" entries
+ *
+ * This will be used with the BIT() macro.  It sets the
+ * top bit of a 32bit value and is equal to 0x80000000.
+ */
+#define DEFAULT_VERSION		31
+
+enum {
+	PCODE = 0,
+	SUBSTRATE,
+	DVFS_MAX_REGFIELDS,
+};
+
+/**
+ * ST CPUFreq Driver Data
+ *
+ * @cpu_node		CPU's OF node
+ * @syscfg_eng		Engineering Syscon register map
+ * @regmap		Syscon register map
+ */
+static struct sti_cpufreq_ddata {
+	struct device *cpu;
+	struct regmap *syscfg_eng;
+	struct regmap *syscfg;
+} ddata;
+
+static int sti_cpufreq_fetch_major(void) {
+	struct device_node *np = ddata.cpu->of_node;
+	struct device *dev = ddata.cpu;
+	unsigned int major_offset;
+	unsigned int socid;
+	int ret;
+
+	ret = of_property_read_u32_index(np, "st,syscfg",
+					 MAJOR_ID_INDEX, &major_offset);
+	if (ret) {
+		dev_err(dev, "No major number offset provided in %s [%d]\n",
+			np->full_name, ret);
+		return ret;
+	}
+
+	ret = regmap_read(ddata.syscfg, major_offset, &socid);
+	if (ret) {
+		dev_err(dev, "Failed to read major number from syscon [%d]\n",
+			ret);
+		return ret;
+	}
+
+	return ((socid >> VERSION_SHIFT) & 0xf) + 1;
+}
+
+static int sti_cpufreq_fetch_minor(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+	unsigned int minor_offset;
+	unsigned int minid;
+	int ret;
+
+	ret = of_property_read_u32_index(np, "st,syscfg-eng",
+					 MINOR_ID_INDEX, &minor_offset);
+	if (ret) {
+		dev_err(dev,
+			"No minor number offset provided %s [%d]\n",
+			np->full_name, ret);
+		return ret;
+	}
+
+	ret = regmap_read(ddata.syscfg_eng, minor_offset, &minid);
+	if (ret) {
+		dev_err(dev,
+			"Failed to read the minor number from syscon [%d]\n",
+			ret);
+		return ret;
+	}
+
+	return minid & 0xf;
+}
+
+static int sti_cpufreq_fetch_regmap_field(const struct reg_field *reg_fields,
+					  int hw_info_offset, int field)
+{
+	struct regmap_field *regmap_field;
+	struct reg_field reg_field = reg_fields[field];
+	struct device *dev = ddata.cpu;
+	unsigned int value;
+	int ret;
+
+	reg_field.reg = hw_info_offset;
+	regmap_field = devm_regmap_field_alloc(dev,
+					       ddata.syscfg_eng,
+					       reg_field);
+	if (IS_ERR(regmap_field)) {
+		dev_err(dev, "Failed to allocate reg field\n");
+		return PTR_ERR(regmap_field);
+	}
+
+	ret = regmap_field_read(regmap_field, &value);
+	if (ret) {
+		dev_err(dev, "Failed to read %s code\n",
+			field ? "SUBSTRATE" : "PCODE");
+		return ret;
+	}
+
+	return value;
+}
+
+static const struct reg_field sti_stih407_dvfs_regfields[DVFS_MAX_REGFIELDS] = {
+	[PCODE]		= REG_FIELD(0, 16, 19),
+	[SUBSTRATE]	= REG_FIELD(0, 0, 2),
+};
+
+static const struct reg_field *sti_cpufreq_match(void)
+{
+	if (of_machine_is_compatible("st,stih407") ||
+	    of_machine_is_compatible("st,stih410"))
+		return sti_stih407_dvfs_regfields;
+
+	return NULL;
+}
+
+static int sti_cpufreq_set_opp_info(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+	const struct reg_field *reg_fields;
+	unsigned int hw_info_offset;
+	unsigned int version[VERSION_ELEMENTS];
+	int pcode, substrate, major, minor;
+	int ret;
+	char name[MAX_PCODE_NAME_LEN];
+
+	reg_fields = sti_cpufreq_match();
+	if (!reg_fields) {
+		dev_err(dev, "This SoC doesn't support voltage scaling");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_u32_index(np, "st,syscfg-eng",
+					 HW_INFO_INDEX, &hw_info_offset);
+	if (ret) {
+		dev_warn(dev, "Failed to read HW info offset from DT\n");
+		substrate = DEFAULT_VERSION;
+		pcode = 0;
+		goto use_defaults;
+	}
+
+	pcode = sti_cpufreq_fetch_regmap_field(reg_fields,
+					       hw_info_offset,
+					       PCODE);
+	if (pcode < 0) {
+		dev_warn(dev, "Failed to obtain process code\n");
+		/* Use default pcode */
+		pcode = 0;
+	}
+
+	substrate = sti_cpufreq_fetch_regmap_field(reg_fields,
+						   hw_info_offset,
+						   SUBSTRATE);
+	if (substrate) {
+		dev_warn(dev, "Failed to obtain substrate code\n");
+		/* Use default substrate */
+		substrate = DEFAULT_VERSION;
+	}
+
+use_defaults:
+	major = sti_cpufreq_fetch_major();
+	if (major < 0) {
+		dev_err(dev, "Failed to obtain major version\n");
+		/* Use default major number */
+		major = DEFAULT_VERSION;
+	}
+
+	minor = sti_cpufreq_fetch_minor();
+	if (minor < 0) {
+		dev_err(dev, "Failed to obtain minor version\n");
+		/* Use default minor number */
+		minor = DEFAULT_VERSION;
+	}
+
+	snprintf(name, MAX_PCODE_NAME_LEN, "pcode%d", pcode);
+
+	ret = dev_pm_opp_set_prop_name(dev, name);
+	if (ret) {
+		dev_err(dev, "Failed to set prop name\n");
+		return ret;
+	}
+
+	version[0] = BIT(major);
+	version[1] = BIT(minor);
+	version[2] = BIT(substrate);
+
+	ret = dev_pm_opp_set_supported_hw(dev, version, VERSION_ELEMENTS);
+	if (ret) {
+		dev_err(dev, "Failed to set supported hardware\n");
+		return ret;
+	}
+
+	dev_dbg(dev, "pcode: %d major: %d minor: %d substrate: %d\n",
+		pcode, major, minor, substrate);
+	dev_dbg(dev, "version[0]: %x version[1]: %x version[2]: %x\n",
+		version[0], version[1], version[2]);
+
+	return 0;
+}
+
+static int sti_cpufreq_fetch_syscon_regsiters(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+
+	ddata.syscfg = syscon_regmap_lookup_by_phandle(np, "st,syscfg");
+	if (IS_ERR(ddata.syscfg)) {
+		dev_err(dev,  "\"st,syscfg\" not supplied\n");
+		return PTR_ERR(ddata.syscfg);
+	}
+
+	ddata.syscfg_eng = syscon_regmap_lookup_by_phandle(np, "st,syscfg-eng");
+	if (IS_ERR(ddata.syscfg_eng)) {
+		dev_err(dev, "\"st,syscfg-eng\" not supplied\n");
+		return PTR_ERR(ddata.syscfg_eng);
+	}
+
+	return 0;
+}
+
+static int sti_cpufreq_init(void)
+{
+	int ret;
+
+	ddata.cpu = get_cpu_device(0);
+	if (!ddata.cpu) {
+		dev_err(ddata.cpu, "Failed to get device for CPU0\n");
+		goto skip_voltage_scaling;
+	}
+
+	if (!of_get_property(ddata.cpu->of_node, "operating-points-v2", NULL)) {
+		dev_err(ddata.cpu, "OPP-v2 not supported\n");
+		goto skip_voltage_scaling;
+	}
+
+	ret = sti_cpufreq_fetch_syscon_regsiters();
+	if (ret)
+		goto skip_voltage_scaling;
+
+	ret = sti_cpufreq_set_opp_info();
+	if (!ret)
+		goto register_cpufreq_dt;
+
+skip_voltage_scaling:
+	dev_err(ddata.cpu, "Not doing voltage scaling\n");
+
+register_cpufreq_dt:
+	platform_device_register_simple("cpufreq-dt", -1, NULL, 0);
+
+	return 0;
+}
+module_init(sti_cpufreq_init);
+
+MODULE_DESCRIPTION("STMicroelectronics CPUFreq/OPP driver");
+MODULE_AUTHOR("Ajitpal Singh <ajitpal.singh@st.com>");
+MODULE_AUTHOR("Lee Jones <lee.jones@linaro.org>");
+MODULE_LICENSE("GPL v2");

From b122bcd94743239cc26a5732fef87b28d7f5c22a Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 10 Dec 2015 09:42:17 +0000
Subject: [PATCH 21/30] dt: cpufreq: st: Provide bindings for ST's CPUFreq
 implementation

Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../bindings/cpufreq/cpufreq-st.txt           | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt

diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt
new file mode 100644
index 000000000000..d91a02a3b6b0
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt
@@ -0,0 +1,91 @@
+Binding for ST's CPUFreq driver
+===============================
+
+ST's CPUFreq driver attempts to read 'process' and 'version' attributes
+from the SoC, then supplies the OPP framework with 'prop' and 'supported
+hardware' information respectively.  The framework is then able to read
+the DT and operate in the usual way.
+
+For more information about the expected DT format [See: ../opp/opp.txt].
+
+Frequency Scaling only
+----------------------
+
+No vendor specific driver required for this.
+
+Located in CPU's node:
+
+- operating-points		: [See: ../power/opp.txt]
+
+Example [safe]
+--------------
+
+cpus {
+	cpu@0 {
+				 /* kHz     uV   */
+		operating-points = <1500000 0
+				    1200000 0
+				    800000  0
+				    500000  0>;
+	};
+};
+
+Dynamic Voltage and Frequency Scaling (DVFS)
+--------------------------------------------
+
+This requires the ST CPUFreq driver to supply 'process' and 'version' info.
+
+Located in CPU's node:
+
+- operating-points-v2		: [See ../power/opp.txt]
+
+Example [unsafe]
+----------------
+
+cpus {
+	cpu@0 {
+		operating-points-v2	= <&cpu0_opp_table>;
+	};
+};
+
+cpu0_opp_table: opp_table {
+	compatible = "operating-points-v2";
+
+	/* ############################################################### */
+	/* # WARNING: Do not attempt to copy/replicate these nodes,      # */
+	/* #          they are only to be supplied by the bootloader !!! # */
+	/* ############################################################### */
+	opp0 {
+		/*			   Major       Minor       Substrate */
+		/*			   2           all         all       */
+		opp-supported-hw	= <0x00000004  0xffffffff  0xffffffff>;
+		opp-hz			= /bits/ 64 <1500000000>;
+		clock-latency-ns	= <10000000>;
+
+		opp-microvolt-pcode0	= <1200000>;
+		opp-microvolt-pcode1	= <1200000>;
+		opp-microvolt-pcode2	= <1200000>;
+		opp-microvolt-pcode3	= <1200000>;
+		opp-microvolt-pcode4	= <1170000>;
+		opp-microvolt-pcode5	= <1140000>;
+		opp-microvolt-pcode6	= <1100000>;
+		opp-microvolt-pcode7	= <1070000>;
+	};
+
+	opp1 {
+		/*			   Major       Minor       Substrate */
+		/*			   all         all         all       */
+		opp-supported-hw	= <0xffffffff  0xffffffff  0xffffffff>;
+		opp-hz			= /bits/ 64 <1200000000>;
+		clock-latency-ns	= <10000000>;
+
+		opp-microvolt-pcode0	= <1110000>;
+		opp-microvolt-pcode1	= <1150000>;
+		opp-microvolt-pcode2	= <1100000>;
+		opp-microvolt-pcode3	= <1080000>;
+		opp-microvolt-pcode4	= <1040000>;
+		opp-microvolt-pcode5	= <1020000>;
+		opp-microvolt-pcode6	= <980000>;
+		opp-microvolt-pcode7	= <930000>;
+	};
+};

From a5810b4f078f4a41ff47668d7e3351b903ab959f Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Mon, 21 Dec 2015 21:56:27 +0100
Subject: [PATCH 22/30] blackfin-cpufreq: Change return type of cpu_set_cclk()
 to that of clk_set_rate()

The return type "unsigned long" was used by the cpu_set_cclk() function
while the type "int" is provided by the clk_set_rate() function.
Let us make this usage consistent.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/blackfin-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/blackfin-cpufreq.c b/drivers/cpufreq/blackfin-cpufreq.c
index a9f8e5bd0716..2a6f3ac858d2 100644
--- a/drivers/cpufreq/blackfin-cpufreq.c
+++ b/drivers/cpufreq/blackfin-cpufreq.c
@@ -112,7 +112,7 @@ static unsigned int bfin_getfreq_khz(unsigned int cpu)
 }
 
 #ifdef CONFIG_BF60x
-unsigned long cpu_set_cclk(int cpu, unsigned long new)
+int cpu_set_cclk(int cpu, unsigned long new)
 {
 	struct clk *clk;
 	int ret;

From d23f8cadf99b553eaa78544c53e96f3aec456530 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Mon, 21 Dec 2015 22:12:26 +0100
Subject: [PATCH 23/30] blackfin-cpufreq: Mark cpu_set_cclk() as static

The cpu_set_cclk() function was only used in a single source file so far.
Indicate this setting also by the corresponding linkage specifier.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/blackfin-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/blackfin-cpufreq.c b/drivers/cpufreq/blackfin-cpufreq.c
index 2a6f3ac858d2..12e97d8a9db0 100644
--- a/drivers/cpufreq/blackfin-cpufreq.c
+++ b/drivers/cpufreq/blackfin-cpufreq.c
@@ -112,7 +112,7 @@ static unsigned int bfin_getfreq_khz(unsigned int cpu)
 }
 
 #ifdef CONFIG_BF60x
-int cpu_set_cclk(int cpu, unsigned long new)
+static int cpu_set_cclk(int cpu, unsigned long new)
 {
 	struct clk *clk;
 	int ret;

From 41669da03060c5e55e9d3061ddede624f01d7262 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 27 Dec 2015 00:23:48 +0100
Subject: [PATCH 24/30] cpufreq: Make cpufreq_boost_supported() static

cpufreq_boost_supported() is not used outside of cpufreq.c, so make
it static.

While at it, refactor it as a one-liner (which it really is).

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 8 ++------
 include/linux/cpufreq.h   | 5 -----
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 8412ce5f93a7..49f3f58f2501 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2330,14 +2330,10 @@ int cpufreq_boost_trigger_state(int state)
 	return ret;
 }
 
-int cpufreq_boost_supported(void)
+static bool cpufreq_boost_supported(void)
 {
-	if (likely(cpufreq_driver))
-		return cpufreq_driver->boost_supported;
-
-	return 0;
+	return likely(cpufreq_driver) && cpufreq_driver->boost_supported;
 }
-EXPORT_SYMBOL_GPL(cpufreq_boost_supported);
 
 static int create_boost_sysfs_file(void)
 {
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 177c7680c1a8..f859b728d98e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -574,7 +574,6 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf);
 
 #ifdef CONFIG_CPU_FREQ
 int cpufreq_boost_trigger_state(int state);
-int cpufreq_boost_supported(void);
 int cpufreq_boost_enabled(void);
 int cpufreq_enable_boost_support(void);
 bool policy_has_boost_freq(struct cpufreq_policy *policy);
@@ -583,10 +582,6 @@ static inline int cpufreq_boost_trigger_state(int state)
 {
 	return 0;
 }
-static inline int cpufreq_boost_supported(void)
-{
-	return 0;
-}
 static inline int cpufreq_boost_enabled(void)
 {
 	return 0;

From 17135782b816383b426d280c350467b23ef9c10c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 27 Dec 2015 00:25:35 +0100
Subject: [PATCH 25/30] cpufreq: acpi-cpufreq: Simplify boost-related code

The store_boost() routine is only used by store_cpb(), so move
the code from it directly to that function and rename _store_boost()
to set_boost() to make its name reflect the name of the driver
callback pointing to it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/acpi-cpufreq.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index cec1ee2d2f74..12da54a28a42 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -135,7 +135,7 @@ static void boost_set_msrs(bool enable, const struct cpumask *cpumask)
 	wrmsr_on_cpus(cpumask, msr_addr, msrs);
 }
 
-static int _store_boost(int val)
+static int set_boost(int val)
 {
 	get_online_cpus();
 	boost_set_msrs(val, cpu_online_mask);
@@ -158,29 +158,24 @@ static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf)
 cpufreq_freq_attr_ro(freqdomain_cpus);
 
 #ifdef CONFIG_X86_ACPI_CPUFREQ_CPB
-static ssize_t store_boost(const char *buf, size_t count)
+static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
+			 size_t count)
 {
 	int ret;
-	unsigned long val = 0;
+	unsigned int val = 0;
 
 	if (!acpi_cpufreq_driver.boost_supported)
 		return -EINVAL;
 
-	ret = kstrtoul(buf, 10, &val);
-	if (ret || (val > 1))
+	ret = kstrtouint(buf, 10, &val);
+	if (ret || val > 1)
 		return -EINVAL;
 
-	_store_boost((int) val);
+	set_boost(val);
 
 	return count;
 }
 
-static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
-			 size_t count)
-{
-	return store_boost(buf, count);
-}
-
 static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
 {
 	return sprintf(buf, "%u\n", acpi_cpufreq_driver.boost_enabled);
@@ -905,7 +900,7 @@ static struct cpufreq_driver acpi_cpufreq_driver = {
 	.resume		= acpi_cpufreq_resume,
 	.name		= "acpi-cpufreq",
 	.attr		= acpi_cpufreq_attr,
-	.set_boost      = _store_boost,
+	.set_boost      = set_boost,
 };
 
 static void __init acpi_cpufreq_boost_init(void)

From 7a6c79f2fe53dac9b7b290f3a4c56b0958e19201 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 27 Dec 2015 00:27:38 +0100
Subject: [PATCH 26/30] cpufreq: Simplify core code related to boost support

Notice that the boost_supported field in struct cpufreq_driver is
redundant, because the driver's ->set_boost callback may be left
unset if "boost" is not supported.  Moreover, the only driver
populating the ->set_boost callback is acpi_cpufreq, so make it
avoid populating that callback if "boost" is not supported, rework
the core to check ->set_boost instead of boost_supported to
verify "boost" support and drop boost_supported which isn't
used any more.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/acpi-cpufreq.c |  5 ++---
 drivers/cpufreq/cpufreq.c      | 22 +++++++---------------
 include/linux/cpufreq.h        |  1 -
 3 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 12da54a28a42..51eef87bbc37 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -164,7 +164,7 @@ static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
 	int ret;
 	unsigned int val = 0;
 
-	if (!acpi_cpufreq_driver.boost_supported)
+	if (!acpi_cpufreq_driver.set_boost)
 		return -EINVAL;
 
 	ret = kstrtouint(buf, 10, &val);
@@ -900,7 +900,6 @@ static struct cpufreq_driver acpi_cpufreq_driver = {
 	.resume		= acpi_cpufreq_resume,
 	.name		= "acpi-cpufreq",
 	.attr		= acpi_cpufreq_attr,
-	.set_boost      = set_boost,
 };
 
 static void __init acpi_cpufreq_boost_init(void)
@@ -911,7 +910,7 @@ static void __init acpi_cpufreq_boost_init(void)
 		if (!msrs)
 			return;
 
-		acpi_cpufreq_driver.boost_supported = true;
+		acpi_cpufreq_driver.set_boost = set_boost;
 		acpi_cpufreq_driver.boost_enabled = boost_state(0);
 
 		cpu_notifier_register_begin();
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 49f3f58f2501..c35e7da1ed7a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2332,23 +2332,13 @@ int cpufreq_boost_trigger_state(int state)
 
 static bool cpufreq_boost_supported(void)
 {
-	return likely(cpufreq_driver) && cpufreq_driver->boost_supported;
+	return likely(cpufreq_driver) && cpufreq_driver->set_boost;
 }
 
 static int create_boost_sysfs_file(void)
 {
 	int ret;
 
-	if (!cpufreq_boost_supported())
-		return 0;
-
-	/*
-	 * Check if driver provides function to enable boost -
-	 * if not, use cpufreq_boost_set_sw as default
-	 */
-	if (!cpufreq_driver->set_boost)
-		cpufreq_driver->set_boost = cpufreq_boost_set_sw;
-
 	ret = sysfs_create_file(cpufreq_global_kobject, &boost.attr);
 	if (ret)
 		pr_err("%s: cannot register global BOOST sysfs file\n",
@@ -2371,7 +2361,7 @@ int cpufreq_enable_boost_support(void)
 	if (cpufreq_boost_supported())
 		return 0;
 
-	cpufreq_driver->boost_supported = true;
+	cpufreq_driver->set_boost = cpufreq_boost_set_sw;
 
 	/* This will get removed on driver unregister */
 	return create_boost_sysfs_file();
@@ -2431,9 +2421,11 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	if (driver_data->setpolicy)
 		driver_data->flags |= CPUFREQ_CONST_LOOPS;
 
-	ret = create_boost_sysfs_file();
-	if (ret)
-		goto err_null_driver;
+	if (cpufreq_boost_supported()) {
+		ret = create_boost_sysfs_file();
+		if (ret)
+			goto err_null_driver;
+	}
 
 	ret = subsys_interface_register(&cpufreq_interface);
 	if (ret)
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index f859b728d98e..88a4215125bc 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -278,7 +278,6 @@ struct cpufreq_driver {
 	struct freq_attr **attr;
 
 	/* platform specific boost support code */
-	bool		boost_supported;
 	bool		boost_enabled;
 	int		(*set_boost)(int state);
 };

From a889331d759453fa7f424330f75ae4e2b9e02db4 Mon Sep 17 00:00:00 2001
From: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Date: Sun, 27 Dec 2015 14:21:57 +0800
Subject: [PATCH 27/30] cpufreq: mt8173: migrate to use operating-points-v2
 bindings

Modify mt8173-cpufreq driver to get OPP-sharing information and set up
OPP table provided by operating-points-v2 bindings.

Signed-off-by: Pi-Cheng Chen <pi-cheng.chen@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mt8173-cpufreq.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c
index fd601b92f5ec..1efba340456d 100644
--- a/drivers/cpufreq/mt8173-cpufreq.c
+++ b/drivers/cpufreq/mt8173-cpufreq.c
@@ -390,7 +390,15 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu)
 	/* Both presence and absence of sram regulator are valid cases. */
 	sram_reg = regulator_get_exclusive(cpu_dev, "sram");
 
-	ret = dev_pm_opp_of_add_table(cpu_dev);
+	/* Get OPP-sharing information from "operating-points-v2" bindings */
+	ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, &info->cpus);
+	if (ret) {
+		pr_err("failed to get OPP-sharing information for cpu%d\n",
+		       cpu);
+		goto out_free_resources;
+	}
+
+	ret = dev_pm_opp_of_cpumask_add_table(&info->cpus);
 	if (ret) {
 		pr_warn("no OPP table for cpu%d\n", cpu);
 		goto out_free_resources;
@@ -421,13 +429,10 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu)
 	 */
 	info->need_voltage_tracking = !IS_ERR(sram_reg);
 
-	/* CPUs in the same cluster share a clock and power domain. */
-	cpumask_copy(&info->cpus, &cpu_topology[cpu].core_sibling);
-
 	return 0;
 
 out_free_opp_table:
-	dev_pm_opp_of_remove_table(cpu_dev);
+	dev_pm_opp_of_cpumask_remove_table(&info->cpus);
 
 out_free_resources:
 	if (!IS_ERR(proc_reg))
@@ -453,7 +458,7 @@ static void mtk_cpu_dvfs_info_release(struct mtk_cpu_dvfs_info *info)
 	if (!IS_ERR(info->inter_clk))
 		clk_put(info->inter_clk);
 
-	dev_pm_opp_of_remove_table(info->cpu_dev);
+	dev_pm_opp_of_cpumask_remove_table(&info->cpus);
 }
 
 static int mtk_cpufreq_init(struct cpufreq_policy *policy)

From 0df35026c6a527e65b53bc895ad672d648a248f3 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 16 Dec 2015 12:20:29 +0800
Subject: [PATCH 28/30] cpufreq: governor: Fix negative idle_time when
 configured with CONFIG_HZ_PERIODIC

It is reported that, with CONFIG_HZ_PERIODIC=y cpu stays at the
lowest frequency even if the usage goes to 100%, neither ondemand
nor conservative governor works, however performance and
userspace work as expected. If set with CONFIG_NO_HZ_FULL=y,
everything goes well.

This problem is caused by improper calculation of the idle_time
when the load is extremely high(near 100%). Firstly, cpufreq_governor
uses get_cpu_idle_time to get the total idle time for specific cpu, then:

1.If the system is configured with CONFIG_NO_HZ_FULL, the idle time is
  returned by ktime_get, which is always increasing, it's OK.
2.However, if the system is configured with CONFIG_HZ_PERIODIC,
  get_cpu_idle_time might not guarantee to be always increasing,
  because it will leverage get_cpu_idle_time_jiffy to calculate the
  idle_time, consider the following scenario:

At T1:
idle_tick_1 = total_tick_1 - user_tick_1

sample period(80ms)...

At T2: ( T2 = T1 + 80ms):
idle_tick_2 = total_tick_2 - user_tick_2

Currently the algorithm is using (idle_tick_2 - idle_tick_1) to
get the delta idle_time during the past sample period, however
it CAN NOT guarantee that idle_tick_2 >= idle_tick_1, especially
when cpu load is high.
(Yes, total_tick_2 >= total_tick_1, and user_tick_2 >= user_tick_1,
but how about idle_tick_2 and idle_tick_1? No guarantee.)
So governor might get a negative value of idle_time during the past
sample period, which might mislead the system that the idle time is
very big(converted to unsigned int), and the busy time is nearly zero,
which causes the governor to always choose the lowest cpufreq,
then cause this problem.

In theory there are two solutions:

1.The logic should not rely on the idle tick during every sample period,
  but be based on the busy tick directly, as this is how 'top' is
  implemented.

2.Or the logic must make sure that the idle_time is strictly increasing
  during each sample period, then there would be no negative idle_time
  anymore. This solution requires minimum modification to current code
  and this patch uses method 2.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=69821
Reported-by: Jan Fikar <j.fikar@gmail.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_governor.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 4de12fd35b1f..bab3a514ec12 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -84,6 +84,9 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 			(cur_wall_time - j_cdbs->prev_cpu_wall);
 		j_cdbs->prev_cpu_wall = cur_wall_time;
 
+		if (cur_idle_time < j_cdbs->prev_cpu_idle)
+			cur_idle_time = j_cdbs->prev_cpu_idle;
+
 		idle_time = (unsigned int)
 			(cur_idle_time - j_cdbs->prev_cpu_idle);
 		j_cdbs->prev_cpu_idle = cur_idle_time;

From 929ca89c305a6ed7a4149115be99af6d73c36918 Mon Sep 17 00:00:00 2001
From: Andrzej Hajda <a.hajda@samsung.com>
Date: Wed, 30 Dec 2015 12:18:42 +0100
Subject: [PATCH 29/30] cpufreq-dt: fix handling regulator_get_voltage() result

The function can return negative values so it should be assigned
to signed type.

The problem has been detected using proposed semantic patch
scripts/coccinelle/tests/unsigned_lesser_than_zero.cocci.

Link: http://permalink.gmane.org/gmane.linux.kernel/2038576
Signed-off-by: Andrzej Hajda <a.hajda@samsung.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 1ceece9d6711..9bc37c437874 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -50,7 +50,8 @@ static int set_target(struct cpufreq_policy *policy, unsigned int index)
 	struct private_data *priv = policy->driver_data;
 	struct device *cpu_dev = priv->cpu_dev;
 	struct regulator *cpu_reg = priv->cpu_reg;
-	unsigned long volt = 0, volt_old = 0, tol = 0;
+	unsigned long volt = 0, tol = 0;
+	int volt_old = 0;
 	unsigned int old_freq, new_freq;
 	long freq_Hz, freq_exact;
 	int ret;
@@ -83,7 +84,7 @@ static int set_target(struct cpufreq_policy *policy, unsigned int index)
 			opp_freq / 1000, volt);
 	}
 
-	dev_dbg(cpu_dev, "%u MHz, %ld mV --> %u MHz, %ld mV\n",
+	dev_dbg(cpu_dev, "%u MHz, %d mV --> %u MHz, %ld mV\n",
 		old_freq / 1000, (volt_old > 0) ? volt_old / 1000 : -1,
 		new_freq / 1000, volt ? volt / 1000 : -1);
 

From a032d2de0b5f17631844b34481c61cb799d0af6b Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 30 Dec 2015 17:45:19 -0800
Subject: [PATCH 30/30] Documentation: cpufreq: intel_pstate: enhance
 documentation

This is an attempt to make documentation more user friendly.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Doug Smythies <dsmythies@telus.net>
Reviewed-by: Chen, Yu C <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/intel-pstate.txt | 233 ++++++++++++++++++++----
 1 file changed, 195 insertions(+), 38 deletions(-)

diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
index be8d4006bf76..f7b12c071d53 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -1,61 +1,131 @@
-Intel P-state driver
+Intel P-State driver
 --------------------
 
-This driver provides an interface to control the P state selection for
-SandyBridge+ Intel processors.  The driver can operate two different
-modes based on the processor model, legacy mode and Hardware P state (HWP)
-mode.
+This driver provides an interface to control the P-State selection for the
+SandyBridge+ Intel processors.
 
-In legacy mode, the Intel P-state implements two internal governors,
-performance and powersave, that differ from the general cpufreq governors of
-the same name (the general cpufreq governors implement target(), whereas the
-internal Intel P-state governors implement setpolicy()).  The internal
-performance governor sets the max_perf_pct and min_perf_pct to 100; that is,
-the governor selects the highest available P state to maximize the performance
-of the core.  The internal powersave governor selects the appropriate P state
-based on the current load on the CPU.
+The following document explains P-States:
+http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
+As stated in the document, P-State doesn’t exactly mean a frequency. However, for
+the sake of the relationship with cpufreq, P-State and frequency are used
+interchangeably.
 
-In HWP mode P state selection is implemented in the processor
-itself. The driver provides the interfaces between the cpufreq core and
-the processor to control P state selection based on user preferences
-and reporting frequency to the cpufreq core.  In this mode the
-internal Intel P-state governor code is disabled.
+Understanding the cpufreq core governors and policies are important before
+discussing more details about the Intel P-State driver. Based on what callbacks
+a cpufreq driver provides to the cpufreq core, it can support two types of
+drivers:
+- with target_index() callback: In this mode, the drivers using cpufreq core
+simply provide the minimum and maximum frequency limits and an additional
+interface target_index() to set the current frequency. The cpufreq subsystem
+has a number of scaling governors ("performance", "powersave", "ondemand",
+etc.). Depending on which governor is in use, cpufreq core will call for
+transitions to a specific frequency using target_index() callback.
+- setpolicy() callback: In this mode, drivers do not provide target_index()
+callback, so cpufreq core can't request a transition to a specific frequency.
+The driver provides minimum and maximum frequency limits and callbacks to set a
+policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
+The cpufreq core can request the driver to operate in any of the two policies:
+"performance: and "powersave". The driver decides which frequency to use based
+on the above policy selection considering minimum and maximum frequency limits.
 
-In addition to the interfaces provided by the cpufreq core for
-controlling frequency the driver provides sysfs files for
-controlling P state selection. These files have been added to
-/sys/devices/system/cpu/intel_pstate/
+The Intel P-State driver falls under the latter category, which implements the
+setpolicy() callback. This driver decides what P-State to use based on the
+requested policy from the cpufreq core. If the processor is capable of
+selecting its next P-State internally, then the driver will offload this
+responsibility to the processor (aka HWP: Hardware P-States). If not, the
+driver implements algorithms to select the next P-State.
 
-      max_perf_pct: limits the maximum P state that will be requested by
-      the driver stated as a percentage of the available performance. The
-      available (P states) performance may be reduced by the no_turbo
+Since these policies are implemented in the driver, they are not same as the
+cpufreq scaling governors implementation, even if they have the same name in
+the cpufreq sysfs (scaling_governors). For example the "performance" policy is
+similar to cpufreq’s "performance" governor, but "powersave" is completely
+different than the cpufreq "powersave" governor. The strategy here is similar
+to cpufreq "ondemand", where the requested P-State is related to the system load.
+
+Sysfs Interface
+
+In addition to the frequency-controlling interfaces provided by the cpufreq
+core, the driver provides its own sysfs files to control the P-State selection.
+These files have been added to /sys/devices/system/cpu/intel_pstate/.
+Any changes made to these files are applicable to all CPUs (even in a
+multi-package system).
+
+      max_perf_pct: Limits the maximum P-State that will be requested by
+      the driver. It states it as a percentage of the available performance. The
+      available (P-State) performance may be reduced by the no_turbo
       setting described below.
 
-      min_perf_pct: limits the minimum P state that will be  requested by
-      the driver stated as a percentage of the max (non-turbo)
+      min_perf_pct: Limits the minimum P-State that will be requested by
+      the driver. It states it as a percentage of the max (non-turbo)
       performance level.
 
-      no_turbo: limits the driver to selecting P states below the turbo
+      no_turbo: Limits the driver to selecting P-State below the turbo
       frequency range.
 
-      turbo_pct: displays the percentage of the total performance that
-      is supported by hardware that is in the turbo range.  This number
+      turbo_pct: Displays the percentage of the total performance that
+      is supported by hardware that is in the turbo range. This number
       is independent of whether turbo has been disabled or not.
 
-      num_pstates: displays the number of pstates that are supported
-      by hardware.  This number is independent of whether turbo has
+      num_pstates: Displays the number of P-States that are supported
+      by hardware. This number is independent of whether turbo has
       been disabled or not.
 
+For example, if a system has these parameters:
+	Max 1 core turbo ratio: 0x21 (Max 1 core ratio is the maximum P-State)
+	Max non turbo ratio: 0x17
+	Minimum ratio : 0x08 (Here the ratio is called max efficiency ratio)
+
+Sysfs will show :
+	max_perf_pct:100, which corresponds to 1 core ratio
+	min_perf_pct:24, max_efficiency_ratio / max 1 Core ratio
+	no_turbo:0, turbo is not disabled
+	num_pstates:26 = (max 1 Core ratio - Max Efficiency Ratio + 1)
+	turbo_pct:39 = (max 1 core ratio - max non turbo ratio) / num_pstates
+
+Refer to "Intel® 64 and IA-32 Architectures Software Developer’s Manual
+Volume 3: System Programming Guide" to understand ratios.
+
+cpufreq sysfs for Intel P-State
+
+Since this driver registers with cpufreq, cpufreq sysfs is also presented.
+There are some important differences, which need to be considered.
+
+scaling_cur_freq: This displays the real frequency which was used during
+the last sample period instead of what is requested. Some other cpufreq driver,
+like acpi-cpufreq, displays what is requested (Some changes are on the
+way to fix this for acpi-cpufreq driver). The same is true for frequencies
+displayed at /proc/cpuinfo.
+
+scaling_governor: This displays current active policy. Since each CPU has a
+cpufreq sysfs, it is possible to set a scaling governor to each CPU. But this
+is not possible with Intel P-States, as there is one common policy for all
+CPUs. Here, the last requested policy will be applicable to all CPUs. It is
+suggested that one use the cpupower utility to change policy to all CPUs at the
+same time.
+
+scaling_setspeed: This attribute can never be used with Intel P-State.
+
+scaling_max_freq/scaling_min_freq: This interface can be used similarly to
+the max_perf_pct/min_perf_pct of Intel P-State sysfs. However since frequencies
+are converted to nearest possible P-State, this is prone to rounding errors.
+This method is not preferred to limit performance.
+
+affected_cpus: Not used
+related_cpus: Not used
+
 For contemporary Intel processors, the frequency is controlled by the
-processor itself and the P-states exposed to software are related to
+processor itself and the P-State exposed to software is related to
 performance levels.  The idea that frequency can be set to a single
-frequency is fiction for Intel Core processors. Even if the scaling
-driver selects a single P state the actual frequency the processor
+frequency is fictional for Intel Core processors. Even if the scaling
+driver selects a single P-State, the actual frequency the processor
 will run at is selected by the processor itself.
 
-For legacy mode debugfs files have also been added to allow tuning of
-the internal governor algorythm. These files are located at
-/sys/kernel/debug/pstate_snb/ These files are NOT present in HWP mode.
+Tuning Intel P-State driver
+
+When HWP mode is not used, debugfs files have also been added to allow the
+tuning of the internal governor algorithm. These files are located at
+/sys/kernel/debug/pstate_snb/. The algorithm uses a PID (Proportional
+Integral Derivative) controller. The PID tunable parameters are:
 
       deadband
       d_gain_pct
@@ -63,3 +133,90 @@ the internal governor algorythm. These files are located at
       p_gain_pct
       sample_rate_ms
       setpoint
+
+To adjust these parameters, some understanding of driver implementation is
+necessary. There are some tweeks described here, but be very careful. Adjusting
+them requires expert level understanding of power and performance relationship.
+These limits are only useful when the "powersave" policy is active.
+
+-To make the system more responsive to load changes, sample_rate_ms can
+be adjusted  (current default is 10ms).
+-To make the system use higher performance, even if the load is lower, setpoint
+can be adjusted to a lower number. This will also lead to faster ramp up time
+to reach the maximum P-State.
+If there are no derivative and integral coefficients, The next P-State will be
+equal to:
+	current P-State - ((setpoint - current cpu load) * p_gain_pct)
+
+For example, if the current PID parameters are (Which are defaults for the core
+processors like SandyBridge):
+      deadband = 0
+      d_gain_pct = 0
+      i_gain_pct = 0
+      p_gain_pct = 20
+      sample_rate_ms = 10
+      setpoint = 97
+
+If the current P-State = 0x08 and current load = 100, this will result in the
+next P-State = 0x08 - ((97 - 100) * 0.2) = 8.6 (rounded to 9). Here the P-State
+goes up by only 1. If during next sample interval the current load doesn't
+change and still 100, then P-State goes up by one again. This process will
+continue as long as the load is more than the setpoint until the maximum P-State
+is reached.
+
+For the same load at setpoint = 60, this will result in the next P-State
+= 0x08 - ((60 - 100) * 0.2) = 16
+So by changing the setpoint from 97 to 60, there is an increase of the
+next P-State from 9 to 16. So this will make processor execute at higher
+P-State for the same CPU load. If the load continues to be more than the
+setpoint during next sample intervals, then P-State will go up again till the
+maximum P-State is reached. But the ramp up time to reach the maximum P-State
+will be much faster when the setpoint is 60 compared to 97.
+
+Debugging Intel P-State driver
+
+Event tracing
+To debug P-State transition, the Linux event tracing interface can be used.
+There are two specific events, which can be enabled (Provided the kernel
+configs related to event tracing are enabled).
+
+# cd /sys/kernel/debug/tracing/
+# echo 1 > events/power/pstate_sample/enable
+# echo 1 > events/power/cpu_frequency/enable
+# cat trace
+gnome-terminal--4510  [001] ..s.  1177.680733: pstate_sample: core_busy=107
+	scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618
+		freq=2474476
+cat-5235  [002] ..s.  1177.681723: cpu_frequency: state=2900000 cpu_id=2
+
+
+Using ftrace
+
+If function level tracing is required, the Linux ftrace interface can be used.
+For example if we want to check how often a function to set a P-State is
+called, we can set ftrace filter to intel_pstate_set_pstate.
+
+# cd /sys/kernel/debug/tracing/
+# cat available_filter_functions | grep -i pstate
+intel_pstate_set_pstate
+intel_pstate_cpu_init
+...
+
+# echo intel_pstate_set_pstate > set_ftrace_filter
+# echo function > current_tracer
+# cat trace | head -15
+# tracer: function
+#
+# entries-in-buffer/entries-written: 80/80   #P:4
+#
+#                              _-----=> irqs-off
+#                             / _----=> need-resched
+#                            | / _---=> hardirq/softirq
+#                            || / _--=> preempt-depth
+#                            ||| /     delay
+#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
+#              | |       |   ||||       |         |
+            Xorg-3129  [000] ..s.  2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
+ gnome-terminal--4510  [002] ..s.  2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
+     gnome-shell-3409  [001] ..s.  2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
+          <idle>-0     [000] ..s.  2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func