2005-04-17 02:20:36 +04:00
/*
* drivers / cpufreq / cpufreq_ondemand . c
*
* Copyright ( C ) 2001 Russell King
* ( C ) 2003 Venkatesh Pallipadi < venkatesh . pallipadi @ intel . com > .
* Jun Nakajima < jun . nakajima @ intel . com >
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
*/
2012-10-26 02:47:42 +04:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2013-08-06 21:23:03 +04:00
# include <linux/cpu.h>
2012-10-26 02:47:42 +04:00
# include <linux/percpu-defs.h>
2013-03-27 19:58:58 +04:00
# include <linux/slab.h>
2008-08-04 22:59:12 +04:00
# include <linux/tick.h>
2012-10-26 02:47:42 +04:00
# include "cpufreq_governor.h"
2005-04-17 02:20:36 +04:00
2013-02-08 21:24:18 +04:00
/* On-demand governor macros */
2005-04-17 02:20:36 +04:00
# define DEF_FREQUENCY_UP_THRESHOLD (80)
2010-10-07 00:54:24 +04:00
# define DEF_SAMPLING_DOWN_FACTOR (1)
# define MAX_SAMPLING_DOWN_FACTOR (100000)
2008-08-04 22:59:12 +04:00
# define MICRO_FREQUENCY_UP_THRESHOLD (95)
2009-04-22 15:48:29 +04:00
# define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000)
2005-06-01 06:03:50 +04:00
# define MIN_FREQUENCY_UP_THRESHOLD (11)
2005-04-17 02:20:36 +04:00
# define MAX_FREQUENCY_UP_THRESHOLD (100)
2012-10-26 02:47:42 +04:00
static DEFINE_PER_CPU ( struct od_cpu_dbs_info_s , od_cpu_dbs_info ) ;
2005-04-17 02:20:36 +04:00
2013-04-02 18:56:56 +04:00
static struct od_ops od_ops ;
2012-11-26 22:10:12 +04:00
# ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static struct cpufreq_governor cpufreq_gov_ondemand ;
# endif
2013-06-26 00:42:37 +04:00
static unsigned int default_powersave_bias ;
2012-10-26 02:47:42 +04:00
/*
* ondemand_powersave_bias_init_cpu - reset the powersave-bias state of one CPU .
* @cpu : CPU whose per-CPU od_cpu_dbs_info entry is ( re ) initialised .
*
* Caches the table returned by cpufreq_frequency_get_table ( cpu ) in
* dbs_info - > freq_table and clears dbs_info - > freq_lo .
*/
static void ondemand_powersave_bias_init_cpu ( int cpu )
ondemand: Solve a big performance issue by counting IOWAIT time as busy
The ondemand cpufreq governor uses CPU busy time (e.g. not-idle
time) as a measure for scaling the CPU frequency up or down.
If the CPU is busy, the CPU frequency scales up, if it's idle,
the CPU frequency scales down. Effectively, it uses the CPU busy
time as proxy variable for the more nebulous "how critical is
performance right now" question.
This algorithm falls flat on its face in the light of workloads
where you're alternatingly disk and CPU bound, such as the ever
popular "git grep", but also things like startup of programs and
maildir using email clients... much to the chagarin of Andrew
Morton.
This patch changes the ondemand algorithm to count iowait time
as busy, not idle, time. As shown in the breakdown cases above,
iowait is performance critical often, and by counting iowait,
the proxy variable becomes a more accurate representation of the
"how critical is performance" question.
The problem and fix are both verified with the "perf timechar"
tool.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Jones <davej@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100509082606.3d9f00d0@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-09 19:26:06 +04:00
{
2012-10-26 02:47:42 +04:00
/* Per-CPU governor state for the CPU being initialised . */
struct od_cpu_dbs_info_s * dbs_info = & per_cpu ( od_cpu_dbs_info , cpu ) ;
ondemand: Solve a big performance issue by counting IOWAIT time as busy
The ondemand cpufreq governor uses CPU busy time (e.g. not-idle
time) as a measure for scaling the CPU frequency up or down.
If the CPU is busy, the CPU frequency scales up, if it's idle,
the CPU frequency scales down. Effectively, it uses the CPU busy
time as proxy variable for the more nebulous "how critical is
performance right now" question.
This algorithm falls flat on its face in the light of workloads
where you're alternatingly disk and CPU bound, such as the ever
popular "git grep", but also things like startup of programs and
maildir using email clients... much to the chagarin of Andrew
Morton.
This patch changes the ondemand algorithm to count iowait time
as busy, not idle, time. As shown in the breakdown cases above,
iowait is performance critical often, and by counting iowait,
the proxy variable becomes a more accurate representation of the
"how critical is performance" question.
The problem and fix are both verified with the "perf timechar"
tool.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Jones <davej@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100509082606.3d9f00d0@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-09 19:26:06 +04:00
2012-10-26 02:47:42 +04:00
/*
* The table pointer may be NULL ; generic_powersave_bias_target ( ) checks
* for that before dereferencing it .
*/
dbs_info - > freq_table = cpufreq_frequency_get_table ( cpu ) ;
/* freq_lo = = 0 is the value the bias code stores when no lo / hi split is set up . */
dbs_info - > freq_lo = 0 ;
}
ondemand: Solve a big performance issue by counting IOWAIT time as busy
The ondemand cpufreq governor uses CPU busy time (i.e. non-idle
time) as a measure for scaling the CPU frequency up or down.
If the CPU is busy, the CPU frequency scales up, if it's idle,
the CPU frequency scales down. Effectively, it uses the CPU busy
time as proxy variable for the more nebulous "how critical is
performance right now" question.
This algorithm falls flat on its face in the light of workloads
where you're alternately disk and CPU bound, such as the ever
popular "git grep", but also things like startup of programs and
maildir using email clients... much to the chagrin of Andrew
Morton.
This patch changes the ondemand algorithm to count iowait time
as busy, not idle, time. As shown in the breakdown cases above,
iowait is performance critical often, and by counting iowait,
the proxy variable becomes a more accurate representation of the
"how critical is performance" question.
The problem and fix are both verified with the "perf timechar"
tool.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Jones <davej@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100509082606.3d9f00d0@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-09 19:26:06 +04:00
2012-10-26 02:47:42 +04:00
/*
* Not all CPUs want IO time to be accounted as busy ; this depends on how
* efficient idling at a higher frequency / voltage is .
* Pavel Machek says this is not so for various generations of AMD and old
* Intel systems .
2013-02-08 21:24:18 +04:00
* Mike Chan ( android . com ) claims this is also not true for ARM .
2012-10-26 02:47:42 +04:00
* Because of this , whitelist specific known ( series ) of CPUs by default , and
* leave all others up to the user .
*
* Takes no arguments ; the decision is based only on boot_cpu_data .
*
* Returns 1 when built with CONFIG_X86 and the boot CPU is an Intel
* family 6 part with model > = 15 ; returns 0 in every other case ,
* including all builds without CONFIG_X86 .
*/
static int should_io_be_busy ( void )
{
# if defined(CONFIG_X86)
/*
2013-02-08 21:24:18 +04:00
* For Intel , Core 2 ( model 15 ) and later have an efficient idle .
2012-10-26 02:47:42 +04:00
*/
if ( boot_cpu_data . x86_vendor = = X86_VENDOR_INTEL & &
boot_cpu_data . x86 = = 6 & &
boot_cpu_data . x86_model > = 15 )
return 1 ;
# endif
/* Not on the whitelist : report 0 and leave the choice to the user . */
return 0 ;
ondemand: Solve a big performance issue by counting IOWAIT time as busy
The ondemand cpufreq governor uses CPU busy time (e.g. not-idle
time) as a measure for scaling the CPU frequency up or down.
If the CPU is busy, the CPU frequency scales up, if it's idle,
the CPU frequency scales down. Effectively, it uses the CPU busy
time as proxy variable for the more nebulous "how critical is
performance right now" question.
This algorithm falls flat on its face in the light of workloads
where you're alternatingly disk and CPU bound, such as the ever
popular "git grep", but also things like startup of programs and
maildir using email clients... much to the chagarin of Andrew
Morton.
This patch changes the ondemand algorithm to count iowait time
as busy, not idle, time. As shown in the breakdown cases above,
iowait is performance critical often, and by counting iowait,
the proxy variable becomes a more accurate representation of the
"how critical is performance" question.
The problem and fix are both verified with the "perf timechar"
tool.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Jones <davej@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100509082606.3d9f00d0@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-09 19:26:06 +04:00
}
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, if ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8GHz is an exact match with a hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8GHz and next lower than 1.8GHz.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
/*
* Find right freq to be set now with powersave_bias on .
* Returns the freq_hi to be used right now and will set freq_hi_jiffies ,
* freq_lo , and freq_lo_jiffies in percpu area for averaging freqs .
*
* @policy : cpufreq policy ; policy - > cpu selects the per-CPU state and
* policy - > governor_data supplies the tuners .
* @freq_next : frequency requested before the bias is applied .
* @relation : relation used for the first table lookup of freq_next .
*
* Return value , by path :
* - no cached frequency table : freq_next unchanged , lo / hi split cleared ;
* - both bounds resolve to the same table entry : that single frequency ,
* lo / hi split cleared ;
* - otherwise : freq_hi , with freq_lo , freq_lo_jiffies and freq_hi_jiffies
* stored in the per-CPU state .
*/
2013-04-02 18:56:56 +04:00
static unsigned int generic_powersave_bias_target ( struct cpufreq_policy * policy ,
2012-10-26 02:47:42 +04:00
unsigned int freq_next , unsigned int relation )
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
{
unsigned int freq_req , freq_reduc , freq_avg ;
unsigned int freq_hi , freq_lo ;
unsigned int index = 0 ;
unsigned int jiffies_total , jiffies_hi , jiffies_lo ;
2012-10-26 02:47:42 +04:00
struct od_cpu_dbs_info_s * dbs_info = & per_cpu ( od_cpu_dbs_info ,
2009-06-24 10:13:48 +04:00
policy - > cpu ) ;
2013-03-27 19:58:58 +04:00
struct dbs_data * dbs_data = policy - > governor_data ;
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
/* No cached table : nothing to average between , so pass the request through . */
if ( ! dbs_info - > freq_table ) {
dbs_info - > freq_lo = 0 ;
dbs_info - > freq_lo_jiffies = 0 ;
return freq_next ;
}
/* Resolve the request to a table entry using the relation given by the caller . */
cpufreq_frequency_table_target ( policy , dbs_info - > freq_table , freq_next ,
relation , & index ) ;
freq_req = dbs_info - > freq_table [ index ] . frequency ;
2013-03-27 19:58:58 +04:00
/* The bias is scaled by 1 / 1000 , i . e . expressed in tenths of a percent . */
freq_reduc = freq_req * od_tuners - > powersave_bias / 1000 ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
/* freq_avg is the biased target the lo / hi pair should average out to . */
freq_avg = freq_req - freq_reduc ;
/* Find freq bounds for freq_avg in freq_table */
index = 0 ;
/* Lower bound : table entry chosen with CPUFREQ_RELATION_H . */
cpufreq_frequency_table_target ( policy , dbs_info - > freq_table , freq_avg ,
CPUFREQ_RELATION_H , & index ) ;
freq_lo = dbs_info - > freq_table [ index ] . frequency ;
index = 0 ;
/* Upper bound : table entry chosen with CPUFREQ_RELATION_L . */
cpufreq_frequency_table_target ( policy , dbs_info - > freq_table , freq_avg ,
CPUFREQ_RELATION_L , & index ) ;
freq_hi = dbs_info - > freq_table [ index ] . frequency ;
/* Find out how long we have to be in hi and lo freqs */
if ( freq_hi = = freq_lo ) {
/* Both lookups picked the same entry : no averaging is needed . */
dbs_info - > freq_lo = 0 ;
dbs_info - > freq_lo_jiffies = 0 ;
return freq_lo ;
}
2013-03-27 19:58:58 +04:00
/* One full sampling period , converted from microseconds to jiffies . */
jiffies_total = usecs_to_jiffies ( od_tuners - > sampling_rate ) ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
/*
* Linear interpolation : jiffies_hi / jiffies_total equals
* ( freq_avg - freq_lo ) / ( freq_hi - freq_lo ) . Adding half of the divisor
* before dividing rounds the result to the nearest jiffy .
*/
jiffies_hi = ( freq_avg - freq_lo ) * jiffies_total ;
jiffies_hi + = ( ( freq_hi - freq_lo ) / 2 ) ;
jiffies_hi / = ( freq_hi - freq_lo ) ;
/* The remainder of the sampling period is spent at freq_lo . */
jiffies_lo = jiffies_total - jiffies_hi ;
dbs_info - > freq_lo = freq_lo ;
dbs_info - > freq_lo_jiffies = jiffies_lo ;
dbs_info - > freq_hi_jiffies = jiffies_hi ;
return freq_hi ;
}
/*
* ondemand_powersave_bias_init - reset the powersave-bias state of all online CPUs .
*
* Calls ondemand_powersave_bias_init_cpu ( ) once for every CPU visited by
* for_each_online_cpu ( ) . Takes no arguments and returns nothing .
*/
static void ondemand_powersave_bias_init ( void )
{
int i ;
for_each_online_cpu ( i ) {
2009-07-03 04:08:32 +04:00
ondemand_powersave_bias_init_cpu ( i ) ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
}
}
2013-08-06 21:23:05 +04:00
/*
* dbs_freq_increase - request a frequency for a policy , honouring powersave_bias .
* @policy : cpufreq policy to retarget ; its governor_data supplies the tuners .
* @freq : frequency to request before any bias is applied .
*
* With a non-zero powersave_bias the request is first passed through
* od_ops . powersave_bias_target ( ) . With a zero bias the function returns
* without doing anything when policy - > cur already equals policy - > max .
*/
static void dbs_freq_increase ( struct cpufreq_policy * policy , unsigned int freq )
2012-10-26 02:47:42 +04:00
{
2013-08-06 21:23:05 +04:00
struct dbs_data * dbs_data = policy - > governor_data ;
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
/* Bias enabled : let the bias hook pick the frequency to use right now . */
if ( od_tuners - > powersave_bias )
2013-08-06 21:23:05 +04:00
freq = od_ops . powersave_bias_target ( policy , freq ,
2013-04-02 18:56:56 +04:00
CPUFREQ_RELATION_H ) ;
2013-08-06 21:23:05 +04:00
/* No bias and already at the policy maximum : nothing to change . */
else if ( policy - > cur = = policy - > max )
2012-10-26 02:47:42 +04:00
return ;
2009-07-24 17:25:06 +04:00
2013-08-06 21:23:05 +04:00
/* The relation passed on is L when a bias is set and H otherwise . */
__cpufreq_driver_target ( policy , freq , od_tuners - > powersave_bias ?
2012-10-26 02:47:42 +04:00
CPUFREQ_RELATION_L : CPUFREQ_RELATION_H ) ;
}
/*
* Every sampling_rate , we check , if current idle time is less than 20 %
2013-06-05 20:01:25 +04:00
* ( default ) , then we try to increase frequency . Else , we adjust the frequency
* proportional to load .
2012-10-26 02:47:42 +04:00
*/
2013-06-05 20:01:25 +04:00
static void od_check_cpu ( int cpu , unsigned int load )
2005-04-17 02:20:36 +04:00
{
2012-10-26 02:47:42 +04:00
struct od_cpu_dbs_info_s * dbs_info = & per_cpu ( od_cpu_dbs_info , cpu ) ;
2015-07-18 09:00:59 +03:00
struct cpufreq_policy * policy = dbs_info - > cdbs . shared - > policy ;
2013-03-27 19:58:58 +04:00
struct dbs_data * dbs_data = policy - > governor_data ;
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2012-10-26 02:47:42 +04:00
dbs_info - > freq_lo = 0 ;
/* Check for frequency increase */
2013-06-05 20:01:25 +04:00
if ( load > od_tuners - > up_threshold ) {
2012-10-26 02:47:42 +04:00
/* If switching to max speed, apply sampling_down_factor */
if ( policy - > cur < policy - > max )
dbs_info - > rate_mult =
2013-03-27 19:58:58 +04:00
od_tuners - > sampling_down_factor ;
2012-10-26 02:47:42 +04:00
dbs_freq_increase ( policy , policy - > max ) ;
2013-06-05 20:01:25 +04:00
} else {
/* Calculate the next frequency proportional to load */
cpufreq: ondemand: Eliminate the deadband effect
Currently, ondemand calculates the target frequency proportional to load
using the formula:
Target frequency = C * load
where C = policy->cpuinfo.max_freq / 100
Though, in many cases, the minimum available frequency is pretty high and
the above calculation introduces a dead band from load 0 to
100 * policy->cpuinfo.min_freq / policy->cpuinfo.max_freq where the target
frequency is always calculated to less than policy->cpuinfo.min_freq and
the minimum frequency is selected.
For example: on Intel i7-3770 @ 3.4GHz the policy->cpuinfo.min_freq = 1600000
and the policy->cpuinfo.max_freq = 3400000 (without turbo). Thus, the CPU
starts to scale up at a load above 47.
On quad core 1500MHz Krait the policy->cpuinfo.min_freq = 384000
and the policy->cpuinfo.max_freq = 1512000. Thus, the CPU starts to scale
at load above 25.
Change the calculation of target frequency to eliminate the above effect using
the formula:
Target frequency = A + B * load
where A = policy->cpuinfo.min_freq and
B = (policy->cpuinfo.max_freq - policy->cpuinfo.min_freq) / 100
This will map load values 0 to 100 linearly to cpuinfo.min_freq to
cpuinfo.max_freq.
Also, use the CPUFREQ_RELATION_C in __cpufreq_driver_target to select the
closest frequency in frequency_table. This is necessary to avoid selection
of minimum frequency only when load equals to 0. It will also help for selection
of frequencies using a more 'fair' criterion.
Tables below show the difference in selected frequency for specific values
of load without and with this patch. On Intel i7-3770 @ 3.40GHz:
Without With
Load Target Selected Target Selected
0 0 1600000 1600000 1600000
5 170050 1600000 1690050 1700000
10 340100 1600000 1780100 1700000
15 510150 1600000 1870150 1900000
20 680200 1600000 1960200 2000000
25 850250 1600000 2050250 2100000
30 1020300 1600000 2140300 2100000
35 1190350 1600000 2230350 2200000
40 1360400 1600000 2320400 2400000
45 1530450 1600000 2410450 2400000
50 1700500 1900000 2500500 2500000
55 1870550 1900000 2590550 2600000
60 2040600 2100000 2680600 2600000
65 2210650 2400000 2770650 2800000
70 2380700 2400000 2860700 2800000
75 2550750 2600000 2950750 3000000
80 2720800 2800000 3040800 3000000
85 2890850 2900000 3130850 3100000
90 3060900 3100000 3220900 3300000
95 3230950 3300000 3310950 3300000
100 3401000 3401000 3401000 3401000
On ARM quad core 1500MHz Krait:
Without With
Load Target Selected Target Selected
0 0 384000 384000 384000
5 75600 384000 440400 486000
10 151200 384000 496800 486000
15 226800 384000 553200 594000
20 302400 384000 609600 594000
25 378000 384000 666000 702000
30 453600 486000 722400 702000
35 529200 594000 778800 810000
40 604800 702000 835200 810000
45 680400 702000 891600 918000
50 756000 810000 948000 918000
55 831600 918000 1004400 1026000
60 907200 918000 1060800 1026000
65 982800 1026000 1117200 1134000
70 1058400 1134000 1173600 1134000
75 1134000 1134000 1230000 1242000
80 1209600 1242000 1286400 1242000
85 1285200 1350000 1342800 1350000
90 1360800 1458000 1399200 1350000
95 1436400 1458000 1455600 1458000
100 1512000 1512000 1512000 1512000
Tested on Intel i7-3770 CPU @ 3.40GHz and on ARM quad core 1500MHz Krait
(Android smartphone).
Benchmarks on Intel i7 shows a performance improvement on low and medium
work loads with lower power consumption. Specifics:
Phoronix Linux Kernel Compilation 3.1:
Time: -0.40%, energy: -0.07%
Phoronix Apache:
Time: -4.98%, energy: -2.35%
Phoronix FFMPEG:
Time: -6.29%, energy: -4.02%
Also, running mp3 decoding (very low load) shows no differences with and
without this patch.
Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-30 20:59:34 +04:00
unsigned int freq_next , min_f , max_f ;
min_f = policy - > cpuinfo . min_freq ;
max_f = policy - > cpuinfo . max_freq ;
freq_next = min_f + load * ( max_f - min_f ) / 100 ;
2012-10-26 02:47:42 +04:00
/* No longer fully busy, reset rate_mult */
dbs_info - > rate_mult = 1 ;
2013-03-27 19:58:58 +04:00
if ( ! od_tuners - > powersave_bias ) {
2012-10-26 02:47:42 +04:00
__cpufreq_driver_target ( policy , freq_next ,
cpufreq: ondemand: Eliminate the deadband effect
Currently, ondemand calculates the target frequency proportional to load
using the formula:
Target frequency = C * load
where C = policy->cpuinfo.max_freq / 100
Though, in many cases, the minimum available frequency is pretty high and
the above calculation introduces a dead band from load 0 to
100 * policy->cpuinfo.min_freq / policy->cpuinfo.max_freq where the target
frequency is always calculated to less than policy->cpuinfo.min_freq and
the minimum frequency is selected.
For example: on Intel i7-3770 @ 3.4GHz the policy->cpuinfo.min_freq = 1600000
and the policy->cpuinfo.max_freq = 3400000 (without turbo). Thus, the CPU
starts to scale up at a load above 47.
On quad core 1500MHz Krait the policy->cpuinfo.min_freq = 384000
and the policy->cpuinfo.max_freq = 1512000. Thus, the CPU starts to scale
at load above 25.
Change the calculation of target frequency to eliminate the above effect using
the formula:
Target frequency = A + B * load
where A = policy->cpuinfo.min_freq and
B = (policy->cpuinfo.max_freq - policy->cpuinfo->min_freq) / 100
This will map load values 0 to 100 linearly to cpuinfo.min_freq to
cpuinfo.max_freq.
Also, use the CPUFREQ_RELATION_C in __cpufreq_driver_target to select the
closest frequency in frequency_table. This is necessary to avoid selection
of minimum frequency only when load equals to 0. It will also help for selection
of frequencies using a more 'fair' criterion.
Tables below show the difference in selected frequency for specific values
of load without and with this patch. On Intel i7-3770 @ 3.40GHz:
Without With
Load Target Selected Target Selected
0 0 1600000 1600000 1600000
5 170050 1600000 1690050 1700000
10 340100 1600000 1780100 1700000
15 510150 1600000 1870150 1900000
20 680200 1600000 1960200 2000000
25 850250 1600000 2050250 2100000
30 1020300 1600000 2140300 2100000
35 1190350 1600000 2230350 2200000
40 1360400 1600000 2320400 2400000
45 1530450 1600000 2410450 2400000
50 1700500 1900000 2500500 2500000
55 1870550 1900000 2590550 2600000
60 2040600 2100000 2680600 2600000
65 2210650 2400000 2770650 2800000
70 2380700 2400000 2860700 2800000
75 2550750 2600000 2950750 3000000
80 2720800 2800000 3040800 3000000
85 2890850 2900000 3130850 3100000
90 3060900 3100000 3220900 3300000
95 3230950 3300000 3310950 3300000
100 3401000 3401000 3401000 3401000
On ARM quad core 1500MHz Krait:
Without With
Load Target Selected Target Selected
0 0 384000 384000 384000
5 75600 384000 440400 486000
10 151200 384000 496800 486000
15 226800 384000 553200 594000
20 302400 384000 609600 594000
25 378000 384000 666000 702000
30 453600 486000 722400 702000
35 529200 594000 778800 810000
40 604800 702000 835200 810000
45 680400 702000 891600 918000
50 756000 810000 948000 918000
55 831600 918000 1004400 1026000
60 907200 918000 1060800 1026000
65 982800 1026000 1117200 1134000
70 1058400 1134000 1173600 1134000
75 1134000 1134000 1230000 1242000
80 1209600 1242000 1286400 1242000
85 1285200 1350000 1342800 1350000
90 1360800 1458000 1399200 1350000
95 1436400 1458000 1455600 1458000
100 1512000 1512000 1512000 1512000
Tested on Intel i7-3770 CPU @ 3.40GHz and on ARM quad core 1500MHz Krait
(Android smartphone).
Benchmarks on Intel i7 shows a performance improvement on low and medium
work loads with lower power consumption. Specifics:
Phoronix Linux Kernel Compilation 3.1:
Time: -0.40%, energy: -0.07%
Phoronix Apache:
Time: -4.98%, energy: -2.35%
Phoronix FFMPEG:
Time: -6.29%, energy: -4.02%
Also, running mp3 decoding (very low load) shows no differences with and
without this patch.
Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-30 20:59:34 +04:00
CPUFREQ_RELATION_C ) ;
2013-04-02 18:56:56 +04:00
return ;
2012-10-26 02:47:42 +04:00
}
2013-04-02 18:56:56 +04:00
freq_next = od_ops . powersave_bias_target ( policy , freq_next ,
CPUFREQ_RELATION_L ) ;
cpufreq: ondemand: Eliminate the deadband effect
Currently, ondemand calculates the target frequency proportional to load
using the formula:
Target frequency = C * load
where C = policy->cpuinfo.max_freq / 100
Though, in many cases, the minimum available frequency is pretty high and
the above calculation introduces a dead band from load 0 to
100 * policy->cpuinfo.min_freq / policy->cpuinfo.max_freq where the target
frequency is always calculated to less than policy->cpuinfo.min_freq and
the minimum frequency is selected.
For example: on Intel i7-3770 @ 3.4GHz the policy->cpuinfo.min_freq = 1600000
and the policy->cpuinfo.max_freq = 3400000 (without turbo). Thus, the CPU
starts to scale up at a load above 47.
On quad core 1500MHz Krait the policy->cpuinfo.min_freq = 384000
and the policy->cpuinfo.max_freq = 1512000. Thus, the CPU starts to scale
at load above 25.
Change the calculation of target frequency to eliminate the above effect using
the formula:
Target frequency = A + B * load
where A = policy->cpuinfo.min_freq and
B = (policy->cpuinfo.max_freq - policy->cpuinfo.min_freq) / 100
This will map load values 0 to 100 linearly to cpuinfo.min_freq to
cpuinfo.max_freq.
Also, use the CPUFREQ_RELATION_C in __cpufreq_driver_target to select the
closest frequency in frequency_table. This is necessary to avoid selection
of minimum frequency only when load equals 0. It will also help with selection
of frequencies using a more 'fair' criterion.
Tables below show the difference in selected frequency for specific values
of load without and with this patch. On Intel i7-3770 @ 3.40GHz:
Without With
Load Target Selected Target Selected
0 0 1600000 1600000 1600000
5 170050 1600000 1690050 1700000
10 340100 1600000 1780100 1700000
15 510150 1600000 1870150 1900000
20 680200 1600000 1960200 2000000
25 850250 1600000 2050250 2100000
30 1020300 1600000 2140300 2100000
35 1190350 1600000 2230350 2200000
40 1360400 1600000 2320400 2400000
45 1530450 1600000 2410450 2400000
50 1700500 1900000 2500500 2500000
55 1870550 1900000 2590550 2600000
60 2040600 2100000 2680600 2600000
65 2210650 2400000 2770650 2800000
70 2380700 2400000 2860700 2800000
75 2550750 2600000 2950750 3000000
80 2720800 2800000 3040800 3000000
85 2890850 2900000 3130850 3100000
90 3060900 3100000 3220900 3300000
95 3230950 3300000 3310950 3300000
100 3401000 3401000 3401000 3401000
On ARM quad core 1500MHz Krait:
Without With
Load Target Selected Target Selected
0 0 384000 384000 384000
5 75600 384000 440400 486000
10 151200 384000 496800 486000
15 226800 384000 553200 594000
20 302400 384000 609600 594000
25 378000 384000 666000 702000
30 453600 486000 722400 702000
35 529200 594000 778800 810000
40 604800 702000 835200 810000
45 680400 702000 891600 918000
50 756000 810000 948000 918000
55 831600 918000 1004400 1026000
60 907200 918000 1060800 1026000
65 982800 1026000 1117200 1134000
70 1058400 1134000 1173600 1134000
75 1134000 1134000 1230000 1242000
80 1209600 1242000 1286400 1242000
85 1285200 1350000 1342800 1350000
90 1360800 1458000 1399200 1350000
95 1436400 1458000 1455600 1458000
100 1512000 1512000 1512000 1512000
Tested on Intel i7-3770 CPU @ 3.40GHz and on ARM quad core 1500MHz Krait
(Android smartphone).
Benchmarks on Intel i7 show a performance improvement on low and medium
work loads with lower power consumption. Specifics:
Phoronix Linux Kernel Compilation 3.1:
Time: -0.40%, energy: -0.07%
Phoronix Apache:
Time: -4.98%, energy: -2.35%
Phoronix FFMPEG:
Time: -6.29%, energy: -4.02%
Also, running mp3 decoding (very low load) shows no differences with and
without this patch.
Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-30 20:59:34 +04:00
__cpufreq_driver_target ( policy , freq_next , CPUFREQ_RELATION_C ) ;
2012-10-26 02:47:42 +04:00
}
2005-04-17 02:20:36 +04:00
}
2015-07-18 09:01:00 +03:00
/*
 * od_dbs_timer - per-sample handler of the ondemand governor.
 * @cdbs: common per-CPU governor data; cdbs->shared->policy selects the policy
 * @dbs_data: governor data whose tuners supply sampling_rate
 * @modify_all: when false, no sampling is performed and only the default
 *              delay is computed
 *
 * If the previous pass armed a sub-sample (OD_SUB_SAMPLE), the policy is
 * switched to freq_lo and the next delay is freq_lo_jiffies. Otherwise
 * dbs_check_cpu() is run and, if it left freq_lo non-zero, a sub-sample is
 * armed with a delay of freq_hi_jiffies.
 *
 * Returns the delay until the next sample (presumably in jiffies - confirm
 * against delay_for_sampling_rate()). When no sub-sample delay was chosen the
 * delay is derived from sampling_rate * rate_mult.
 */
static unsigned int od_dbs_timer ( struct cpu_dbs_info * cdbs ,
struct dbs_data * dbs_data , bool modify_all )
2012-10-26 02:47:42 +04:00
{
2015-07-18 09:01:00 +03:00
struct cpufreq_policy * policy = cdbs - > shared - > policy ;
2015-07-18 09:00:59 +03:00
unsigned int cpu = policy - > cpu ;
2015-07-18 09:01:00 +03:00
struct od_cpu_dbs_info_s * dbs_info = & per_cpu ( od_cpu_dbs_info ,
2013-01-31 21:28:02 +04:00
cpu ) ;
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2015-07-18 09:01:00 +03:00
int delay = 0 , sample_type = dbs_info - > sample_type ;
2013-01-31 21:28:02 +04:00
2015-07-18 09:01:00 +03:00
if ( ! modify_all )
2013-02-27 09:36:36 +04:00
goto max_delay ;
2005-04-17 02:20:36 +04:00
2012-10-26 02:47:42 +04:00
/* Common NORMAL_SAMPLE setup */
2015-07-18 09:01:00 +03:00
dbs_info - > sample_type = OD_NORMAL_SAMPLE ;
2012-10-26 02:47:42 +04:00
if ( sample_type = = OD_SUB_SAMPLE ) {
2015-07-18 09:01:00 +03:00
delay = dbs_info - > freq_lo_jiffies ;
__cpufreq_driver_target ( policy , dbs_info - > freq_lo ,
2015-06-19 14:48:05 +03:00
CPUFREQ_RELATION_H ) ;
2012-10-26 02:47:42 +04:00
} else {
2013-02-27 09:36:36 +04:00
dbs_check_cpu ( dbs_data , cpu ) ;
2015-07-18 09:01:00 +03:00
if ( dbs_info - > freq_lo ) {
2012-10-26 02:47:42 +04:00
/* Setup timer for SUB_SAMPLE */
2015-07-18 09:01:00 +03:00
dbs_info - > sample_type = OD_SUB_SAMPLE ;
delay = dbs_info - > freq_hi_jiffies ;
2012-10-26 02:47:42 +04:00
}
}
2013-02-27 09:36:36 +04:00
max_delay :
/* delay is still 0 here unless one of the sub-sample paths set it */
if ( ! delay )
delay = delay_for_sampling_rate ( od_tuners - > sampling_rate
2015-07-18 09:01:00 +03:00
* dbs_info - > rate_mult ) ;
2013-02-27 09:36:36 +04:00
2015-07-18 09:01:00 +03:00
return delay ;
2012-12-27 18:55:40 +04:00
}
2012-10-26 02:47:42 +04:00
/************************** sysfs interface ************************/
2013-03-27 19:58:58 +04:00
/* Forward declaration; the initialized definition follows the sysfs code. */
static struct common_dbs_data od_dbs_cdata ;
2005-04-17 02:20:36 +04:00
2012-02-29 12:54:41 +04:00
/**
* update_sampling_rate - update sampling rate effective immediately if needed .
* @ new_rate : new sampling rate
*
2013-02-08 21:24:18 +04:00
* If new rate is smaller than the old , simply updating
2012-10-26 02:47:42 +04:00
* dbs_tuners_int . sampling_rate might not be appropriate . For example , if the
* original sampling_rate was 1 second and the requested new sampling rate is 10
* ms because the user needs immediate reaction from ondemand governor , but not
* sure if higher frequency will be required or not , then , the governor may
* change the sampling rate too late ; up to 1 second later . Thus , if we are
* reducing the sampling rate , we need to make the new value effective
* immediately .
2012-02-29 12:54:41 +04:00
*/
2013-03-27 19:58:58 +04:00
static void update_sampling_rate ( struct dbs_data * dbs_data ,
unsigned int new_rate )
2012-02-29 12:54:41 +04:00
{
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2012-02-29 12:54:41 +04:00
int cpu ;
2013-03-27 19:58:58 +04:00
od_tuners - > sampling_rate = new_rate = max ( new_rate ,
dbs_data - > min_sampling_rate ) ;
2012-02-29 12:54:41 +04:00
for_each_online_cpu ( cpu ) {
struct cpufreq_policy * policy ;
2012-10-26 02:47:42 +04:00
struct od_cpu_dbs_info_s * dbs_info ;
2012-02-29 12:54:41 +04:00
unsigned long next_sampling , appointed_at ;
policy = cpufreq_cpu_get ( cpu ) ;
if ( ! policy )
continue ;
2012-11-26 22:10:12 +04:00
if ( policy - > governor ! = & cpufreq_gov_ondemand ) {
cpufreq_cpu_put ( policy ) ;
continue ;
}
2012-12-27 18:55:42 +04:00
dbs_info = & per_cpu ( od_cpu_dbs_info , cpu ) ;
2012-02-29 12:54:41 +04:00
cpufreq_cpu_put ( policy ) ;
2015-10-13 11:09:01 +03:00
if ( ! delayed_work_pending ( & dbs_info - > cdbs . dwork ) )
2012-02-29 12:54:41 +04:00
continue ;
2012-10-26 02:47:42 +04:00
next_sampling = jiffies + usecs_to_jiffies ( new_rate ) ;
2015-06-19 14:48:01 +03:00
appointed_at = dbs_info - > cdbs . dwork . timer . expires ;
2012-02-29 12:54:41 +04:00
if ( time_before ( next_sampling , appointed_at ) ) {
2015-06-19 14:48:01 +03:00
cancel_delayed_work_sync ( & dbs_info - > cdbs . dwork ) ;
2012-02-29 12:54:41 +04:00
2015-07-18 09:00:59 +03:00
gov_queue_work ( dbs_data , policy ,
2015-06-19 14:48:05 +03:00
usecs_to_jiffies ( new_rate ) , true ) ;
2012-02-29 12:54:41 +04:00
}
}
}
2013-03-27 19:58:58 +04:00
/*
 * store_sampling_rate - store handler for the sampling_rate tunable.
 * @dbs_data: governor data to update
 * @buf: text containing one unsigned integer
 * @count: length of @buf, returned unchanged on success
 *
 * Hands the parsed value to update_sampling_rate(). Returns @count on
 * success or -EINVAL when no unsigned integer could be parsed.
 */
static ssize_t store_sampling_rate ( struct dbs_data * dbs_data , const char * buf ,
size_t count )
2005-04-17 02:20:36 +04:00
{
unsigned int input ;
int ret ;
2006-06-29 00:52:18 +04:00
ret = sscanf ( buf , " %u " , & input ) ;
2009-07-03 04:08:32 +04:00
if ( ret ! = 1 )
return - EINVAL ;
2013-03-27 19:58:58 +04:00
update_sampling_rate ( dbs_data , input ) ;
2005-04-17 02:20:36 +04:00
return count ;
}
2013-03-27 19:58:58 +04:00
/*
 * store_io_is_busy - store handler for the io_is_busy tunable.
 * @dbs_data: governor data whose tuners are updated
 * @buf: text containing one unsigned integer
 * @count: length of @buf, returned unchanged on success
 *
 * Any non-zero input is normalized to 1. After the tunable changes, the
 * saved idle baseline of every online CPU is refreshed using the new
 * io_is_busy value. Returns @count on success or -EINVAL on a parse failure.
 */
static ssize_t store_io_is_busy ( struct dbs_data * dbs_data , const char * buf ,
size_t count )
2010-05-09 19:26:51 +04:00
{
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2010-05-09 19:26:51 +04:00
unsigned int input ;
int ret ;
2013-02-28 20:57:32 +04:00
unsigned int j ;
2010-05-09 19:26:51 +04:00
ret = sscanf ( buf , " %u " , & input ) ;
if ( ret ! = 1 )
return - EINVAL ;
2013-03-27 19:58:58 +04:00
od_tuners - > io_is_busy = ! ! input ;
2013-02-28 20:57:32 +04:00
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu ( j ) {
struct od_cpu_dbs_info_s * dbs_info = & per_cpu ( od_cpu_dbs_info ,
j ) ;
dbs_info - > cdbs . prev_cpu_idle = get_cpu_idle_time ( j ,
& dbs_info - > cdbs . prev_cpu_wall , od_tuners - > io_is_busy ) ;
}
2010-05-09 19:26:51 +04:00
return count ;
}
2013-03-27 19:58:58 +04:00
/*
 * store_up_threshold - store handler for the up_threshold tunable.
 * @dbs_data: governor data whose tuners are updated
 * @buf: text containing one unsigned integer
 * @count: length of @buf, returned unchanged on success
 *
 * Accepts only values between MIN_FREQUENCY_UP_THRESHOLD and
 * MAX_FREQUENCY_UP_THRESHOLD inclusive. Returns @count on success or
 * -EINVAL when parsing fails or the value is out of range.
 */
static ssize_t store_up_threshold ( struct dbs_data * dbs_data , const char * buf ,
size_t count )
2005-04-17 02:20:36 +04:00
{
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2005-04-17 02:20:36 +04:00
unsigned int input ;
int ret ;
2006-06-29 00:52:18 +04:00
ret = sscanf ( buf , " %u " , & input ) ;
2005-04-17 02:20:36 +04:00
2006-02-28 08:43:23 +03:00
if ( ret ! = 1 | | input > MAX_FREQUENCY_UP_THRESHOLD | |
2005-06-01 06:03:50 +04:00
input < MIN_FREQUENCY_UP_THRESHOLD ) {
2005-04-17 02:20:36 +04:00
return - EINVAL ;
}
2013-02-06 16:34:00 +04:00
2013-03-27 19:58:58 +04:00
od_tuners - > up_threshold = input ;
2005-04-17 02:20:36 +04:00
return count ;
}
2013-03-27 19:58:58 +04:00
/*
 * store_sampling_down_factor - store handler for sampling_down_factor.
 * @dbs_data: governor data whose tuners are updated
 * @buf: text containing one unsigned integer
 * @count: length of @buf, returned unchanged on success
 *
 * Accepts values from 1 to MAX_SAMPLING_DOWN_FACTOR inclusive and resets
 * rate_mult to 1 on every online CPU. Returns @count on success or -EINVAL
 * when parsing fails or the value is out of range.
 */
static ssize_t store_sampling_down_factor ( struct dbs_data * dbs_data ,
const char * buf , size_t count )
2010-10-07 00:54:24 +04:00
{
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2010-10-07 00:54:24 +04:00
unsigned int input , j ;
int ret ;
ret = sscanf ( buf , " %u " , & input ) ;
if ( ret ! = 1 | | input > MAX_SAMPLING_DOWN_FACTOR | | input < 1 )
return - EINVAL ;
2013-03-27 19:58:58 +04:00
od_tuners - > sampling_down_factor = input ;
2010-10-07 00:54:24 +04:00
/* Reset down sampling multiplier in case it was active */
for_each_online_cpu ( j ) {
2012-10-26 02:47:42 +04:00
struct od_cpu_dbs_info_s * dbs_info = & per_cpu ( od_cpu_dbs_info ,
j ) ;
2010-10-07 00:54:24 +04:00
dbs_info - > rate_mult = 1 ;
}
return count ;
}
2013-08-05 10:58:02 +04:00
/*
 * store_ignore_nice_load - store handler for the ignore_nice_load tunable.
 * @dbs_data: governor data whose tuners are updated
 * @buf: text containing one unsigned integer
 * @count: length of @buf, returned unchanged on success
 *
 * Inputs greater than 1 are clamped to 1. If the value is unchanged nothing
 * else happens. Otherwise the saved idle baseline of every online CPU is
 * refreshed and, when nice load is now ignored, the current CPUTIME_NICE
 * counter is saved as well. Returns @count on success or -EINVAL on a parse
 * failure.
 */
static ssize_t store_ignore_nice_load ( struct dbs_data * dbs_data ,
const char * buf , size_t count )
2005-06-01 06:03:47 +04:00
{
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2005-06-01 06:03:47 +04:00
unsigned int input ;
int ret ;
unsigned int j ;
2006-02-28 08:43:23 +03:00
2006-06-29 00:52:18 +04:00
ret = sscanf ( buf , " %u " , & input ) ;
2009-01-18 09:43:44 +03:00
if ( ret ! = 1 )
2005-06-01 06:03:47 +04:00
return - EINVAL ;
2009-01-18 09:43:44 +03:00
if ( input > 1 )
2005-06-01 06:03:47 +04:00
input = 1 ;
2006-02-28 08:43:23 +03:00
2013-08-05 10:58:02 +04:00
if ( input = = od_tuners - > ignore_nice_load ) { /* nothing to do */
2005-06-01 06:03:47 +04:00
return count ;
}
2013-08-05 10:58:02 +04:00
od_tuners - > ignore_nice_load = input ;
2005-06-01 06:03:47 +04:00
2006-06-29 00:49:52 +04:00
/* we need to re-evaluate prev_cpu_idle */
2005-06-01 06:03:49 +04:00
for_each_online_cpu ( j ) {
2012-10-26 02:47:42 +04:00
struct od_cpu_dbs_info_s * dbs_info ;
2009-06-24 10:13:48 +04:00
dbs_info = & per_cpu ( od_cpu_dbs_info , j ) ;
2012-10-26 02:47:42 +04:00
dbs_info - > cdbs . prev_cpu_idle = get_cpu_idle_time ( j ,
2013-02-28 20:57:32 +04:00
& dbs_info - > cdbs . prev_cpu_wall , od_tuners - > io_is_busy ) ;
2013-08-05 10:58:02 +04:00
if ( od_tuners - > ignore_nice_load )
2012-10-26 02:47:42 +04:00
dbs_info - > cdbs . prev_cpu_nice =
kcpustat_cpu ( j ) . cpustat [ CPUTIME_NICE ] ;
2009-01-23 17:25:02 +03:00
2005-06-01 06:03:47 +04:00
}
return count ;
}
2013-03-27 19:58:58 +04:00
/*
 * store_powersave_bias - store handler for the powersave_bias tunable.
 * @dbs_data: governor data whose tuners are updated
 * @buf: text containing one unsigned integer
 * @count: length of @buf, returned unchanged on success
 *
 * Inputs above 1000 are clamped to 1000. After storing the value,
 * ondemand_powersave_bias_init() is called. Returns @count on success or
 * -EINVAL on a parse failure.
 */
static ssize_t store_powersave_bias ( struct dbs_data * dbs_data , const char * buf ,
size_t count )
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
{
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
unsigned int input ;
int ret ;
ret = sscanf ( buf , " %u " , & input ) ;
if ( ret ! = 1 )
return - EINVAL ;
/* clamp rather than reject values above the maximum of 1000 */
if ( input > 1000 )
input = 1000 ;
2013-03-27 19:58:58 +04:00
od_tuners - > powersave_bias = input ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
ondemand_powersave_bias_init ( ) ;
return count ;
}
2013-03-27 19:58:58 +04:00
/*
 * Macro instantiations for each ondemand tunable. The macros are not defined
 * in this file (presumably they come from cpufreq_governor.h - confirm).
 * NOTE(review): the *_gov_sys and *_gov_pol attribute objects referenced by
 * the attribute arrays below appear to be produced by gov_sys_pol_attr_rw()
 * and gov_sys_pol_attr_ro().
 */
show_store_one ( od , sampling_rate ) ;
show_store_one ( od , io_is_busy ) ;
show_store_one ( od , up_threshold ) ;
show_store_one ( od , sampling_down_factor ) ;
2013-08-05 10:58:02 +04:00
show_store_one ( od , ignore_nice_load ) ;
2013-03-27 19:58:58 +04:00
show_store_one ( od , powersave_bias ) ;
declare_show_sampling_rate_min ( od ) ;
gov_sys_pol_attr_rw ( sampling_rate ) ;
gov_sys_pol_attr_rw ( io_is_busy ) ;
gov_sys_pol_attr_rw ( up_threshold ) ;
gov_sys_pol_attr_rw ( sampling_down_factor ) ;
2013-08-05 10:58:02 +04:00
gov_sys_pol_attr_rw ( ignore_nice_load ) ;
2013-03-27 19:58:58 +04:00
gov_sys_pol_attr_rw ( powersave_bias ) ;
gov_sys_pol_attr_ro ( sampling_rate_min ) ;
/* NULL-terminated list of the *_gov_sys attributes of the ondemand tunables. */
static struct attribute * dbs_attributes_gov_sys [ ] = {
& sampling_rate_min_gov_sys . attr ,
& sampling_rate_gov_sys . attr ,
& up_threshold_gov_sys . attr ,
& sampling_down_factor_gov_sys . attr ,
2013-08-05 10:58:02 +04:00
& ignore_nice_load_gov_sys . attr ,
2013-03-27 19:58:58 +04:00
& powersave_bias_gov_sys . attr ,
& io_is_busy_gov_sys . attr ,
2005-04-17 02:20:36 +04:00
NULL
} ;
2013-03-27 19:58:58 +04:00
/* Attribute group wrapping the *_gov_sys list; used as attr_group_gov_sys. */
static struct attribute_group od_attr_group_gov_sys = {
. attrs = dbs_attributes_gov_sys ,
. name = " ondemand " ,
} ;
/* NULL-terminated list of the *_gov_pol attributes of the ondemand tunables. */
static struct attribute * dbs_attributes_gov_pol [ ] = {
& sampling_rate_min_gov_pol . attr ,
& sampling_rate_gov_pol . attr ,
& up_threshold_gov_pol . attr ,
& sampling_down_factor_gov_pol . attr ,
2013-08-05 10:58:02 +04:00
& ignore_nice_load_gov_pol . attr ,
2013-03-27 19:58:58 +04:00
& powersave_bias_gov_pol . attr ,
& io_is_busy_gov_pol . attr ,
NULL
} ;
/* Attribute group wrapping the *_gov_pol list; used as attr_group_gov_pol. */
static struct attribute_group od_attr_group_gov_pol = {
. attrs = dbs_attributes_gov_pol ,
2005-04-17 02:20:36 +04:00
. name = " ondemand " ,
} ;
/************************** sysfs end ************************/
2015-06-03 13:27:11 +03:00
/*
 * od_init - allocate and initialize the ondemand tuners for @dbs_data.
 * @dbs_data: governor data that receives the tuners and min_sampling_rate
 * @notify: not used in this function body
 *
 * up_threshold and min_sampling_rate depend on whether
 * get_cpu_idle_time_us() reports a value for the current CPU. The remaining
 * tunables get their defaults, with powersave_bias taken from
 * default_powersave_bias.
 *
 * Returns 0 on success or -ENOMEM when the tuners cannot be allocated.
 */
static int od_init ( struct dbs_data * dbs_data , bool notify )
2013-03-27 19:58:58 +04:00
{
struct od_dbs_tuners * tuners ;
u64 idle_time ;
int cpu ;
2013-08-06 21:23:06 +04:00
tuners = kzalloc ( sizeof ( * tuners ) , GFP_KERNEL ) ;
2013-03-27 19:58:58 +04:00
if ( ! tuners ) {
pr_err ( " %s: kzalloc failed \n " , __func__ ) ;
return - ENOMEM ;
}
cpu = get_cpu ( ) ;
idle_time = get_cpu_idle_time_us ( cpu , NULL ) ;
put_cpu ( ) ;
/* -1ULL is treated as "idle micro accounting not available" */
if ( idle_time ! = - 1ULL ) {
/* Idle micro accounting is supported. Use finer thresholds */
tuners - > up_threshold = MICRO_FREQUENCY_UP_THRESHOLD ;
/*
* In nohz / micro accounting case we set the minimum frequency
* not depending on HZ , but fixed ( very low ) . The deferred
* timer might skip some samples if idle / sleeping as needed .
*/
dbs_data - > min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE ;
} else {
tuners - > up_threshold = DEF_FREQUENCY_UP_THRESHOLD ;
/* For correct statistics, we need 10 ticks for each measure */
dbs_data - > min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
jiffies_to_usecs ( 10 ) ;
}
tuners - > sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR ;
2013-08-05 10:58:02 +04:00
tuners - > ignore_nice_load = 0 ;
2013-06-26 00:42:37 +04:00
tuners - > powersave_bias = default_powersave_bias ;
2013-03-27 19:58:58 +04:00
tuners - > io_is_busy = should_io_be_busy ( ) ;
dbs_data - > tuners = tuners ;
return 0 ;
}
2015-06-03 13:27:11 +03:00
/*
 * od_exit - free the tuners that od_init() attached to @dbs_data.
 * @dbs_data: governor data whose tuners are freed
 * @notify: not used in this function body
 */
static void od_exit ( struct dbs_data * dbs_data , bool notify )
2013-03-27 19:58:58 +04:00
{
kfree ( dbs_data - > tuners ) ;
}
2012-10-26 02:47:42 +04:00
/*
 * NOTE(review): presumably expands to the get_cpu_cdbs() and
 * get_cpu_dbs_info_s() accessors referenced by od_dbs_cdata - confirm
 * against the macro definition, which is not in this file.
 */
define_get_cpu_dbs_routines ( od_cpu_dbs_info ) ;
ondemand: Solve a big performance issue by counting IOWAIT time as busy
The ondemand cpufreq governor uses CPU busy time (e.g. not-idle
time) as a measure for scaling the CPU frequency up or down.
If the CPU is busy, the CPU frequency scales up, if it's idle,
the CPU frequency scales down. Effectively, it uses the CPU busy
time as proxy variable for the more nebulous "how critical is
performance right now" question.
This algorithm falls flat on its face in the light of workloads
where you're alternately disk and CPU bound, such as the ever
popular "git grep", but also things like startup of programs and
maildir using email clients... much to the chagrin of Andrew
Morton.
This patch changes the ondemand algorithm to count iowait time
as busy, not idle, time. As shown in the breakdown cases above,
iowait is performance critical often, and by counting iowait,
the proxy variable becomes a more accurate representation of the
"how critical is performance" question.
The problem and fix are both verified with the "perf timechar"
tool.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Jones <davej@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100509082606.3d9f00d0@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-09 19:26:06 +04:00
2012-10-26 02:47:42 +04:00
/*
 * Ondemand-specific callbacks, published through od_dbs_cdata.gov_ops.
 * powersave_bias_target starts as generic_powersave_bias_target and is
 * swapped by od_register_powersave_bias_handler() and its unregister
 * counterpart.
 */
static struct od_ops od_ops = {
. powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu ,
2013-04-02 18:56:56 +04:00
. powersave_bias_target = generic_powersave_bias_target ,
2012-10-26 02:47:42 +04:00
. freq_increase = dbs_freq_increase ,
} ;
2006-06-29 00:51:19 +04:00
2013-03-27 19:58:58 +04:00
/*
 * Governor description passed to cpufreq_governor_dbs(): identifies the
 * governor as GOV_ONDEMAND and wires up the attribute groups, per-CPU
 * accessors, timer and check callbacks, od_ops, init/exit hooks and a
 * statically initialized mutex.
 */
static struct common_dbs_data od_dbs_cdata = {
2012-10-26 02:47:42 +04:00
. governor = GOV_ONDEMAND ,
2013-03-27 19:58:58 +04:00
. attr_group_gov_sys = & od_attr_group_gov_sys ,
. attr_group_gov_pol = & od_attr_group_gov_pol ,
2012-10-26 02:47:42 +04:00
. get_cpu_cdbs = get_cpu_cdbs ,
. get_cpu_dbs_info_s = get_cpu_dbs_info_s ,
. gov_dbs_timer = od_dbs_timer ,
. gov_check_cpu = od_check_cpu ,
. gov_ops = & od_ops ,
2013-03-27 19:58:58 +04:00
. init = od_init ,
. exit = od_exit ,
cpufreq: governor: Serialize governor callbacks
There are several races reported in cpufreq core around governors (only
ondemand and conservative) by different people.
There are at least two race scenarios present in governor code:
(a) Concurrent access/updates of governor internal structures.
It is possible that fields such as 'dbs_data->usage_count', etc. are
accessed simultaneously for different policies using same governor
structure (i.e. CPUFREQ_HAVE_GOVERNOR_PER_POLICY flag unset). And
because of this we can dereference bad pointers.
For example consider a system with two CPUs with separate 'struct
cpufreq_policy' instances. CPU0 governor: ondemand and CPU1: powersave.
CPU0 switching to powersave and CPU1 to ondemand:
CPU0 CPU1
store* store*
cpufreq_governor_exit() cpufreq_governor_init()
dbs_data = cdata->gdbs_data;
if (!--dbs_data->usage_count)
kfree(dbs_data);
dbs_data->usage_count++;
*Bad pointer dereference*
There are other races possible between EXIT and START/STOP/LIMIT as
well. Its really complicated.
(b) Switching governor state in bad sequence:
For example trying to switch a governor to START state, when the
governor is in EXIT state. There are some checks present in
__cpufreq_governor() but they aren't sufficient as they compare events
against 'policy->governor_enabled', where as we need to take governor's
state into account, which can be used by multiple policies.
These two issues need to be solved separately and the responsibility
should be properly divided between cpufreq and governor core.
The first problem is more about the governor core, as it needs to
protect its structures properly. And the second problem should be fixed
in cpufreq core instead of governor, as its all about sequence of
events.
This patch is trying to solve only the first problem.
There are two types of data we need to protect,
- 'struct common_dbs_data': No matter what, there is going to be a
single copy of this per governor.
- 'struct dbs_data': With CPUFREQ_HAVE_GOVERNOR_PER_POLICY flag set, we
will have per-policy copy of this data, otherwise a single copy.
Because of such complexities, the mutex present in 'struct dbs_data' is
insufficient to solve our problem. For example we need to protect
fetching of 'dbs_data' from different structures at the beginning of
cpufreq_governor_dbs(), to make sure it isn't currently being updated.
This can be fixed if we can guarantee serialization of event parsing
code for an individual governor. This is best solved with a mutex per
governor, and the placeholder for that is 'struct common_dbs_data'.
And so this patch moves the mutex from 'struct dbs_data' to 'struct
common_dbs_data' and takes it at the beginning and drops it at the end
of cpufreq_governor_dbs().
Tested with and without following configuration options:
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_DEBUG_RT_MUTEXES=y
CONFIG_DEBUG_PI_LIST=y
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCKDEP=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-03 13:27:13 +03:00
. mutex = __MUTEX_INITIALIZER ( od_dbs_cdata . mutex ) ,
2012-10-26 02:47:42 +04:00
} ;
2005-04-17 02:20:36 +04:00
2013-04-02 18:56:56 +04:00
/*
 * od_set_powersave_bias - set powersave_bias as the default and on every
 * policy currently governed by ondemand.
 * @powersave_bias: value stored in default_powersave_bias and in the tuners
 *
 * Walks the online CPUs between get_online_cpus() and put_online_cpus(). The
 * "done" mask records all CPUs of each policy already visited so that each
 * policy is handled once. CPUs whose per-CPU data has no shared info, and
 * policies not using the ondemand governor, are skipped.
 */
static void od_set_powersave_bias ( unsigned int powersave_bias )
{
struct cpufreq_policy * policy ;
struct dbs_data * dbs_data ;
struct od_dbs_tuners * od_tuners ;
unsigned int cpu ;
cpumask_t done ;
2013-06-26 00:42:37 +04:00
default_powersave_bias = powersave_bias ;
2013-04-02 18:56:56 +04:00
cpumask_clear ( & done ) ;
get_online_cpus ( ) ;
for_each_online_cpu ( cpu ) {
2015-07-18 09:00:59 +03:00
struct cpu_common_dbs_info * shared ;
2013-04-02 18:56:56 +04:00
if ( cpumask_test_cpu ( cpu , & done ) )
continue ;
2015-07-18 09:00:59 +03:00
shared = per_cpu ( od_cpu_dbs_info , cpu ) . cdbs . shared ;
if ( ! shared )
2013-06-26 00:42:37 +04:00
continue ;
2013-04-02 18:56:56 +04:00
2015-07-18 09:00:59 +03:00
policy = shared - > policy ;
2013-04-02 18:56:56 +04:00
/* mark every CPU of this policy as handled before checking the governor */
cpumask_or ( & done , & done , policy - > cpus ) ;
2013-06-26 00:42:37 +04:00
if ( policy - > governor ! = & cpufreq_gov_ondemand )
continue ;
dbs_data = policy - > governor_data ;
od_tuners = dbs_data - > tuners ;
od_tuners - > powersave_bias = default_powersave_bias ;
2013-04-02 18:56:56 +04:00
}
put_online_cpus ( ) ;
}
/*
 * od_register_powersave_bias_handler - install a custom powersave-bias
 * target callback and apply a new bias value.
 * @f: callback stored in od_ops.powersave_bias_target
 * @powersave_bias: bias passed to od_set_powersave_bias()
 */
void od_register_powersave_bias_handler ( unsigned int ( * f )
( struct cpufreq_policy * , unsigned int , unsigned int ) ,
unsigned int powersave_bias )
{
od_ops . powersave_bias_target = f ;
od_set_powersave_bias ( powersave_bias ) ;
}
EXPORT_SYMBOL_GPL ( od_register_powersave_bias_handler ) ;
/*
 * od_unregister_powersave_bias_handler - restore
 * generic_powersave_bias_target as the target callback and reset the
 * powersave bias to 0.
 */
void od_unregister_powersave_bias_handler ( void )
{
od_ops . powersave_bias_target = generic_powersave_bias_target ;
od_set_powersave_bias ( 0 ) ;
}
EXPORT_SYMBOL_GPL ( od_unregister_powersave_bias_handler ) ;
2012-10-26 02:47:42 +04:00
/*
 * od_cpufreq_governor_dbs - governor callback of cpufreq_gov_ondemand.
 * @policy: policy the event applies to
 * @event: governor event code, forwarded unchanged
 *
 * Forwards to cpufreq_governor_dbs() with the ondemand description
 * od_dbs_cdata and returns its result.
 */
static int od_cpufreq_governor_dbs ( struct cpufreq_policy * policy ,
unsigned int event )
2005-04-17 02:20:36 +04:00
{
2013-03-27 19:58:58 +04:00
return cpufreq_governor_dbs ( policy , & od_dbs_cdata , event ) ;
2005-04-17 02:20:36 +04:00
}
2012-10-26 02:47:42 +04:00
/*
 * The ondemand governor object registered with the cpufreq core. It has
 * file-local (static) linkage unless CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is
 * defined, in which case it has external linkage.
 */
# ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static
2010-05-09 19:26:51 +04:00
# endif
2012-10-26 02:47:42 +04:00
struct cpufreq_governor cpufreq_gov_ondemand = {
. name = " ondemand " ,
. governor = od_cpufreq_governor_dbs ,
. max_transition_latency = TRANSITION_LATENCY_LIMIT ,
. owner = THIS_MODULE ,
} ;
2005-04-17 02:20:36 +04:00
/*
 * Init entry point: registers the ondemand governor and returns the result
 * of cpufreq_register_governor().
 */
static int __init cpufreq_gov_dbs_init ( void )
{
2011-01-26 14:12:50 +03:00
return cpufreq_register_governor ( & cpufreq_gov_ondemand ) ;
2005-04-17 02:20:36 +04:00
}
/* Exit entry point: unregisters the ondemand governor. */
static void __exit cpufreq_gov_dbs_exit ( void )
{
2007-10-03 00:28:12 +04:00
cpufreq_unregister_governor ( & cpufreq_gov_ondemand ) ;
2005-04-17 02:20:36 +04:00
}
2006-06-29 00:52:18 +04:00
/* Module metadata. */
MODULE_AUTHOR ( " Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> " ) ;
MODULE_AUTHOR ( " Alexey Starikovskiy <alexey.y.starikovskiy@intel.com> " ) ;
MODULE_DESCRIPTION ( " 'cpufreq_ondemand' - A dynamic cpufreq governor for "
2009-01-18 09:43:44 +03:00
" Low Latency Frequency Transition capable processors " ) ;
2006-06-29 00:52:18 +04:00
MODULE_LICENSE ( " GPL " ) ;
2005-04-17 02:20:36 +04:00
2008-01-18 02:21:08 +03:00
/*
 * When ondemand is the default governor the init function is registered via
 * fs_initcall() instead of module_init().
 */
# ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
fs_initcall ( cpufreq_gov_dbs_init ) ;
# else
2005-04-17 02:20:36 +04:00
module_init ( cpufreq_gov_dbs_init ) ;
2008-01-18 02:21:08 +03:00
# endif
2005-04-17 02:20:36 +04:00
module_exit ( cpufreq_gov_dbs_exit ) ;