2010-03-08 22:07:30 +03:00
/*
* intel_idle . c - native hardware idle loop for modern Intel processors
*
* Copyright ( c ) 2010 , Intel Corporation .
* Len Brown < len . brown @ intel . com >
*
* This program is free software ; you can redistribute it and / or modify it
* under the terms and conditions of the GNU General Public License ,
* version 2 , as published by the Free Software Foundation .
*
* This program is distributed in the hope it will be useful , but WITHOUT
* ANY WARRANTY ; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE . See the GNU General Public License for
* more details .
*
* You should have received a copy of the GNU General Public License along with
* this program ; if not , write to the Free Software Foundation , Inc . ,
* 51 Franklin St - Fifth Floor , Boston , MA 02110 - 1301 USA .
*/
/*
* intel_idle is a cpuidle driver that loads on specific Intel processors
* in lieu of the legacy ACPI processor_idle driver . The intent is to
* make Linux more efficient on these processors , as intel_idle knows
* more than ACPI , as well as make Linux more immune to ACPI BIOS bugs .
*/
/*
* Design Assumptions
*
* All CPUs have same idle states as boot CPU
*
* Chipset BM_STS ( bus master status ) bit is a NOP
 * for preventing entry into deep C-states
*/
/*
* Known limitations
*
* The driver currently initializes for_each_online_cpu ( ) upon modprobe .
 * It is unaware of subsequent processors hot-added to the system.
* This means that if you boot with maxcpus = n and later online
* processors above n , those processors will use C1 only .
*
 * ACPI has a .suspend hack to turn off deep c-states during suspend
* to avoid complications with the lapic timer workaround .
* Have not seen issues with suspend , but may need same workaround here .
*
* There is currently no kernel - based automatic probing / loading mechanism
* if the driver is built as a module .
*/
/* un-comment DEBUG to enable pr_debug() statements */
# define DEBUG
# include <linux/kernel.h>
# include <linux/cpuidle.h>
# include <linux/clockchips.h>
# include <linux/hrtimer.h> /* ktime_get_real() */
# include <trace/events/power.h>
# include <linux/sched.h>
2011-01-10 04:38:12 +03:00
# include <linux/notifier.h>
# include <linux/cpu.h>
2010-09-18 02:36:40 +04:00
# include <asm/mwait.h>
2010-03-08 22:07:30 +03:00
#define INTEL_IDLE_VERSION "0.4"
#define PREFIX "intel_idle: "

/* cpuidle core registration record for this driver */
static struct cpuidle_driver intel_idle_driver = {
	.name	= "intel_idle",
	.owner	= THIS_MODULE,
};
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1 ;
2010-05-28 10:22:03 +04:00
static unsigned int mwait_substates ;
2010-03-08 22:07:30 +03:00
2011-01-10 04:38:12 +03:00
# define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
2010-03-08 22:07:30 +03:00
/* Reliable LAPIC Timer States, bit 1 for C1 etc. */
2010-07-07 08:12:03 +04:00
static unsigned int lapic_timer_reliable_states = ( 1 < < 1 ) ; /* Default to only C1 */
2010-03-08 22:07:30 +03:00
2010-08-07 22:10:03 +04:00
static struct cpuidle_device __percpu * intel_idle_cpuidle_devices ;
2010-03-08 22:07:30 +03:00
static int intel_idle ( struct cpuidle_device * dev , struct cpuidle_state * state ) ;
static struct cpuidle_state * cpuidle_state_table ;
2011-01-12 10:51:20 +03:00
/*
* Set this flag for states where the HW flushes the TLB for us
* and so we don ' t need cross - calls to keep it consistent .
* If this flag is set , SW flushes the TLB , so even if the
* HW doesn ' t do the flushing , this flag is safe to use .
*/
# define CPUIDLE_FLAG_TLB_FLUSHED 0x10000
2010-03-08 22:07:30 +03:00
/*
* States are indexed by the cstate number ,
* which is also the index into the MWAIT hint array .
* Thus C0 is a dummy .
*/
static struct cpuidle_state nehalem_cstates [ MWAIT_MAX_NUM_CSTATES ] = {
{ /* MWAIT C0 */ } ,
{ /* MWAIT C1 */
. name = " NHM-C1 " ,
. desc = " MWAIT 0x00 " ,
. driver_data = ( void * ) 0x00 ,
. flags = CPUIDLE_FLAG_TIME_VALID ,
. exit_latency = 3 ,
. target_residency = 6 ,
. enter = & intel_idle } ,
{ /* MWAIT C2 */
. name = " NHM-C3 " ,
. desc = " MWAIT 0x10 " ,
. driver_data = ( void * ) 0x10 ,
2010-10-01 05:19:07 +04:00
. flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED ,
2010-03-08 22:07:30 +03:00
. exit_latency = 20 ,
. target_residency = 80 ,
. enter = & intel_idle } ,
{ /* MWAIT C3 */
. name = " NHM-C6 " ,
. desc = " MWAIT 0x20 " ,
. driver_data = ( void * ) 0x20 ,
2010-10-01 05:19:07 +04:00
. flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED ,
2010-03-08 22:07:30 +03:00
. exit_latency = 200 ,
. target_residency = 800 ,
. enter = & intel_idle } ,
} ;
2010-07-07 08:12:03 +04:00
static struct cpuidle_state snb_cstates [ MWAIT_MAX_NUM_CSTATES ] = {
{ /* MWAIT C0 */ } ,
{ /* MWAIT C1 */
. name = " SNB-C1 " ,
. desc = " MWAIT 0x00 " ,
. driver_data = ( void * ) 0x00 ,
. flags = CPUIDLE_FLAG_TIME_VALID ,
. exit_latency = 1 ,
2010-12-14 02:28:22 +03:00
. target_residency = 1 ,
2010-07-07 08:12:03 +04:00
. enter = & intel_idle } ,
{ /* MWAIT C2 */
. name = " SNB-C3 " ,
. desc = " MWAIT 0x10 " ,
. driver_data = ( void * ) 0x10 ,
2010-10-23 10:33:50 +04:00
. flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED ,
2010-07-07 08:12:03 +04:00
. exit_latency = 80 ,
2010-12-14 02:28:22 +03:00
. target_residency = 211 ,
2010-07-07 08:12:03 +04:00
. enter = & intel_idle } ,
{ /* MWAIT C3 */
. name = " SNB-C6 " ,
. desc = " MWAIT 0x20 " ,
. driver_data = ( void * ) 0x20 ,
2010-10-23 10:33:50 +04:00
. flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED ,
2010-07-07 08:12:03 +04:00
. exit_latency = 104 ,
2010-12-14 02:28:22 +03:00
. target_residency = 345 ,
2010-07-07 08:12:03 +04:00
. enter = & intel_idle } ,
{ /* MWAIT C4 */
. name = " SNB-C7 " ,
. desc = " MWAIT 0x30 " ,
. driver_data = ( void * ) 0x30 ,
2010-10-23 10:33:50 +04:00
. flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED ,
2010-07-07 08:12:03 +04:00
. exit_latency = 109 ,
2010-12-14 02:28:22 +03:00
. target_residency = 345 ,
2010-07-07 08:12:03 +04:00
. enter = & intel_idle } ,
} ;
2010-03-08 22:07:30 +03:00
static struct cpuidle_state atom_cstates [ MWAIT_MAX_NUM_CSTATES ] = {
{ /* MWAIT C0 */ } ,
{ /* MWAIT C1 */
. name = " ATM-C1 " ,
. desc = " MWAIT 0x00 " ,
. driver_data = ( void * ) 0x00 ,
. flags = CPUIDLE_FLAG_TIME_VALID ,
. exit_latency = 1 ,
. target_residency = 4 ,
. enter = & intel_idle } ,
{ /* MWAIT C2 */
. name = " ATM-C2 " ,
. desc = " MWAIT 0x10 " ,
. driver_data = ( void * ) 0x10 ,
. flags = CPUIDLE_FLAG_TIME_VALID ,
. exit_latency = 20 ,
. target_residency = 80 ,
. enter = & intel_idle } ,
{ /* MWAIT C3 */ } ,
{ /* MWAIT C4 */
. name = " ATM-C4 " ,
. desc = " MWAIT 0x30 " ,
. driver_data = ( void * ) 0x30 ,
2010-10-01 05:19:07 +04:00
. flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED ,
2010-03-08 22:07:30 +03:00
. exit_latency = 100 ,
. target_residency = 400 ,
. enter = & intel_idle } ,
{ /* MWAIT C5 */ } ,
{ /* MWAIT C6 */
. name = " ATM-C6 " ,
2010-10-05 21:43:14 +04:00
. desc = " MWAIT 0x52 " ,
. driver_data = ( void * ) 0x52 ,
2010-10-01 05:19:07 +04:00
. flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED ,
2010-10-05 21:43:14 +04:00
. exit_latency = 140 ,
. target_residency = 560 ,
. enter = & intel_idle } ,
2010-03-08 22:07:30 +03:00
} ;
/**
* intel_idle
* @ dev : cpuidle_device
* @ state : cpuidle state
*
*/
static int intel_idle ( struct cpuidle_device * dev , struct cpuidle_state * state )
{
unsigned long ecx = 1 ; /* break on interrupt flag */
unsigned long eax = ( unsigned long ) cpuidle_get_statedata ( state ) ;
unsigned int cstate ;
ktime_t kt_before , kt_after ;
s64 usec_delta ;
int cpu = smp_processor_id ( ) ;
cstate = ( ( ( eax ) > > MWAIT_SUBSTATE_SIZE ) & MWAIT_CSTATE_MASK ) + 1 ;
local_irq_disable ( ) ;
2010-10-01 05:19:07 +04:00
/*
2010-10-16 04:43:06 +04:00
* leave_mm ( ) to avoid costly and often unnecessary wakeups
* for flushing the user TLB ' s associated with the active mm .
2010-10-01 05:19:07 +04:00
*/
2010-10-16 04:43:06 +04:00
if ( state - > flags & CPUIDLE_FLAG_TLB_FLUSHED )
2010-10-01 05:19:07 +04:00
leave_mm ( cpu ) ;
2010-03-08 22:07:30 +03:00
if ( ! ( lapic_timer_reliable_states & ( 1 < < ( cstate ) ) ) )
clockevents_notify ( CLOCK_EVT_NOTIFY_BROADCAST_ENTER , & cpu ) ;
kt_before = ktime_get_real ( ) ;
stop_critical_timings ( ) ;
if ( ! need_resched ( ) ) {
__monitor ( ( void * ) & current_thread_info ( ) - > flags , 0 , 0 ) ;
smp_mb ( ) ;
if ( ! need_resched ( ) )
__mwait ( eax , ecx ) ;
}
start_critical_timings ( ) ;
kt_after = ktime_get_real ( ) ;
usec_delta = ktime_to_us ( ktime_sub ( kt_after , kt_before ) ) ;
local_irq_enable ( ) ;
if ( ! ( lapic_timer_reliable_states & ( 1 < < ( cstate ) ) ) )
clockevents_notify ( CLOCK_EVT_NOTIFY_BROADCAST_EXIT , & cpu ) ;
return usec_delta ;
}
2011-01-10 04:38:12 +03:00
static void __setup_broadcast_timer ( void * arg )
{
unsigned long reason = ( unsigned long ) arg ;
int cpu = smp_processor_id ( ) ;
reason = reason ?
CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF ;
clockevents_notify ( reason , & cpu ) ;
}
2011-01-24 11:00:01 +03:00
static int setup_broadcast_cpuhp_notify ( struct notifier_block * n ,
2011-01-10 04:38:12 +03:00
unsigned long action , void * hcpu )
{
int hotcpu = ( unsigned long ) hcpu ;
switch ( action & 0xf ) {
case CPU_ONLINE :
smp_call_function_single ( hotcpu , __setup_broadcast_timer ,
( void * ) true , 1 ) ;
break ;
}
return NOTIFY_OK ;
}
2011-01-24 11:00:01 +03:00
static struct notifier_block setup_broadcast_notifier = {
2011-01-10 04:38:12 +03:00
. notifier_call = setup_broadcast_cpuhp_notify ,
} ;
2010-03-08 22:07:30 +03:00
/*
* intel_idle_probe ( )
*/
static int intel_idle_probe ( void )
{
2010-05-28 10:22:03 +04:00
unsigned int eax , ebx , ecx ;
2010-03-08 22:07:30 +03:00
if ( max_cstate = = 0 ) {
pr_debug ( PREFIX " disabled \n " ) ;
return - EPERM ;
}
if ( boot_cpu_data . x86_vendor ! = X86_VENDOR_INTEL )
return - ENODEV ;
if ( ! boot_cpu_has ( X86_FEATURE_MWAIT ) )
return - ENODEV ;
if ( boot_cpu_data . cpuid_level < CPUID_MWAIT_LEAF )
return - ENODEV ;
2010-05-28 10:22:03 +04:00
cpuid ( CPUID_MWAIT_LEAF , & eax , & ebx , & ecx , & mwait_substates ) ;
2010-03-08 22:07:30 +03:00
if ( ! ( ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED ) | |
! ( ecx & CPUID5_ECX_INTERRUPT_BREAK ) )
return - ENODEV ;
2010-05-28 10:22:03 +04:00
pr_debug ( PREFIX " MWAIT substates: 0x%x \n " , mwait_substates ) ;
2010-03-08 22:07:30 +03:00
if ( boot_cpu_data . x86 ! = 6 ) /* family 6 */
return - ENODEV ;
switch ( boot_cpu_data . x86_model ) {
case 0x1A : /* Core i7, Xeon 5500 series */
case 0x1E : /* Core i7 and i5 Processor - Lynnfield Jasper Forest */
case 0x1F : /* Core i7 and i5 Processor - Nehalem */
case 0x2E : /* Nehalem-EX Xeon */
2010-07-27 07:40:19 +04:00
case 0x2F : /* Westmere-EX Xeon */
2010-03-08 22:07:30 +03:00
case 0x25 : /* Westmere */
case 0x2C : /* Westmere */
cpuidle_state_table = nehalem_cstates ;
break ;
case 0x1C : /* 28 - Atom Processor */
2010-07-22 07:42:25 +04:00
case 0x26 : /* 38 - Lincroft Atom Processor */
2010-03-08 22:07:30 +03:00
cpuidle_state_table = atom_cstates ;
break ;
2010-07-07 08:12:03 +04:00
case 0x2A : /* SNB */
case 0x2D : /* SNB Xeon */
cpuidle_state_table = snb_cstates ;
break ;
2010-03-08 22:07:30 +03:00
default :
pr_debug ( PREFIX " does not run on family %d model %d \n " ,
boot_cpu_data . x86 , boot_cpu_data . x86_model ) ;
return - ENODEV ;
}
2010-12-02 09:19:32 +03:00
if ( boot_cpu_has ( X86_FEATURE_ARAT ) ) /* Always Reliable APIC Timer */
2011-01-10 04:38:12 +03:00
lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE ;
else {
smp_call_function ( __setup_broadcast_timer , ( void * ) true , 1 ) ;
register_cpu_notifier ( & setup_broadcast_notifier ) ;
}
2010-12-02 09:19:32 +03:00
2010-03-08 22:07:30 +03:00
pr_debug ( PREFIX " v " INTEL_IDLE_VERSION
" model 0x%X \n " , boot_cpu_data . x86_model ) ;
pr_debug ( PREFIX " lapic_timer_reliable_states 0x%x \n " ,
lapic_timer_reliable_states ) ;
return 0 ;
}
/*
* intel_idle_cpuidle_devices_uninit ( )
* unregister , free cpuidle_devices
*/
static void intel_idle_cpuidle_devices_uninit ( void )
{
int i ;
struct cpuidle_device * dev ;
for_each_online_cpu ( i ) {
dev = per_cpu_ptr ( intel_idle_cpuidle_devices , i ) ;
cpuidle_unregister_device ( dev ) ;
}
free_percpu ( intel_idle_cpuidle_devices ) ;
return ;
}
/*
 * intel_idle_cpuidle_devices_init()
 * allocate, initialize, register cpuidle_devices
 *
 * For each online CPU, copies the usable entries of cpuidle_state_table
 * (bounded by max_cstate and by CPUID.MWAIT sub-state support) into the
 * per-cpu device and registers it.  Returns 0 on success, -ENOMEM or
 * -EIO on failure (all prior registrations are rolled back).
 */
static int intel_idle_cpuidle_devices_init(void)
{
	int i, cstate;
	struct cpuidle_device *dev;

	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
	if (intel_idle_cpuidle_devices == NULL)
		return -ENOMEM;

	for_each_online_cpu(i) {
		dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);

		dev->state_count = 1;	/* slot 0 (C0) is a dummy */

		for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
			int num_substates;

			if (cstate > max_cstate) {
				printk(PREFIX "max_cstate %d reached\n",
					max_cstate);
				break;
			}

			/* does the state exist in CPUID.MWAIT? */
			num_substates = (mwait_substates >> ((cstate) * 4))
						& MWAIT_SUBSTATE_MASK;
			if (num_substates == 0)
				continue;
			/* is the state not enabled? */
			if (cpuidle_state_table[cstate].enter == NULL) {
				/* does the driver not know about the state? */
				if (*cpuidle_state_table[cstate].name == '\0')
					/* fix: terminate with \n so the log line is flushed */
					pr_debug(PREFIX "unaware of model 0x%x"
						" MWAIT %d please"
						" contact lenb@kernel.org\n",
					boot_cpu_data.x86_model, cstate);
				continue;
			}

			/* TSC may stop in C3 and deeper on pre-NONSTOP_TSC parts */
			if ((cstate > 2) &&
				!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halts in idle"
					" states deeper than C2");

			dev->states[dev->state_count] =	/* structure copy */
				cpuidle_state_table[cstate];

			dev->state_count += 1;
		}

		dev->cpu = i;
		if (cpuidle_register_device(dev)) {
			pr_debug(PREFIX "cpuidle_register_device %d failed!\n",
				 i);
			intel_idle_cpuidle_devices_uninit();
			return -EIO;
		}
	}

	return 0;
}
/*
 * Module entry point: probe the CPU, register the driver, then
 * register a cpuidle device per online CPU.
 */
static int __init intel_idle_init(void)
{
	int err;

	/* Do not load intel_idle at all for now if idle= is passed */
	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
		return -ENODEV;

	err = intel_idle_probe();
	if (err)
		return err;

	err = cpuidle_register_driver(&intel_idle_driver);
	if (err) {
		printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
			cpuidle_get_driver()->name);
		return err;
	}

	err = intel_idle_cpuidle_devices_init();
	if (err) {
		cpuidle_unregister_driver(&intel_idle_driver);
		return err;
	}

	return 0;
}
/*
 * Module exit: unregister all devices and the driver, then restore
 * per-cpu broadcast timer state if we had switched it on at probe time.
 */
static void __exit intel_idle_exit(void)
{
	intel_idle_cpuidle_devices_uninit();
	cpuidle_unregister_driver(&intel_idle_driver);

	if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) {
		/*
		 * smp_call_function() skips the calling CPU; use
		 * on_each_cpu() so the local CPU's broadcast timer is
		 * switched off as well.
		 */
		on_each_cpu(__setup_broadcast_timer, (void *)false, 1);
		unregister_cpu_notifier(&setup_broadcast_notifier);
	}
}
/* module entry/exit hooks; init also runs at boot when built in */
module_init ( intel_idle_init ) ;
module_exit ( intel_idle_exit ) ;
/* intel_idle.max_cstate=N caps the deepest C-state; 0 disables the driver */
module_param ( max_cstate , int , 0444 ) ;
MODULE_AUTHOR ( " Len Brown <len.brown@intel.com> " ) ;
MODULE_DESCRIPTION ( " Cpuidle driver for Intel Hardware v " INTEL_IDLE_VERSION ) ;
MODULE_LICENSE ( " GPL " ) ;