2005-04-16 15:20:36 -07:00
# include <linux/pci.h>
# include <linux/module.h>
PCI: add PCI Express ASPM support
PCI Express ASPM defines a protocol for PCI Express components in the D0
state to reduce Link power by placing their Links into a low power state
and instructing the other end of the Link to do likewise. This
capability allows hardware-autonomous, dynamic Link power reduction
beyond what is achievable by software-only controlled power management.
However, the device should be configured appropriately by software.
Enabling ASPM will save power, but will introduce device latency.
This patch adds ASPM support to Linux. It introduces a global policy for
ASPM, which can be controlled through the sysfs file
/sys/module/pcie_aspm/parameters/policy. The interface can be used as a
boot option too. Currently the following settings are available:
-default, BIOS default setting
-powersave, highest power saving mode, enable all available ASPM
state and clock power management
-performance, highest performance, disable ASPM and clock power
management
By default, the 'default' policy is used currently.
In my test, power difference between powersave mode and performance mode
is about 1.3w in a system with 3 PCIE links.
Note: some devices might not work well with ASPM, because of either a
chipset issue or a device issue. The patch provides an API
(pci_disable_link_state) that a driver can use to disable ASPM for a
specific device.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2008-02-25 09:46:41 +08:00
# include <linux/pci-aspm.h>
2005-04-16 15:20:36 -07:00
# include "pci.h"
static void pci_free_resources ( struct pci_dev * dev )
{
int i ;
msi_remove_pci_irq_vectors ( dev ) ;
pci_cleanup_rom ( dev ) ;
for ( i = 0 ; i < PCI_NUM_RESOURCES ; i + + ) {
struct resource * res = dev - > resource + i ;
if ( res - > parent )
release_resource ( res ) ;
}
}
2006-09-12 10:16:36 -07:00
static void pci_stop_dev ( struct pci_dev * dev )
2005-04-16 15:20:36 -07:00
{
2008-02-14 14:56:56 -08:00
if ( dev - > is_added ) {
2005-04-28 00:25:49 -07:00
pci_proc_detach_device ( dev ) ;
pci_remove_sysfs_dev_files ( dev ) ;
device_unregister ( & dev - > dev ) ;
2008-02-14 14:56:56 -08:00
dev - > is_added = 0 ;
2005-04-28 00:25:49 -07:00
}
PCI: add PCI Express ASPM support
PCI Express ASPM defines a protocol for PCI Express components in the D0
state to reduce Link power by placing their Links into a low power state
and instructing the other end of the Link to do likewise. This
capability allows hardware-autonomous, dynamic Link power reduction
beyond what is achievable by software-only controlled power management.
However, The device should be configured by software appropriately.
Enabling ASPM will save power, but will introduce device latency.
This patch adds ASPM support in Linux. It introduces a global policy for
ASPM, a sysfs file /sys/module/pcie_aspm/parameters/policy can control
it. The interface can be used as a boot option too. Currently we have
below setting:
-default, BIOS default setting
-powersave, highest power saving mode, enable all available ASPM
state and clock power management
-performance, highest performance, disable ASPM and clock power
management
By default, the 'default' policy is used currently.
In my test, power difference between powersave mode and performance mode
is about 1.3w in a system with 3 PCIE links.
Note: some devices might not work well with aspm, either because chipset
issue or device issue. The patch provide API (pci_disable_link_state),
driver can disable ASPM for specific device.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2008-02-25 09:46:41 +08:00
if ( dev - > bus - > self )
pcie_aspm_exit_link_state ( dev ) ;
2006-09-12 10:16:36 -07:00
}
static void pci_destroy_dev ( struct pci_dev * dev )
{
2005-04-16 15:20:36 -07:00
/* Remove the device from the device lists, and prevent any further
* list accesses from this device */
2006-06-02 12:35:43 +08:00
down_write ( & pci_bus_sem ) ;
2005-04-16 15:20:36 -07:00
list_del ( & dev - > bus_list ) ;
dev - > bus_list . next = dev - > bus_list . prev = NULL ;
2006-06-02 12:35:43 +08:00
up_write ( & pci_bus_sem ) ;
2005-04-16 15:20:36 -07:00
pci_free_resources ( dev ) ;
pci_dev_put ( dev ) ;
}
/**
 * pci_remove_device_safe - remove an unused hotplug device
 * @dev: the device to remove
 *
 * Delete the device structure from the device lists, but only if the
 * device in question is not being used by a driver.
 *
 * Returns 0 on success, or -EBUSY when a driver is bound to @dev.
 *
 * This function is currently compiled out (#if 0).
 * NOTE(review): the historical comment also promised a userspace
 * (/sbin/hotplug) notification; pci_destroy_dev() as it stands only
 * unlinks and frees the device - confirm before re-enabling.
 */
#if 0
int pci_remove_device_safe(struct pci_dev *dev)
{
	if (!pci_dev_driver(dev)) {
		pci_destroy_dev(dev);
		return 0;
	}

	/* A driver is still bound to the device. */
	return -EBUSY;
}
#endif /* 0 */
2005-04-16 15:20:36 -07:00
void pci_remove_bus ( struct pci_bus * pci_bus )
{
pci_proc_detach_bus ( pci_bus ) ;
2006-06-02 12:35:43 +08:00
down_write ( & pci_bus_sem ) ;
2005-04-16 15:20:36 -07:00
list_del ( & pci_bus - > node ) ;
2006-06-02 12:35:43 +08:00
up_write ( & pci_bus_sem ) ;
2009-01-28 18:27:21 +08:00
if ( ! pci_bus - > is_added )
return ;
2005-04-16 15:20:36 -07:00
pci_remove_legacy_files ( pci_bus ) ;
2007-05-22 22:47:54 -04:00
device_unregister ( & pci_bus - > dev ) ;
2005-04-16 15:20:36 -07:00
}
EXPORT_SYMBOL ( pci_remove_bus ) ;
2012-02-04 22:55:00 -08:00
static void __pci_remove_behind_bridge ( struct pci_dev * dev ) ;
2005-04-16 15:20:36 -07:00
/**
2012-02-25 13:54:20 -08:00
* pci_stop_and_remove_bus_device - remove a PCI device and any children
2005-04-16 15:20:36 -07:00
* @ dev : the device to remove
*
* Remove a PCI device from the device lists , informing the drivers
* that the device has been removed . We also remove any subordinate
* buses and children in a depth - first manner .
*
* For each device we remove , delete the device structure from the
* device lists , remove the / proc entry , and notify userspace
* ( / sbin / hotplug ) .
*/
2011-11-22 21:06:53 -08:00
static void __pci_remove_bus_device ( struct pci_dev * dev )
2005-04-16 15:20:36 -07:00
{
if ( dev - > subordinate ) {
struct pci_bus * b = dev - > subordinate ;
2012-02-04 22:55:00 -08:00
__pci_remove_behind_bridge ( dev ) ;
2005-04-16 15:20:36 -07:00
pci_remove_bus ( b ) ;
dev - > subordinate = NULL ;
}
pci_destroy_dev ( dev ) ;
}
2012-02-25 13:54:20 -08:00
void pci_stop_and_remove_bus_device ( struct pci_dev * dev )
2011-11-22 21:06:53 -08:00
{
pci_stop_bus_device ( dev ) ;
__pci_remove_bus_device ( dev ) ;
}
2005-04-16 15:20:36 -07:00
2012-02-04 22:55:00 -08:00
static void __pci_remove_behind_bridge ( struct pci_dev * dev )
{
struct list_head * l , * n ;
if ( dev - > subordinate )
list_for_each_safe ( l , n , & dev - > subordinate - > devices )
__pci_remove_bus_device ( pci_dev_b ( l ) ) ;
}
static void pci_stop_behind_bridge ( struct pci_dev * dev )
{
struct list_head * l , * n ;
if ( dev - > subordinate )
list_for_each_safe ( l , n , & dev - > subordinate - > devices )
pci_stop_bus_device ( pci_dev_b ( l ) ) ;
}
2005-04-16 15:20:36 -07:00
/**
 * pci_remove_behind_bridge - remove all devices behind a PCI bridge
 * @dev: PCI bridge device
 *
 * Remove all devices on the bus behind @dev; the bridge itself is left
 * in place.  Child buses and the devices they contain are removed as
 * well, in a depth-first manner.
 *
 * The work is done in two passes: every device is stopped first, and
 * only then are the devices removed.
 */
void pci_remove_behind_bridge(struct pci_dev *dev)
{
	/* Pass 1: stop everything behind the bridge. */
	pci_stop_behind_bridge(dev);

	/* Pass 2: remove the devices that were just stopped. */
	__pci_remove_behind_bridge(dev);
}
2006-09-12 10:16:36 -07:00
static void pci_stop_bus_devices ( struct pci_bus * bus )
{
struct list_head * l , * n ;
PCI: make sriov work with hotplug remove
When hot removing a pci express module that has a pcie switch and supports
SRIOV, we got:
[ 5918.610127] pciehp 0000:80:02.2:pcie04: pcie_isr: intr_loc 1
[ 5918.615779] pciehp 0000:80:02.2:pcie04: Attention button interrupt received
[ 5918.622730] pciehp 0000:80:02.2:pcie04: Button pressed on Slot(3)
[ 5918.629002] pciehp 0000:80:02.2:pcie04: pciehp_get_power_status: SLOTCTRL a8 value read 1f9
[ 5918.637416] pciehp 0000:80:02.2:pcie04: PCI slot #3 - powering off due to button press.
[ 5918.647125] pciehp 0000:80:02.2:pcie04: pcie_isr: intr_loc 10
[ 5918.653039] pciehp 0000:80:02.2:pcie04: pciehp_green_led_blink: SLOTCTRL a8 write cmd 200
[ 5918.661229] pciehp 0000:80:02.2:pcie04: pciehp_set_attention_status: SLOTCTRL a8 write cmd c0
[ 5924.667627] pciehp 0000:80:02.2:pcie04: Disabling domain:bus:device=0000:b0:00
[ 5924.674909] pciehp 0000:80:02.2:pcie04: pciehp_get_power_status: SLOTCTRL a8 value read 2f9
[ 5924.683262] pciehp 0000:80:02.2:pcie04: pciehp_unconfigure_device: domain:bus:dev = 0000:b0:00
[ 5924.693976] libfcoe_device_notification: NETDEV_UNREGISTER eth6
[ 5924.764979] libfcoe_device_notification: NETDEV_UNREGISTER eth14
[ 5924.873539] libfcoe_device_notification: NETDEV_UNREGISTER eth15
[ 5924.995209] libfcoe_device_notification: NETDEV_UNREGISTER eth16
[ 5926.114407] sxge 0000:b2:00.0: PCI INT A disabled
[ 5926.119342] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 5926.127189] IP: [<ffffffff81353a3b>] pci_stop_bus_device+0x33/0x83
[ 5926.133377] PGD 0
[ 5926.135402] Oops: 0000 [#1] SMP
[ 5926.138659] CPU 2
[ 5926.140499] Modules linked in:
...
[ 5926.143754]
[ 5926.275823] Call Trace:
[ 5926.278267] [<ffffffff81353a38>] pci_stop_bus_device+0x30/0x83
[ 5926.284180] [<ffffffff81353af4>] pci_remove_bus_device+0x1a/0xba
[ 5926.290264] [<ffffffff81366311>] pciehp_unconfigure_device+0x110/0x17b
[ 5926.296866] [<ffffffff81365dd9>] ? pciehp_disable_slot+0x188/0x188
[ 5926.303123] [<ffffffff81365d6f>] pciehp_disable_slot+0x11e/0x188
[ 5926.309206] [<ffffffff81365e68>] pciehp_power_thread+0x8f/0xe0
...
+-[0000:80]-+-00.0-[81-8f]--
| +-01.0-[90-9f]--
| +-02.0-[a0-af]--
| +-02.2-[b0-bf]----00.0-[b1-b3]--+-02.0-[b2]--+-00.0 Device
| | | +-00.1 Device
| | | +-00.2 Device
| | | \-00.3 Device
| | \-03.0-[b3]--+-00.0 Device
| | +-00.1 Device
| | +-00.2 Device
| | \-00.3 Device
root complex: 80:02.2
pci express modules: have pcie switch and are listed as b0:00.0, b1:02.0 and b1:03.0.
end devices are b2:00.0 and b3.00.0.
VFs are: b2:00.1,... b2:00.3, and b3:00.1,...,b3:00.3
Root cause: when doing pci_stop_bus_device() with phys fn, it will stop
virt fn and remove the fn, so
list_for_each_safe(l, n, &bus->devices)
will have problem to refer freed n that is pointed to vf entry.
Solution is just replacing list_for_each_safe() with
list_for_each_prev_safe(). This will make sure we can get valid n pointer
to PF instead of the freed VF pointer (because newly added devices are
inserted to the bus->devices list tail).
During reviewing the patch, Bjorn said:
| The PCI hot-remove path calls pci_stop_bus_devices() via
| pci_remove_bus_device().
|
| pci_stop_bus_devices() traverses the bus->devices list (point A below),
| stopping each device in turn, which calls the driver remove() method. When
| the device is an SR-IOV PF, the driver calls pci_disable_sriov(), which
| also uses pci_remove_bus_device() to remove the VF devices from the
| bus->devices list (point B).
|
| pci_remove_bus_device
| pci_stop_bus_device
| pci_stop_bus_devices(subordinate)
| list_for_each(bus->devices) <-- A
| pci_stop_bus_device(PF)
| ...
| driver->remove
| pci_disable_sriov
| ...
| pci_remove_bus_device(VF)
| <remove from bus_list> <-- B
|
| At B, we're changing the same list we're iterating through at A, so when
| the driver remove() method returns, the pci_stop_bus_devices() iterator has
| a pointer to a list entry that has already been freed.
Discussion thread can be found : https://lkml.org/lkml/2011/10/15/141
https://lkml.org/lkml/2012/1/23/360
-v5: According to Linus to make remove more robust, Change to
list_for_each_prev_safe instead. That is more reasonable, because
those devices are added to tail of the list before.
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2012-01-27 10:55:09 -08:00
/*
2012-02-25 13:54:20 -08:00
* VFs could be removed by pci_stop_and_remove_bus_device ( ) in the
PCI: make sriov work with hotplug remove
When hot removing a pci express module that has a pcie switch and supports
SRIOV, we got:
[ 5918.610127] pciehp 0000:80:02.2:pcie04: pcie_isr: intr_loc 1
[ 5918.615779] pciehp 0000:80:02.2:pcie04: Attention button interrupt received
[ 5918.622730] pciehp 0000:80:02.2:pcie04: Button pressed on Slot(3)
[ 5918.629002] pciehp 0000:80:02.2:pcie04: pciehp_get_power_status: SLOTCTRL a8 value read 1f9
[ 5918.637416] pciehp 0000:80:02.2:pcie04: PCI slot #3 - powering off due to button press.
[ 5918.647125] pciehp 0000:80:02.2:pcie04: pcie_isr: intr_loc 10
[ 5918.653039] pciehp 0000:80:02.2:pcie04: pciehp_green_led_blink: SLOTCTRL a8 write cmd 200
[ 5918.661229] pciehp 0000:80:02.2:pcie04: pciehp_set_attention_status: SLOTCTRL a8 write cmd c0
[ 5924.667627] pciehp 0000:80:02.2:pcie04: Disabling domain:bus:device=0000:b0:00
[ 5924.674909] pciehp 0000:80:02.2:pcie04: pciehp_get_power_status: SLOTCTRL a8 value read 2f9
[ 5924.683262] pciehp 0000:80:02.2:pcie04: pciehp_unconfigure_device: domain:bus:dev = 0000:b0:00
[ 5924.693976] libfcoe_device_notification: NETDEV_UNREGISTER eth6
[ 5924.764979] libfcoe_device_notification: NETDEV_UNREGISTER eth14
[ 5924.873539] libfcoe_device_notification: NETDEV_UNREGISTER eth15
[ 5924.995209] libfcoe_device_notification: NETDEV_UNREGISTER eth16
[ 5926.114407] sxge 0000:b2:00.0: PCI INT A disabled
[ 5926.119342] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 5926.127189] IP: [<ffffffff81353a3b>] pci_stop_bus_device+0x33/0x83
[ 5926.133377] PGD 0
[ 5926.135402] Oops: 0000 [#1] SMP
[ 5926.138659] CPU 2
[ 5926.140499] Modules linked in:
...
[ 5926.143754]
[ 5926.275823] Call Trace:
[ 5926.278267] [<ffffffff81353a38>] pci_stop_bus_device+0x30/0x83
[ 5926.284180] [<ffffffff81353af4>] pci_remove_bus_device+0x1a/0xba
[ 5926.290264] [<ffffffff81366311>] pciehp_unconfigure_device+0x110/0x17b
[ 5926.296866] [<ffffffff81365dd9>] ? pciehp_disable_slot+0x188/0x188
[ 5926.303123] [<ffffffff81365d6f>] pciehp_disable_slot+0x11e/0x188
[ 5926.309206] [<ffffffff81365e68>] pciehp_power_thread+0x8f/0xe0
...
+-[0000:80]-+-00.0-[81-8f]--
| +-01.0-[90-9f]--
| +-02.0-[a0-af]--
| +-02.2-[b0-bf]----00.0-[b1-b3]--+-02.0-[b2]--+-00.0 Device
| | | +-00.1 Device
| | | +-00.2 Device
| | | \-00.3 Device
| | \-03.0-[b3]--+-00.0 Device
| | +-00.1 Device
| | +-00.2 Device
| | \-00.3 Device
root complex: 80:02.2
pci express modules: have pcie switch and are listed as b0:00.0, b1:02.0 and b1:03.0.
end devices are b2:00.0 and b3.00.0.
VFs are: b2:00.1,... b2:00.3, and b3:00.1,...,b3:00.3
Root cause: when doing pci_stop_bus_device() with phys fn, it will stop
virt fn and remove the fn, so
list_for_each_safe(l, n, &bus->devices)
will have problem to refer freed n that is pointed to vf entry.
Solution is just replacing list_for_each_safe() with
list_for_each_prev_safe(). This will make sure we can get valid n pointer
to PF instead of the freed VF pointer (because newly added devices are
inserted to the bus->devices list tail).
During reviewing the patch, Bjorn said:
| The PCI hot-remove path calls pci_stop_bus_devices() via
| pci_remove_bus_device().
|
| pci_stop_bus_devices() traverses the bus->devices list (point A below),
| stopping each device in turn, which calls the driver remove() method. When
| the device is an SR-IOV PF, the driver calls pci_disable_sriov(), which
| also uses pci_remove_bus_device() to remove the VF devices from the
| bus->devices list (point B).
|
| pci_remove_bus_device
| pci_stop_bus_device
| pci_stop_bus_devices(subordinate)
| list_for_each(bus->devices) <-- A
| pci_stop_bus_device(PF)
| ...
| driver->remove
| pci_disable_sriov
| ...
| pci_remove_bus_device(VF)
| <remove from bus_list> <-- B
|
| At B, we're changing the same list we're iterating through at A, so when
| the driver remove() method returns, the pci_stop_bus_devices() iterator has
| a pointer to a list entry that has already been freed.
Discussion thread can be found : https://lkml.org/lkml/2011/10/15/141
https://lkml.org/lkml/2012/1/23/360
-v5: According to Linus to make remove more robust, Change to
list_for_each_prev_safe instead. That is more reasonable, because
those devices are added to tail of the list before.
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2012-01-27 10:55:09 -08:00
* pci_stop_bus_devices ( ) code path for PF .
* aka , bus - > devices get updated in the process .
* but VFs are inserted after PFs when SRIOV is enabled for PF ,
* We can iterate the list backwards to get prev valid PF instead
* of removed VF .
*/
list_for_each_prev_safe ( l , n , & bus - > devices ) {
2006-09-12 10:16:36 -07:00
struct pci_dev * dev = pci_dev_b ( l ) ;
pci_stop_bus_device ( dev ) ;
}
}
/**
* pci_stop_bus_device - stop a PCI device and any children
* @ dev : the device to stop
*
* Stop a PCI device ( detach the driver , remove from the global list
* and so on ) . This also stop any subordinate buses and children in a
* depth - first manner .
*/
void pci_stop_bus_device ( struct pci_dev * dev )
{
if ( dev - > subordinate )
pci_stop_bus_devices ( dev - > subordinate ) ;
pci_stop_dev ( dev ) ;
}
2012-02-25 13:54:20 -08:00
EXPORT_SYMBOL ( pci_stop_and_remove_bus_device ) ;
2005-04-16 15:20:36 -07:00
EXPORT_SYMBOL ( pci_remove_behind_bridge ) ;
2006-09-12 10:16:36 -07:00
EXPORT_SYMBOL_GPL ( pci_stop_bus_device ) ;