EFI changes for v5.8:
- preliminary changes for RISC-V - add support for setting the resolution on the EFI framebuffer - simplify kernel image loading for arm64 - Move .bss into .data via the linker script instead of relying on symbol annotations. - Get rid of __pure getters to access global variables - Clean up the config table matching arrays -----BEGIN PGP SIGNATURE----- iQEzBAABCgAdFiEEnNKg2mrY9zMBdeK7wjcgfpV0+n0FAl6i4QUACgkQwjcgfpV0 +n3wxwf/d8+8UD2fsx0MZtGP6SguVFVjiKo/cfaIHPQh+i4uFFqatPpdF/gFDT+M SUVG1xtxqfSbwC0XlbfRf2LIsRjwy9J6WXV1hMARMd5mV9G18FRudMyIHbYPRjJS SQds6/4uTzKpRtUUHusocKfyaH9adxE3dGuFMfI0gMPBy0jpwvmMMyULFUnnxOAX XPuB9EQHAZOPjA/QnmahKyPWKpDOtxx+t5WMejig0+IWHSLsVGzjOsFPEb41Ekqz 7lGP2o/upQ7FPsbEwANcDZcieRW2u8E4y9Yri5gJpEei6TGzo2oHgwBJROjUTRvB 8j1H7xyTO9ADNnKpClrnGpOkPoSLjw== =sg+h -----END PGP SIGNATURE----- Merge tag 'efi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi into efi/core Pull EFI changes for v5.8 from Ard Biesheuvel: "- preliminary changes for RISC-V - add support for setting the resolution on the EFI framebuffer - simplify kernel image loading for arm64 - Move .bss into .data via the linker script instead of relying on symbol annotations. - Get rid of __pure getters to access global variables - Clean up the config table matching arrays" Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
commit
4353dd3b70
@ -86,6 +86,8 @@ ForEachMacros:
|
||||
- 'bio_for_each_segment_all'
|
||||
- 'bio_list_for_each'
|
||||
- 'bip_for_each_vec'
|
||||
- 'bitmap_for_each_clear_region'
|
||||
- 'bitmap_for_each_set_region'
|
||||
- 'blkg_for_each_descendant_post'
|
||||
- 'blkg_for_each_descendant_pre'
|
||||
- 'blk_queue_for_each_rl'
|
||||
@ -115,6 +117,7 @@ ForEachMacros:
|
||||
- 'drm_client_for_each_connector_iter'
|
||||
- 'drm_client_for_each_modeset'
|
||||
- 'drm_connector_for_each_possible_encoder'
|
||||
- 'drm_for_each_bridge_in_chain'
|
||||
- 'drm_for_each_connector_iter'
|
||||
- 'drm_for_each_crtc'
|
||||
- 'drm_for_each_encoder'
|
||||
@ -136,9 +139,10 @@ ForEachMacros:
|
||||
- 'for_each_bio'
|
||||
- 'for_each_board_func_rsrc'
|
||||
- 'for_each_bvec'
|
||||
- 'for_each_card_auxs'
|
||||
- 'for_each_card_auxs_safe'
|
||||
- 'for_each_card_components'
|
||||
- 'for_each_card_links'
|
||||
- 'for_each_card_links_safe'
|
||||
- 'for_each_card_pre_auxs'
|
||||
- 'for_each_card_prelinks'
|
||||
- 'for_each_card_rtds'
|
||||
- 'for_each_card_rtds_safe'
|
||||
@ -166,6 +170,7 @@ ForEachMacros:
|
||||
- 'for_each_dpcm_fe'
|
||||
- 'for_each_drhd_unit'
|
||||
- 'for_each_dss_dev'
|
||||
- 'for_each_efi_handle'
|
||||
- 'for_each_efi_memory_desc'
|
||||
- 'for_each_efi_memory_desc_in_map'
|
||||
- 'for_each_element'
|
||||
@ -190,6 +195,7 @@ ForEachMacros:
|
||||
- 'for_each_lru'
|
||||
- 'for_each_matching_node'
|
||||
- 'for_each_matching_node_and_match'
|
||||
- 'for_each_member'
|
||||
- 'for_each_memblock'
|
||||
- 'for_each_memblock_type'
|
||||
- 'for_each_memcg_cache_index'
|
||||
@ -200,9 +206,11 @@ ForEachMacros:
|
||||
- 'for_each_msi_entry'
|
||||
- 'for_each_msi_entry_safe'
|
||||
- 'for_each_net'
|
||||
- 'for_each_net_continue_reverse'
|
||||
- 'for_each_netdev'
|
||||
- 'for_each_netdev_continue'
|
||||
- 'for_each_netdev_continue_rcu'
|
||||
- 'for_each_netdev_continue_reverse'
|
||||
- 'for_each_netdev_feature'
|
||||
- 'for_each_netdev_in_bond_rcu'
|
||||
- 'for_each_netdev_rcu'
|
||||
@ -254,10 +262,10 @@ ForEachMacros:
|
||||
- 'for_each_reserved_mem_region'
|
||||
- 'for_each_rtd_codec_dai'
|
||||
- 'for_each_rtd_codec_dai_rollback'
|
||||
- 'for_each_rtdcom'
|
||||
- 'for_each_rtdcom_safe'
|
||||
- 'for_each_rtd_components'
|
||||
- 'for_each_set_bit'
|
||||
- 'for_each_set_bit_from'
|
||||
- 'for_each_set_clump8'
|
||||
- 'for_each_sg'
|
||||
- 'for_each_sg_dma_page'
|
||||
- 'for_each_sg_page'
|
||||
@ -267,6 +275,7 @@ ForEachMacros:
|
||||
- 'for_each_subelement_id'
|
||||
- '__for_each_thread'
|
||||
- 'for_each_thread'
|
||||
- 'for_each_wakeup_source'
|
||||
- 'for_each_zone'
|
||||
- 'for_each_zone_zonelist'
|
||||
- 'for_each_zone_zonelist_nodemask'
|
||||
@ -330,6 +339,7 @@ ForEachMacros:
|
||||
- 'list_for_each'
|
||||
- 'list_for_each_codec'
|
||||
- 'list_for_each_codec_safe'
|
||||
- 'list_for_each_continue'
|
||||
- 'list_for_each_entry'
|
||||
- 'list_for_each_entry_continue'
|
||||
- 'list_for_each_entry_continue_rcu'
|
||||
@ -351,6 +361,7 @@ ForEachMacros:
|
||||
- 'llist_for_each_entry'
|
||||
- 'llist_for_each_entry_safe'
|
||||
- 'llist_for_each_safe'
|
||||
- 'mci_for_each_dimm'
|
||||
- 'media_device_for_each_entity'
|
||||
- 'media_device_for_each_intf'
|
||||
- 'media_device_for_each_link'
|
||||
@ -444,10 +455,16 @@ ForEachMacros:
|
||||
- 'virtio_device_for_each_vq'
|
||||
- 'xa_for_each'
|
||||
- 'xa_for_each_marked'
|
||||
- 'xa_for_each_range'
|
||||
- 'xa_for_each_start'
|
||||
- 'xas_for_each'
|
||||
- 'xas_for_each_conflict'
|
||||
- 'xas_for_each_marked'
|
||||
- 'xbc_array_for_each_value'
|
||||
- 'xbc_for_each_key_value'
|
||||
- 'xbc_node_for_each_array_value'
|
||||
- 'xbc_node_for_each_child'
|
||||
- 'xbc_node_for_each_key_value'
|
||||
- 'zorro_for_each_dev'
|
||||
|
||||
#IncludeBlocks: Preserve # Unknown to clang-format-5.0
|
||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# NOTE! Don't add files that are generated in specific
|
||||
# subdirectories here. Add them in the ".gitignore" file
|
||||
|
4
.mailmap
4
.mailmap
@ -210,6 +210,7 @@ Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
|
||||
Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
|
||||
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
||||
Patrick Mochel <mochel@digitalimplant.org>
|
||||
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
|
||||
@ -225,6 +226,7 @@ Pratyush Anand <pratyush.anand@gmail.com> <pratyush.anand@st.com>
|
||||
Praveen BP <praveenbp@ti.com>
|
||||
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
|
||||
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
|
||||
Quentin Monnet <quentin@isovalent.com> <quentin.monnet@netronome.com>
|
||||
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
|
||||
Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl>
|
||||
Rajesh Shah <rajesh.shah@intel.com>
|
||||
@ -243,9 +245,11 @@ Santosh Shilimkar <ssantosh@kernel.org>
|
||||
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
||||
Sascha Hauer <s.hauer@pengutronix.de>
|
||||
S.Çağlar Onur <caglar@pardus.org.tr>
|
||||
Sakari Ailus <sakari.ailus@linux.intel.com> <sakari.ailus@iki.fi>
|
||||
Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
|
||||
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
||||
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
||||
Sedat Dilek <sedat.dilek@gmail.com> <sedat.dilek@credativ.de>
|
||||
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
||||
Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
|
||||
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
|
||||
|
1
Documentation/.gitignore
vendored
1
Documentation/.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
output
|
||||
*.pyc
|
||||
|
9
Documentation/ABI/obsolete/sysfs-kernel-fadump_enabled
Normal file
9
Documentation/ABI/obsolete/sysfs-kernel-fadump_enabled
Normal file
@ -0,0 +1,9 @@
|
||||
This ABI is renamed and moved to a new location /sys/kernel/fadump/enabled.
|
||||
|
||||
What: /sys/kernel/fadump_enabled
|
||||
Date: Feb 2012
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: read only
|
||||
Primarily used to identify whether the FADump is enabled in
|
||||
the kernel or not.
|
||||
User: Kdump service
|
10
Documentation/ABI/obsolete/sysfs-kernel-fadump_registered
Normal file
10
Documentation/ABI/obsolete/sysfs-kernel-fadump_registered
Normal file
@ -0,0 +1,10 @@
|
||||
This ABI is renamed and moved to a new location /sys/kernel/fadump/registered.¬
|
||||
|
||||
What: /sys/kernel/fadump_registered
|
||||
Date: Feb 2012
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: read/write
|
||||
Helps to control the dump collect feature from userspace.
|
||||
Setting 1 to this file enables the system to collect the
|
||||
dump and 0 to disable it.
|
||||
User: Kdump service
|
10
Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem
Normal file
10
Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem
Normal file
@ -0,0 +1,10 @@
|
||||
This ABI is renamed and moved to a new location /sys/kernel/fadump/release_mem.¬
|
||||
|
||||
What: /sys/kernel/fadump_release_mem
|
||||
Date: Feb 2012
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: write only
|
||||
This is a special sysfs file and only available when
|
||||
the system is booted to capture the vmcore using FADump.
|
||||
It is used to release the memory reserved by FADump to
|
||||
save the crash dump.
|
23
Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
Normal file
23
Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
Normal file
@ -0,0 +1,23 @@
|
||||
What: /sys/fs/selinux/checkreqprot
|
||||
Date: April 2005 (predates git)
|
||||
KernelVersion: 2.6.12-rc2 (predates git)
|
||||
Contact: selinux@vger.kernel.org
|
||||
Description:
|
||||
|
||||
The selinuxfs "checkreqprot" node allows SELinux to be configured
|
||||
to check the protection requested by userspace for mmap/mprotect
|
||||
calls instead of the actual protection applied by the kernel.
|
||||
This was a compatibility mechanism for legacy userspace and
|
||||
for the READ_IMPLIES_EXEC personality flag. However, if set to
|
||||
1, it weakens security by allowing mappings to be made executable
|
||||
without authorization by policy. The default value of checkreqprot
|
||||
at boot was changed starting in Linux v4.4 to 0 (i.e. check the
|
||||
actual protection), and Android and Linux distributions have been
|
||||
explicitly writing a "0" to /sys/fs/selinux/checkreqprot during
|
||||
initialization for some time. Support for setting checkreqprot to 1
|
||||
will be removed in a future kernel release, at which point the kernel
|
||||
will always cease using checkreqprot internally and will always
|
||||
check the actual protections being applied upon mmap/mprotect calls.
|
||||
The checkreqprot selinuxfs node will remain for backward compatibility
|
||||
but will discard writes of the "0" value and will reject writes of the
|
||||
"1" value when this mechanism is removed.
|
@ -0,0 +1,9 @@
|
||||
This ABI is moved to /sys/firmware/opal/mpipl/release_core.
|
||||
|
||||
What: /sys/kernel/fadump_release_opalcore
|
||||
Date: Sep 2019
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: write only
|
||||
The sysfs file is available when the system is booted to
|
||||
collect the dump on OPAL based machine. It used to release
|
||||
the memory used to collect the opalcore.
|
@ -1,5 +1,5 @@
|
||||
What: /sys/kernel/uids/<uid>/cpu_shares
|
||||
Date: December 2007
|
||||
Date: December 2007, finally removed in kernel v2.6.34-rc1
|
||||
Contact: Dhaval Giani <dhaval@linux.vnet.ibm.com>
|
||||
Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
|
||||
Description:
|
@ -194,11 +194,3 @@ Description:
|
||||
|
||||
destroy_link write '1' to this attribute to destroy an
|
||||
active link
|
||||
|
||||
What: /sys/kernel/config/rdma_cm/<hca>/ports/<port-num>/default_roce_tos
|
||||
Date: March 8, 2019
|
||||
KernelVersion: 5.2
|
||||
Description: RDMA-CM QPs from HCA <hca> at port <port-num>
|
||||
will be created with this TOS as default.
|
||||
This can be overridden by using the rdma_set_option API.
|
||||
The possible RoCE TOS values are 0-255.
|
@ -43,6 +43,20 @@ Description: Allows the root user to read or write directly through the
|
||||
If the IOMMU is disabled, it also allows the root user to read
|
||||
or write from the host a device VA of a host mapped memory
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/data64
|
||||
Date: Jan 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the root user to read or write 64 bit data directly
|
||||
through the device's PCI bar. Writing to this file generates a
|
||||
write transaction while reading from the file generates a read
|
||||
transaction. This custom interface is needed (instead of using
|
||||
the generic Linux user-space PCI mapping) because the DDR bar
|
||||
is very small compared to the DDR memory and only the driver can
|
||||
move the bar before and after the transaction.
|
||||
If the IOMMU is disabled, it also allows the root user to read
|
||||
or write from the host a device VA of a host mapped memory
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/device
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
|
241
Documentation/ABI/testing/sysfs-bus-coresight-devices-cti
Normal file
241
Documentation/ABI/testing/sysfs-bus-coresight-devices-cti
Normal file
@ -0,0 +1,241 @@
|
||||
What: /sys/bus/coresight/devices/<cti-name>/enable
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Enable/Disable the CTI hardware.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/powered
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Indicate if the CTI hardware is powered.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/ctmid
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Display the associated CTM ID
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/nr_trigger_cons
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Number of devices connected to triggers on this CTI
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/name
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Name of connected device <N>
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/in_signals
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Input trigger signals from connected device <N>
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/in_types
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Functional types for the input trigger signals
|
||||
from connected device <N>
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/out_signals
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Output trigger signals to connected device <N>
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/out_types
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Functional types for the output trigger signals
|
||||
to connected device <N>
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/inout_sel
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Select the index for inen and outen registers.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/inen
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Read or write the CTIINEN register selected by inout_sel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/outen
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Read or write the CTIOUTEN register selected by inout_sel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/gate
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Read or write CTIGATE register.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/asicctl
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Read or write ASICCTL register.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/intack
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Write the INTACK register.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/appset
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Set CTIAPPSET register to activate channel. Read back to
|
||||
determine current value of register.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/appclear
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Write APPCLEAR register to deactivate channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/apppulse
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Write APPPULSE to pulse a channel active for one clock
|
||||
cycle.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/chinstatus
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Read current status of channel inputs.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/choutstatus
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) read current status of channel outputs.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/triginstatus
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) read current status of input trigger signals
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/regs/trigoutstatus
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) read current status of output trigger signals.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/trigin_attach
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Attach a CTI input trigger to a CTM channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/trigin_detach
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Detach a CTI input trigger from a CTM channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_attach
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Attach a CTI output trigger to a CTM channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_detach
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Detach a CTI output trigger from a CTM channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_gate_enable
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Enable CTIGATE for single channel (W) or list enabled
|
||||
channels through the gate (R).
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_gate_disable
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Disable CTIGATE for single channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_set
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Activate a single channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_clear
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Deactivate a single channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_pulse
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Pulse a single channel - activate for a single clock cycle.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_filtered
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) List of output triggers filtered across all connections.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/trig_filter_enable
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Enable or disable trigger output signal filtering.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_inuse
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) show channels with at least one attached trigger signal.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_free
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) show channels with no attached trigger signals.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_sel
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (RW) Write channel number to select a channel to view, read to
|
||||
see selected channel number.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_in
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Read to see input triggers connected to selected view
|
||||
channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_out
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (R) Read to see output triggers connected to selected view
|
||||
channel.
|
||||
|
||||
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_reset
|
||||
Date: March 2020
|
||||
KernelVersion 5.7
|
||||
Contact: Mike Leach or Mathieu Poirier
|
||||
Description: (W) Clear all channel / trigger programming.
|
@ -1,3 +1,28 @@
|
||||
What: /sys/bus/counter/devices/counterX/signalY/cable_fault
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates whether a differential
|
||||
encoder cable fault (not connected or loose wires) is detected
|
||||
for the respective channel of Signal Y. Valid attribute values
|
||||
are boolean. Detection must first be enabled via the
|
||||
corresponding cable_fault_enable attribute.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/cable_fault_enable
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Whether detection of differential encoder cable faults for the
|
||||
respective channel of Signal Y is enabled. Valid attribute
|
||||
values are boolean.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/filter_clock_prescaler
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Filter clock factor for input Signal Y. This prescaler value
|
||||
affects the inputs of both quadrature pair signals.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/index_polarity
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
|
@ -2,17 +2,22 @@ What: /sys/bus/iio/devices/iio:deviceX/ac_excitation_en
|
||||
KernelVersion:
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Reading gives the state of AC excitation.
|
||||
Writing '1' enables AC excitation.
|
||||
This attribute, if available, is used to enable the AC
|
||||
excitation mode found on some converters. In ac excitation mode,
|
||||
the polarity of the excitation voltage is reversed on
|
||||
alternate cycles, to eliminate DC errors.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/bridge_switch_en
|
||||
KernelVersion:
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This bridge switch is used to disconnect it when there is a
|
||||
need to minimize the system current consumption.
|
||||
Reading gives the state of the bridge switch.
|
||||
Writing '1' enables the bridge switch.
|
||||
This attribute, if available, is used to close or open the
|
||||
bridge power down switch found on some converters.
|
||||
In bridge applications, such as strain gauges and load cells,
|
||||
the bridge itself consumes the majority of the current in the
|
||||
system. To minimize the current consumption of the system,
|
||||
the bridge can be disconnected (when it is not being used
|
||||
using the bridge_switch_en attribute.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration
|
||||
KernelVersion:
|
||||
@ -21,6 +26,13 @@ Description:
|
||||
Initiates the system calibration procedure. This is done on a
|
||||
single channel at a time. Write '1' to start the calibration.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_voltage2-voltage2_shorted_raw
|
||||
KernelVersion:
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Measure voltage from AIN2 pin connected to AIN(+)
|
||||
and AIN(-) shorted.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration_mode_available
|
||||
KernelVersion:
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
|
@ -40,3 +40,11 @@ Description: (RW) Trigger window switch for the MSC's buffer, in
|
||||
triggering a window switch for the buffer. Returns an error in any
|
||||
other operating mode or attempts to write something other than "1".
|
||||
|
||||
What: /sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/stop_on_full
|
||||
Date: March 2020
|
||||
KernelVersion: 5.7
|
||||
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||
Description: (RW) Configure whether trace stops when the last available window
|
||||
becomes full (1/y/Y) or wraps around and continues until the next
|
||||
window becomes available again (0/n/N).
|
||||
|
||||
|
@ -5,7 +5,7 @@ Contact: Christian Gromm <christian.gromm@microchip.com>
|
||||
Description:
|
||||
Provides information about the interface type and the physical
|
||||
location of the device. Hardware attached via USB, for instance,
|
||||
might return <usb_device 1-1.1:1.0>
|
||||
might return <1-1.1:1.0>
|
||||
Users:
|
||||
|
||||
What: /sys/bus/most/devices/.../interface
|
||||
@ -278,25 +278,7 @@ Description:
|
||||
Indicates whether current channel ran out of buffers.
|
||||
Users:
|
||||
|
||||
What: /sys/bus/most/drivers/mostcore/add_link
|
||||
Date: March 2017
|
||||
KernelVersion: 4.15
|
||||
Contact: Christian Gromm <christian.gromm@microchip.com>
|
||||
Description:
|
||||
This is used to link a channel to a component of the
|
||||
mostcore. A link created by writing to this file is
|
||||
referred to as pipe.
|
||||
Users:
|
||||
|
||||
What: /sys/bus/most/drivers/mostcore/remove_link
|
||||
Date: March 2017
|
||||
KernelVersion: 4.15
|
||||
Contact: Christian Gromm <christian.gromm@microchip.com>
|
||||
Description:
|
||||
This is used to unlink a channel from a component.
|
||||
Users:
|
||||
|
||||
What: /sys/bus/most/drivers/mostcore/components
|
||||
What: /sys/bus/most/drivers/most_core/components
|
||||
Date: March 2017
|
||||
KernelVersion: 4.15
|
||||
Contact: Christian Gromm <christian.gromm@microchip.com>
|
||||
@ -304,7 +286,7 @@ Description:
|
||||
This is used to retrieve a list of registered components.
|
||||
Users:
|
||||
|
||||
What: /sys/bus/most/drivers/mostcore/links
|
||||
What: /sys/bus/most/drivers/most_core/links
|
||||
Date: March 2017
|
||||
KernelVersion: 4.15
|
||||
Contact: Christian Gromm <christian.gromm@microchip.com>
|
@ -20,13 +20,13 @@ Date: April 2017
|
||||
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||
Description:
|
||||
The supported power roles. This attribute can be used to request
|
||||
power role swap on the port when the port supports USB Power
|
||||
Delivery. Swapping is supported as synchronous operation, so
|
||||
write(2) to the attribute will not return until the operation
|
||||
has finished. The attribute is notified about role changes so
|
||||
that poll(2) on the attribute wakes up. Change on the role will
|
||||
also generate uevent KOBJ_CHANGE. The current role is show in
|
||||
brackets, for example "[source] sink" when in source mode.
|
||||
power role swap on the port. Swapping is supported as
|
||||
synchronous operation, so write(2) to the attribute will not
|
||||
return until the operation has finished. The attribute is
|
||||
notified about role changes so that poll(2) on the attribute
|
||||
wakes up. Change on the role will also generate uevent
|
||||
KOBJ_CHANGE. The current role is show in brackets, for example
|
||||
"[source] sink" when in source mode.
|
||||
|
||||
Valid values: source, sink
|
||||
|
||||
@ -108,6 +108,15 @@ Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||
Description:
|
||||
Revision number of the supported USB Type-C specification.
|
||||
|
||||
What: /sys/class/typec/<port>/orientation
|
||||
Date: February 2020
|
||||
Contact: Badhri Jagan Sridharan <badhri@google.com>
|
||||
Description:
|
||||
Indicates the active orientation of the Type-C connector.
|
||||
Valid values:
|
||||
- "normal": CC1 orientation
|
||||
- "reverse": CC2 orientation
|
||||
- "unknown": Orientation cannot be determined.
|
||||
|
||||
USB Type-C partner devices (eg. /sys/class/typec/port0-partner/)
|
||||
|
||||
|
16
Documentation/ABI/testing/sysfs-driver-jz4780-efuse
Normal file
16
Documentation/ABI/testing/sysfs-driver-jz4780-efuse
Normal file
@ -0,0 +1,16 @@
|
||||
What: /sys/devices/*/<our-device>/nvmem
|
||||
Date: December 2017
|
||||
Contact: PrasannaKumar Muralidharan <prasannatsmkumar@gmail.com>
|
||||
Description: read-only access to the efuse on the Ingenic JZ4780 SoC
|
||||
The SoC has a one time programmable 8K efuse that is
|
||||
split into segments. The driver supports read only.
|
||||
The segments are
|
||||
0x000 64 bit Random Number
|
||||
0x008 128 bit Ingenic Chip ID
|
||||
0x018 128 bit Customer ID
|
||||
0x028 3520 bit Reserved
|
||||
0x1E0 8 bit Protect Segment
|
||||
0x1E1 2296 bit HDMI Key
|
||||
0x300 2048 bit Security boot key
|
||||
Users: any user space application which wants to read the Chip
|
||||
and Customer ID
|
39
Documentation/ABI/testing/sysfs-driver-uacce
Normal file
39
Documentation/ABI/testing/sysfs-driver-uacce
Normal file
@ -0,0 +1,39 @@
|
||||
What: /sys/class/uacce/<dev_name>/api
|
||||
Date: Feb 2020
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: Api of the device
|
||||
Can be any string and up to userspace to parse.
|
||||
Application use the api to match the correct driver
|
||||
|
||||
What: /sys/class/uacce/<dev_name>/flags
|
||||
Date: Feb 2020
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: Attributes of the device, see UACCE_DEV_xxx flag defined in uacce.h
|
||||
|
||||
What: /sys/class/uacce/<dev_name>/available_instances
|
||||
Date: Feb 2020
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: Available instances left of the device
|
||||
Return -ENODEV if uacce_ops get_available_instances is not provided
|
||||
|
||||
What: /sys/class/uacce/<dev_name>/algorithms
|
||||
Date: Feb 2020
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: Algorithms supported by this accelerator, separated by new line.
|
||||
Can be any string and up to userspace to parse.
|
||||
|
||||
What: /sys/class/uacce/<dev_name>/region_mmio_size
|
||||
Date: Feb 2020
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: Size (bytes) of mmio region queue file
|
||||
|
||||
What: /sys/class/uacce/<dev_name>/region_dus_size
|
||||
Date: Feb 2020
|
||||
KernelVersion: 5.7
|
||||
Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: Size (bytes) of dus region queue file
|
21
Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups
Normal file
21
Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups
Normal file
@ -0,0 +1,21 @@
|
||||
What: /sys/firmware/opal/sensor_groups
|
||||
Date: August 2017
|
||||
Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
|
||||
Description: Sensor groups directory for POWER9 powernv servers
|
||||
|
||||
Each folder in this directory contains a sensor group
|
||||
which are classified based on type of the sensor
|
||||
like power, temperature, frequency, current, etc. They
|
||||
can also indicate the group of sensors belonging to
|
||||
different owners like CSM, Profiler, Job-Scheduler
|
||||
|
||||
What: /sys/firmware/opal/sensor_groups/<sensor_group_name>/clear
|
||||
Date: August 2017
|
||||
Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
|
||||
Description: Sysfs file to clear the min-max of all the sensors
|
||||
belonging to the group.
|
||||
|
||||
Writing 1 to this file will clear the minimum and
|
||||
maximum values of all the sensors in the group.
|
||||
In POWER9, the min-max of a sensor is the historical minimum
|
||||
and maximum value of the sensor cached by OCC.
|
@ -318,3 +318,8 @@ Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Average number of valid blocks.
|
||||
Available when CONFIG_F2FS_STAT_FS=y.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/mounted_time_sec
|
||||
Date: February 2020
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description: Show the mounted time in secs of this partition.
|
||||
|
40
Documentation/ABI/testing/sysfs-kernel-fadump
Normal file
40
Documentation/ABI/testing/sysfs-kernel-fadump
Normal file
@ -0,0 +1,40 @@
|
||||
What: /sys/kernel/fadump/*
|
||||
Date: Dec 2019
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description:
|
||||
The /sys/kernel/fadump/* is a collection of FADump sysfs
|
||||
file provide information about the configuration status
|
||||
of Firmware Assisted Dump (FADump).
|
||||
|
||||
What: /sys/kernel/fadump/enabled
|
||||
Date: Dec 2019
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: read only
|
||||
Primarily used to identify whether the FADump is enabled in
|
||||
the kernel or not.
|
||||
User: Kdump service
|
||||
|
||||
What: /sys/kernel/fadump/registered
|
||||
Date: Dec 2019
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: read/write
|
||||
Helps to control the dump collect feature from userspace.
|
||||
Setting 1 to this file enables the system to collect the
|
||||
dump and 0 to disable it.
|
||||
User: Kdump service
|
||||
|
||||
What: /sys/kernel/fadump/release_mem
|
||||
Date: Dec 2019
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: write only
|
||||
This is a special sysfs file and only available when
|
||||
the system is booted to capture the vmcore using FADump.
|
||||
It is used to release the memory reserved by FADump to
|
||||
save the crash dump.
|
||||
|
||||
What: /sys/kernel/fadump/mem_reserved
|
||||
Date: Dec 2019
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: read only
|
||||
Provide information about the amount of memory reserved by
|
||||
FADump to save the crash dump in bytes.
|
@ -2,7 +2,7 @@ What: /sys/class/leds/dell::kbd_backlight/als_enabled
|
||||
Date: December 2014
|
||||
KernelVersion: 3.19
|
||||
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
||||
Pali Rohár <pali.rohar@gmail.com>
|
||||
Pali Rohár <pali@kernel.org>
|
||||
Description:
|
||||
This file allows to control the automatic keyboard
|
||||
illumination mode on some systems that have an ambient
|
||||
@ -13,7 +13,7 @@ What: /sys/class/leds/dell::kbd_backlight/als_setting
|
||||
Date: December 2014
|
||||
KernelVersion: 3.19
|
||||
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
||||
Pali Rohár <pali.rohar@gmail.com>
|
||||
Pali Rohár <pali@kernel.org>
|
||||
Description:
|
||||
This file allows to specifiy the on/off threshold value,
|
||||
as reported by the ambient light sensor.
|
||||
@ -22,7 +22,7 @@ What: /sys/class/leds/dell::kbd_backlight/start_triggers
|
||||
Date: December 2014
|
||||
KernelVersion: 3.19
|
||||
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
||||
Pali Rohár <pali.rohar@gmail.com>
|
||||
Pali Rohár <pali@kernel.org>
|
||||
Description:
|
||||
This file allows to control the input triggers that
|
||||
turn on the keyboard backlight illumination that is
|
||||
@ -45,7 +45,7 @@ What: /sys/class/leds/dell::kbd_backlight/stop_timeout
|
||||
Date: December 2014
|
||||
KernelVersion: 3.19
|
||||
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
||||
Pali Rohár <pali.rohar@gmail.com>
|
||||
Pali Rohár <pali@kernel.org>
|
||||
Description:
|
||||
This file allows to specify the interval after which the
|
||||
keyboard illumination is disabled because of inactivity.
|
||||
|
@ -154,3 +154,10 @@ Description:
|
||||
device specification. For example, when user sets 7bytes on
|
||||
16550A, which has 1/4/8/14 bytes trigger, the RX trigger is
|
||||
automatically changed to 4 bytes.
|
||||
|
||||
What: /sys/class/tty/ttyS0/console
|
||||
Date: February 2020
|
||||
Contact: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
|
||||
Description:
|
||||
Allows user to detach or attach back the given device as
|
||||
kernel console. It shows and accepts a boolean variable.
|
||||
|
@ -2,7 +2,8 @@
|
||||
# Makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
subdir-y := devicetree/bindings/
|
||||
# for cleaning
|
||||
subdir- := devicetree/bindings
|
||||
|
||||
# Check for broken documentation file references
|
||||
ifeq ($(CONFIG_WARN_MISSING_DOCUMENTS),y)
|
||||
@ -13,7 +14,7 @@ endif
|
||||
SPHINXBUILD = sphinx-build
|
||||
SPHINXOPTS =
|
||||
SPHINXDIRS = .
|
||||
_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/index.rst,%,$(wildcard $(srctree)/Documentation/*/index.rst))
|
||||
_SPHINXDIRS = $(sort $(patsubst $(srctree)/Documentation/%/index.rst,%,$(wildcard $(srctree)/Documentation/*/index.rst)))
|
||||
SPHINX_CONF = conf.py
|
||||
PAPER =
|
||||
BUILDDIR = $(obj)/output
|
||||
|
155
Documentation/PCI/boot-interrupts.rst
Normal file
155
Documentation/PCI/boot-interrupts.rst
Normal file
@ -0,0 +1,155 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===============
|
||||
Boot Interrupts
|
||||
===============
|
||||
|
||||
:Author: - Sean V Kelley <sean.v.kelley@linux.intel.com>
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
On PCI Express, interrupts are represented with either MSI or inbound
|
||||
interrupt messages (Assert_INTx/Deassert_INTx). The integrated IO-APIC in a
|
||||
given Core IO converts the legacy interrupt messages from PCI Express to
|
||||
MSI interrupts. If the IO-APIC is disabled (via the mask bits in the
|
||||
IO-APIC table entries), the messages are routed to the legacy PCH. This
|
||||
in-band interrupt mechanism was traditionally necessary for systems that
|
||||
did not support the IO-APIC and for boot. Intel in the past has used the
|
||||
term "boot interrupts" to describe this mechanism. Further, the PCI Express
|
||||
protocol describes this in-band legacy wire-interrupt INTx mechanism for
|
||||
I/O devices to signal PCI-style level interrupts. The subsequent paragraphs
|
||||
describe problems with the Core IO handling of INTx message routing to the
|
||||
PCH and mitigation within BIOS and the OS.
|
||||
|
||||
|
||||
Issue
|
||||
=====
|
||||
|
||||
When in-band legacy INTx messages are forwarded to the PCH, they in turn
|
||||
trigger a new interrupt for which the OS likely lacks a handler. When an
|
||||
interrupt goes unhandled over time, they are tracked by the Linux kernel as
|
||||
Spurious Interrupts. The IRQ will be disabled by the Linux kernel after it
|
||||
reaches a specific count with the error "nobody cared". This disabled IRQ
|
||||
now prevents valid usage by an existing interrupt which may happen to share
|
||||
the IRQ line.
|
||||
|
||||
irq 19: nobody cared (try booting with the "irqpoll" option)
|
||||
CPU: 0 PID: 2988 Comm: irq/34-nipalk Tainted: 4.14.87-rt49-02410-g4a640ec-dirty #1
|
||||
Hardware name: National Instruments NI PXIe-8880/NI PXIe-8880, BIOS 2.1.5f1 01/09/2020
|
||||
Call Trace:
|
||||
<IRQ>
|
||||
? dump_stack+0x46/0x5e
|
||||
? __report_bad_irq+0x2e/0xb0
|
||||
? note_interrupt+0x242/0x290
|
||||
? nNIKAL100_memoryRead16+0x8/0x10 [nikal]
|
||||
? handle_irq_event_percpu+0x55/0x70
|
||||
? handle_irq_event+0x4f/0x80
|
||||
? handle_fasteoi_irq+0x81/0x180
|
||||
? handle_irq+0x1c/0x30
|
||||
? do_IRQ+0x41/0xd0
|
||||
? common_interrupt+0x84/0x84
|
||||
</IRQ>
|
||||
|
||||
handlers:
|
||||
irq_default_primary_handler threaded usb_hcd_irq
|
||||
Disabling IRQ #19
|
||||
|
||||
|
||||
Conditions
|
||||
==========
|
||||
|
||||
The use of threaded interrupts is the most likely condition to trigger
|
||||
this problem today. Threaded interrupts may not be reenabled after the IRQ
|
||||
handler wakes. These "one shot" conditions mean that the threaded interrupt
|
||||
needs to keep the interrupt line masked until the threaded handler has run.
|
||||
Especially when dealing with high data rate interrupts, the thread needs to
|
||||
run to completion; otherwise some handlers will end up in stack overflows
|
||||
since the interrupt of the issuing device is still active.
|
||||
|
||||
Affected Chipsets
|
||||
=================
|
||||
|
||||
The legacy interrupt forwarding mechanism exists today in a number of
|
||||
devices including but not limited to chipsets from AMD/ATI, Broadcom, and
|
||||
Intel. Changes made through the mitigations below have been applied to
|
||||
drivers/pci/quirks.c
|
||||
|
||||
Starting with ICX there are no longer any IO-APICs in the Core IO's
|
||||
devices. IO-APIC is only in the PCH. Devices connected to the Core IO's
|
||||
PCIe Root Ports will use native MSI/MSI-X mechanisms.
|
||||
|
||||
Mitigations
|
||||
===========
|
||||
|
||||
The mitigations take the form of PCI quirks. The preference has been to
|
||||
first identify and make use of a means to disable the routing to the PCH.
|
||||
In such a case a quirk to disable boot interrupt generation can be
|
||||
added.[1]
|
||||
|
||||
Intel® 6300ESB I/O Controller Hub
|
||||
Alternate Base Address Register:
|
||||
BIE: Boot Interrupt Enable
|
||||
0 = Boot interrupt is enabled.
|
||||
1 = Boot interrupt is disabled.
|
||||
|
||||
Intel® Sandy Bridge through Sky Lake based Xeon servers:
|
||||
Coherent Interface Protocol Interrupt Control
|
||||
dis_intx_route2pch/dis_intx_route2ich/dis_intx_route2dmi2:
|
||||
When this bit is set. Local INTx messages received from the
|
||||
Intel® Quick Data DMA/PCI Express ports are not routed to legacy
|
||||
PCH - they are either converted into MSI via the integrated IO-APIC
|
||||
(if the IO-APIC mask bit is clear in the appropriate entries)
|
||||
or cause no further action (when mask bit is set)
|
||||
|
||||
In the absence of a way to directly disable the routing, another approach
|
||||
has been to make use of PCI Interrupt pin to INTx routing tables for
|
||||
purposes of redirecting the interrupt handler to the rerouted interrupt
|
||||
line by default. Therefore, on chipsets where this INTx routing cannot be
|
||||
disabled, the Linux kernel will reroute the valid interrupt to its legacy
|
||||
interrupt. This redirection of the handler will prevent the occurrence of
|
||||
the spurious interrupt detection which would ordinarily disable the IRQ
|
||||
line due to excessive unhandled counts.[2]
|
||||
|
||||
The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS exists to enable (or
|
||||
disable) the redirection of the interrupt handler to the PCH interrupt
|
||||
line. The option can be overridden by either pci=ioapicreroute or
|
||||
pci=noioapicreroute.[3]
|
||||
|
||||
|
||||
More Documentation
|
||||
==================
|
||||
|
||||
There is an overview of the legacy interrupt handling in several datasheets
|
||||
(6300ESB and 6700PXH below). While largely the same, it provides insight
|
||||
into the evolution of its handling with chipsets.
|
||||
|
||||
Example of disabling of the boot interrupt
|
||||
------------------------------------------
|
||||
|
||||
Intel® 6300ESB I/O Controller Hub (Document # 300641-004US)
|
||||
5.7.3 Boot Interrupt
|
||||
https://www.intel.com/content/dam/doc/datasheet/6300esb-io-controller-hub-datasheet.pdf
|
||||
|
||||
Intel® Xeon® Processor E5-1600/2400/2600/4600 v3 Product Families
|
||||
Datasheet - Volume 2: Registers (Document # 330784-003)
|
||||
6.6.41 cipintrc Coherent Interface Protocol Interrupt Control
|
||||
https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf
|
||||
|
||||
Example of handler rerouting
|
||||
----------------------------
|
||||
|
||||
Intel® 6700PXH 64-bit PCI Hub (Document # 302628)
|
||||
2.15.2 PCI Express Legacy INTx Support and Boot Interrupt
|
||||
https://www.intel.com/content/dam/doc/datasheet/6700pxh-64-bit-pci-hub-datasheet.pdf
|
||||
|
||||
|
||||
If you have any legacy PCI interrupt questions that aren't answered, email me.
|
||||
|
||||
Cheers,
|
||||
Sean V Kelley
|
||||
sean.v.kelley@linux.intel.com
|
||||
|
||||
[1] https://lore.kernel.org/r/12131949181903-git-send-email-sassmann@suse.de/
|
||||
[2] https://lore.kernel.org/r/12131949182094-git-send-email-sassmann@suse.de/
|
||||
[3] https://lore.kernel.org/r/487C8EA7.6020205@suse.de/
|
@ -16,3 +16,4 @@ Linux PCI Bus Subsystem
|
||||
pci-error-recovery
|
||||
pcieaer-howto
|
||||
endpoint/index
|
||||
boot-interrupts
|
||||
|
@ -239,7 +239,7 @@ from the PCI device config space. Use the values in the pci_dev structure
|
||||
as the PCI "bus address" might have been remapped to a "host physical"
|
||||
address by the arch/chip-set specific kernel support.
|
||||
|
||||
See Documentation/io-mapping.txt for how to access device registers
|
||||
See Documentation/driver-api/io-mapping.rst for how to access device registers
|
||||
or device memory.
|
||||
|
||||
The device driver needs to call pci_request_region() to verify
|
||||
|
@ -156,12 +156,6 @@ default reset_link function, but different upstream ports might
|
||||
have different specifications to reset pci express link, so all
|
||||
upstream ports should provide their own reset_link functions.
|
||||
|
||||
In struct pcie_port_service_driver, a new pointer, reset_link, is
|
||||
added.
|
||||
::
|
||||
|
||||
pci_ers_result_t (*reset_link) (struct pci_dev *dev);
|
||||
|
||||
Section 3.2.2.2 provides more detailed info on when to call
|
||||
reset_link.
|
||||
|
||||
@ -212,15 +206,10 @@ error_detected(dev, pci_channel_io_frozen) to all drivers within
|
||||
a hierarchy in question. Then, performing link reset at upstream is
|
||||
necessary. As different kinds of devices might use different approaches
|
||||
to reset link, AER port service driver is required to provide the
|
||||
function to reset link. Firstly, kernel looks for if the upstream
|
||||
component has an aer driver. If it has, kernel uses the reset_link
|
||||
callback of the aer driver. If the upstream component has no aer driver
|
||||
and the port is downstream port, we will perform a hot reset as the
|
||||
default by setting the Secondary Bus Reset bit of the Bridge Control
|
||||
register associated with the downstream port. As for upstream ports,
|
||||
they should provide their own aer service drivers with reset_link
|
||||
function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
|
||||
reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
|
||||
function to reset link via callback parameter of pcie_do_recovery()
|
||||
function. If reset_link is not NULL, recovery function will use it
|
||||
to reset the link. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER
|
||||
and reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
|
||||
to mmio_enabled.
|
||||
|
||||
helper functions
|
||||
@ -243,9 +232,9 @@ messages to root port when an error is detected.
|
||||
|
||||
::
|
||||
|
||||
int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);`
|
||||
int pci_aer_clear_nonfatal_status(struct pci_dev *dev);`
|
||||
|
||||
pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable
|
||||
pci_aer_clear_nonfatal_status clears non-fatal errors in the uncorrectable
|
||||
error status register.
|
||||
|
||||
Frequent Asked Questions
|
||||
|
@ -4,7 +4,7 @@ A Tour Through TREE_RCU's Grace-Period Memory Ordering
|
||||
|
||||
August 8, 2017
|
||||
|
||||
This article was contributed by Paul E. McKenney
|
||||
This article was contributed by Paul E. McKenney
|
||||
|
||||
Introduction
|
||||
============
|
||||
@ -48,7 +48,7 @@ Tree RCU Grace Period Memory Ordering Building Blocks
|
||||
|
||||
The workhorse for RCU's grace-period memory ordering is the
|
||||
critical section for the ``rcu_node`` structure's
|
||||
``->lock``. These critical sections use helper functions for lock
|
||||
``->lock``. These critical sections use helper functions for lock
|
||||
acquisition, including ``raw_spin_lock_rcu_node()``,
|
||||
``raw_spin_lock_irq_rcu_node()``, and ``raw_spin_lock_irqsave_rcu_node()``.
|
||||
Their lock-release counterparts are ``raw_spin_unlock_rcu_node()``,
|
||||
@ -102,9 +102,9 @@ lock-acquisition and lock-release functions::
|
||||
23 r3 = READ_ONCE(x);
|
||||
24 }
|
||||
25
|
||||
26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0);
|
||||
26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0);
|
||||
|
||||
The ``WARN_ON()`` is evaluated at “the end of time”,
|
||||
The ``WARN_ON()`` is evaluated at "the end of time",
|
||||
after all changes have propagated throughout the system.
|
||||
Without the ``smp_mb__after_unlock_lock()`` provided by the
|
||||
acquisition functions, this ``WARN_ON()`` could trigger, for example
|
||||
|
@ -4,12 +4,61 @@ Using RCU to Protect Read-Mostly Linked Lists
|
||||
=============================================
|
||||
|
||||
One of the best applications of RCU is to protect read-mostly linked lists
|
||||
("struct list_head" in list.h). One big advantage of this approach
|
||||
(``struct list_head`` in list.h). One big advantage of this approach
|
||||
is that all of the required memory barriers are included for you in
|
||||
the list macros. This document describes several applications of RCU,
|
||||
with the best fits first.
|
||||
|
||||
Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
|
||||
|
||||
Example 1: Read-mostly list: Deferred Destruction
|
||||
-------------------------------------------------
|
||||
|
||||
A widely used usecase for RCU lists in the kernel is lockless iteration over
|
||||
all processes in the system. ``task_struct::tasks`` represents the list node that
|
||||
links all the processes. The list can be traversed in parallel to any list
|
||||
additions or removals.
|
||||
|
||||
The traversal of the list is done using ``for_each_process()`` which is defined
|
||||
by the 2 macros::
|
||||
|
||||
#define next_task(p) \
|
||||
list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
|
||||
|
||||
#define for_each_process(p) \
|
||||
for (p = &init_task ; (p = next_task(p)) != &init_task ; )
|
||||
|
||||
The code traversing the list of all processes typically looks like::
|
||||
|
||||
rcu_read_lock();
|
||||
for_each_process(p) {
|
||||
/* Do something with p */
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
The simplified code for removing a process from a task list is::
|
||||
|
||||
void release_task(struct task_struct *p)
|
||||
{
|
||||
write_lock(&tasklist_lock);
|
||||
list_del_rcu(&p->tasks);
|
||||
write_unlock(&tasklist_lock);
|
||||
call_rcu(&p->rcu, delayed_put_task_struct);
|
||||
}
|
||||
|
||||
When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)`` under
|
||||
``tasklist_lock`` writer lock protection, to remove the task from the list of
|
||||
all tasks. The ``tasklist_lock`` prevents concurrent list additions/removals
|
||||
from corrupting the list. Readers using ``for_each_process()`` are not protected
|
||||
with the ``tasklist_lock``. To prevent readers from noticing changes in the list
|
||||
pointers, the ``task_struct`` object is freed only after one or more grace
|
||||
periods elapse (with the help of call_rcu()). This deferring of destruction
|
||||
ensures that any readers traversing the list will see valid ``p->tasks.next``
|
||||
pointers and deletion/freeing can happen in parallel with traversal of the list.
|
||||
This pattern is also called an **existence lock**, since RCU pins the object in
|
||||
memory until all existing readers finish.
|
||||
|
||||
|
||||
Example 2: Read-Side Action Taken Outside of Lock: No In-Place Updates
|
||||
----------------------------------------------------------------------
|
||||
|
||||
The best applications are cases where, if reader-writer locking were
|
||||
@ -26,7 +75,7 @@ added or deleted, rather than being modified in place.
|
||||
|
||||
A straightforward example of this use of RCU may be found in the
|
||||
system-call auditing support. For example, a reader-writer locked
|
||||
implementation of audit_filter_task() might be as follows::
|
||||
implementation of ``audit_filter_task()`` might be as follows::
|
||||
|
||||
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
||||
{
|
||||
@ -34,7 +83,7 @@ implementation of audit_filter_task() might be as follows::
|
||||
enum audit_state state;
|
||||
|
||||
read_lock(&auditsc_lock);
|
||||
/* Note: audit_netlink_sem held by caller. */
|
||||
/* Note: audit_filter_mutex held by caller. */
|
||||
list_for_each_entry(e, &audit_tsklist, list) {
|
||||
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
|
||||
read_unlock(&auditsc_lock);
|
||||
@ -58,7 +107,7 @@ This means that RCU can be easily applied to the read side, as follows::
|
||||
enum audit_state state;
|
||||
|
||||
rcu_read_lock();
|
||||
/* Note: audit_netlink_sem held by caller. */
|
||||
/* Note: audit_filter_mutex held by caller. */
|
||||
list_for_each_entry_rcu(e, &audit_tsklist, list) {
|
||||
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
|
||||
rcu_read_unlock();
|
||||
@ -69,18 +118,18 @@ This means that RCU can be easily applied to the read side, as follows::
|
||||
return AUDIT_BUILD_CONTEXT;
|
||||
}
|
||||
|
||||
The read_lock() and read_unlock() calls have become rcu_read_lock()
|
||||
The ``read_lock()`` and ``read_unlock()`` calls have become rcu_read_lock()
|
||||
and rcu_read_unlock(), respectively, and the list_for_each_entry() has
|
||||
become list_for_each_entry_rcu(). The _rcu() list-traversal primitives
|
||||
become list_for_each_entry_rcu(). The **_rcu()** list-traversal primitives
|
||||
insert the read-side memory barriers that are required on DEC Alpha CPUs.
|
||||
|
||||
The changes to the update side are also straightforward. A reader-writer
|
||||
lock might be used as follows for deletion and insertion::
|
||||
The changes to the update side are also straightforward. A reader-writer lock
|
||||
might be used as follows for deletion and insertion::
|
||||
|
||||
static inline int audit_del_rule(struct audit_rule *rule,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct audit_entry *e;
|
||||
struct audit_entry *e;
|
||||
|
||||
write_lock(&auditsc_lock);
|
||||
list_for_each_entry(e, list, list) {
|
||||
@ -113,9 +162,9 @@ Following are the RCU equivalents for these two functions::
|
||||
static inline int audit_del_rule(struct audit_rule *rule,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct audit_entry *e;
|
||||
struct audit_entry *e;
|
||||
|
||||
/* Do not use the _rcu iterator here, since this is the only
|
||||
/* No need to use the _rcu iterator here, since this is the only
|
||||
* deletion routine. */
|
||||
list_for_each_entry(e, list, list) {
|
||||
if (!audit_compare_rule(rule, &e->rule)) {
|
||||
@ -139,45 +188,45 @@ Following are the RCU equivalents for these two functions::
|
||||
return 0;
|
||||
}
|
||||
|
||||
Normally, the write_lock() and write_unlock() would be replaced by
|
||||
a spin_lock() and a spin_unlock(), but in this case, all callers hold
|
||||
audit_netlink_sem, so no additional locking is required. The auditsc_lock
|
||||
can therefore be eliminated, since use of RCU eliminates the need for
|
||||
writers to exclude readers. Normally, the write_lock() calls would
|
||||
be converted into spin_lock() calls.
|
||||
Normally, the ``write_lock()`` and ``write_unlock()`` would be replaced by a
|
||||
spin_lock() and a spin_unlock(). But in this case, all callers hold
|
||||
``audit_filter_mutex``, so no additional locking is required. The
|
||||
``auditsc_lock`` can therefore be eliminated, since use of RCU eliminates the
|
||||
need for writers to exclude readers.
|
||||
|
||||
The list_del(), list_add(), and list_add_tail() primitives have been
|
||||
replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
|
||||
The _rcu() list-manipulation primitives add memory barriers that are
|
||||
needed on weakly ordered CPUs (most of them!). The list_del_rcu()
|
||||
primitive omits the pointer poisoning debug-assist code that would
|
||||
otherwise cause concurrent readers to fail spectacularly.
|
||||
The **_rcu()** list-manipulation primitives add memory barriers that are needed on
|
||||
weakly ordered CPUs (most of them!). The list_del_rcu() primitive omits the
|
||||
pointer poisoning debug-assist code that would otherwise cause concurrent
|
||||
readers to fail spectacularly.
|
||||
|
||||
So, when readers can tolerate stale data and when entries are either added
|
||||
or deleted, without in-place modification, it is very easy to use RCU!
|
||||
So, when readers can tolerate stale data and when entries are either added or
|
||||
deleted, without in-place modification, it is very easy to use RCU!
|
||||
|
||||
Example 2: Handling In-Place Updates
|
||||
|
||||
Example 3: Handling In-Place Updates
|
||||
------------------------------------
|
||||
|
||||
The system-call auditing code does not update auditing rules in place.
|
||||
However, if it did, reader-writer-locked code to do so might look as
|
||||
follows (presumably, the field_count is only permitted to decrease,
|
||||
otherwise, the added fields would need to be filled in)::
|
||||
The system-call auditing code does not update auditing rules in place. However,
|
||||
if it did, the reader-writer-locked code to do so might look as follows
|
||||
(assuming only ``field_count`` is updated, otherwise, the added fields would
|
||||
need to be filled in)::
|
||||
|
||||
static inline int audit_upd_rule(struct audit_rule *rule,
|
||||
struct list_head *list,
|
||||
__u32 newaction,
|
||||
__u32 newfield_count)
|
||||
{
|
||||
struct audit_entry *e;
|
||||
struct audit_newentry *ne;
|
||||
struct audit_entry *e;
|
||||
struct audit_entry *ne;
|
||||
|
||||
write_lock(&auditsc_lock);
|
||||
/* Note: audit_netlink_sem held by caller. */
|
||||
/* Note: audit_filter_mutex held by caller. */
|
||||
list_for_each_entry(e, list, list) {
|
||||
if (!audit_compare_rule(rule, &e->rule)) {
|
||||
e->rule.action = newaction;
|
||||
e->rule.file_count = newfield_count;
|
||||
e->rule.field_count = newfield_count;
|
||||
write_unlock(&auditsc_lock);
|
||||
return 0;
|
||||
}
|
||||
@ -188,16 +237,16 @@ otherwise, the added fields would need to be filled in)::
|
||||
|
||||
The RCU version creates a copy, updates the copy, then replaces the old
|
||||
entry with the newly updated entry. This sequence of actions, allowing
|
||||
concurrent reads while doing a copy to perform an update, is what gives
|
||||
RCU ("read-copy update") its name. The RCU code is as follows::
|
||||
concurrent reads while making a copy to perform an update, is what gives
|
||||
RCU (*read-copy update*) its name. The RCU code is as follows::
|
||||
|
||||
static inline int audit_upd_rule(struct audit_rule *rule,
|
||||
struct list_head *list,
|
||||
__u32 newaction,
|
||||
__u32 newfield_count)
|
||||
{
|
||||
struct audit_entry *e;
|
||||
struct audit_newentry *ne;
|
||||
struct audit_entry *e;
|
||||
struct audit_entry *ne;
|
||||
|
||||
list_for_each_entry(e, list, list) {
|
||||
if (!audit_compare_rule(rule, &e->rule)) {
|
||||
@ -206,7 +255,7 @@ RCU ("read-copy update") its name. The RCU code is as follows::
|
||||
return -ENOMEM;
|
||||
audit_copy_rule(&ne->rule, &e->rule);
|
||||
ne->rule.action = newaction;
|
||||
ne->rule.file_count = newfield_count;
|
||||
ne->rule.field_count = newfield_count;
|
||||
list_replace_rcu(&e->list, &ne->list);
|
||||
call_rcu(&e->rcu, audit_free_rule);
|
||||
return 0;
|
||||
@ -215,34 +264,45 @@ RCU ("read-copy update") its name. The RCU code is as follows::
|
||||
return -EFAULT; /* No matching rule */
|
||||
}
|
||||
|
||||
Again, this assumes that the caller holds audit_netlink_sem. Normally,
|
||||
the reader-writer lock would become a spinlock in this sort of code.
|
||||
Again, this assumes that the caller holds ``audit_filter_mutex``. Normally, the
|
||||
writer lock would become a spinlock in this sort of code.
|
||||
|
||||
Example 3: Eliminating Stale Data
|
||||
Another use of this pattern can be found in the openswitch driver's *connection
|
||||
tracking table* code in ``ct_limit_set()``. The table holds connection tracking
|
||||
entries and has a limit on the maximum entries. There is one such table
|
||||
per-zone and hence one *limit* per zone. The zones are mapped to their limits
|
||||
through a hashtable using an RCU-managed hlist for the hash chains. When a new
|
||||
limit is set, a new limit object is allocated and ``ct_limit_set()`` is called
|
||||
to replace the old limit object with the new one using list_replace_rcu().
|
||||
The old limit object is then freed after a grace period using kfree_rcu().
|
||||
|
||||
|
||||
Example 4: Eliminating Stale Data
|
||||
---------------------------------
|
||||
|
||||
The auditing examples above tolerate stale data, as do most algorithms
|
||||
The auditing example above tolerates stale data, as do most algorithms
|
||||
that are tracking external state. Because there is a delay from the
|
||||
time the external state changes before Linux becomes aware of the change,
|
||||
additional RCU-induced staleness is normally not a problem.
|
||||
additional RCU-induced staleness is generally not a problem.
|
||||
|
||||
However, there are many examples where stale data cannot be tolerated.
|
||||
One example in the Linux kernel is the System V IPC (see the ipc_lock()
|
||||
function in ipc/util.c). This code checks a "deleted" flag under a
|
||||
per-entry spinlock, and, if the "deleted" flag is set, pretends that the
|
||||
One example in the Linux kernel is the System V IPC (see the shm_lock()
|
||||
function in ipc/shm.c). This code checks a *deleted* flag under a
|
||||
per-entry spinlock, and, if the *deleted* flag is set, pretends that the
|
||||
entry does not exist. For this to be helpful, the search function must
|
||||
return holding the per-entry spinlock, as ipc_lock() does in fact do.
|
||||
return holding the per-entry spinlock, as shm_lock() does in fact do.
|
||||
|
||||
.. _quick_quiz:
|
||||
|
||||
Quick Quiz:
|
||||
Why does the search function need to return holding the per-entry lock for
|
||||
this deleted-flag technique to be helpful?
|
||||
For the deleted-flag technique to be helpful, why is it necessary
|
||||
to hold the per-entry lock while returning from the search function?
|
||||
|
||||
:ref:`Answer to Quick Quiz <answer_quick_quiz_list>`
|
||||
:ref:`Answer to Quick Quiz <quick_quiz_answer>`
|
||||
|
||||
If the system-call audit module were to ever need to reject stale data,
|
||||
one way to accomplish this would be to add a "deleted" flag and a "lock"
|
||||
spinlock to the audit_entry structure, and modify audit_filter_task()
|
||||
as follows::
|
||||
If the system-call audit module were to ever need to reject stale data, one way
|
||||
to accomplish this would be to add a ``deleted`` flag and a ``lock`` spinlock to the
|
||||
audit_entry structure, and modify ``audit_filter_task()`` as follows::
|
||||
|
||||
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
||||
{
|
||||
@ -267,20 +327,20 @@ as follows::
|
||||
}
|
||||
|
||||
Note that this example assumes that entries are only added and deleted.
|
||||
Additional mechanism is required to deal correctly with the
|
||||
update-in-place performed by audit_upd_rule(). For one thing,
|
||||
audit_upd_rule() would need additional memory barriers to ensure
|
||||
that the list_add_rcu() was really executed before the list_del_rcu().
|
||||
Additional mechanism is required to deal correctly with the update-in-place
|
||||
performed by ``audit_upd_rule()``. For one thing, ``audit_upd_rule()`` would
|
||||
need additional memory barriers to ensure that the list_add_rcu() was really
|
||||
executed before the list_del_rcu().
|
||||
|
||||
The audit_del_rule() function would need to set the "deleted"
|
||||
flag under the spinlock as follows::
|
||||
The ``audit_del_rule()`` function would need to set the ``deleted`` flag under the
|
||||
spinlock as follows::
|
||||
|
||||
static inline int audit_del_rule(struct audit_rule *rule,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct audit_entry *e;
|
||||
struct audit_entry *e;
|
||||
|
||||
/* Do not need to use the _rcu iterator here, since this
|
||||
/* No need to use the _rcu iterator here, since this
|
||||
* is the only deletion routine. */
|
||||
list_for_each_entry(e, list, list) {
|
||||
if (!audit_compare_rule(rule, &e->rule)) {
|
||||
@ -295,6 +355,91 @@ flag under the spinlock as follows::
|
||||
return -EFAULT; /* No matching rule */
|
||||
}
|
||||
|
||||
This too assumes that the caller holds ``audit_filter_mutex``.
|
||||
|
||||
|
||||
Example 5: Skipping Stale Objects
|
||||
---------------------------------
|
||||
|
||||
For some usecases, reader performance can be improved by skipping stale objects
|
||||
during read-side list traversal if the object in concern is pending destruction
|
||||
after one or more grace periods. One such example can be found in the timerfd
|
||||
subsystem. When a ``CLOCK_REALTIME`` clock is reprogrammed - for example due to
|
||||
setting of the system time, then all programmed timerfds that depend on this
|
||||
clock get triggered and processes waiting on them to expire are woken up in
|
||||
advance of their scheduled expiry. To facilitate this, all such timers are added
|
||||
to an RCU-managed ``cancel_list`` when they are setup in
|
||||
``timerfd_setup_cancel()``::
|
||||
|
||||
static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
|
||||
{
|
||||
spin_lock(&ctx->cancel_lock);
|
||||
if ((ctx->clockid == CLOCK_REALTIME &&
|
||||
(flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
|
||||
if (!ctx->might_cancel) {
|
||||
ctx->might_cancel = true;
|
||||
spin_lock(&cancel_lock);
|
||||
list_add_rcu(&ctx->clist, &cancel_list);
|
||||
spin_unlock(&cancel_lock);
|
||||
}
|
||||
}
|
||||
spin_unlock(&ctx->cancel_lock);
|
||||
}
|
||||
|
||||
When a timerfd is freed (fd is closed), then the ``might_cancel`` flag of the
|
||||
timerfd object is cleared, the object removed from the ``cancel_list`` and
|
||||
destroyed::
|
||||
|
||||
int timerfd_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct timerfd_ctx *ctx = file->private_data;
|
||||
|
||||
spin_lock(&ctx->cancel_lock);
|
||||
if (ctx->might_cancel) {
|
||||
ctx->might_cancel = false;
|
||||
spin_lock(&cancel_lock);
|
||||
list_del_rcu(&ctx->clist);
|
||||
spin_unlock(&cancel_lock);
|
||||
}
|
||||
spin_unlock(&ctx->cancel_lock);
|
||||
|
||||
hrtimer_cancel(&ctx->t.tmr);
|
||||
kfree_rcu(ctx, rcu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
If the ``CLOCK_REALTIME`` clock is set, for example by a time server, the
|
||||
hrtimer framework calls ``timerfd_clock_was_set()`` which walks the
|
||||
``cancel_list`` and wakes up processes waiting on the timerfd. While iterating
|
||||
the ``cancel_list``, the ``might_cancel`` flag is consulted to skip stale
|
||||
objects::
|
||||
|
||||
void timerfd_clock_was_set(void)
|
||||
{
|
||||
struct timerfd_ctx *ctx;
|
||||
unsigned long flags;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(ctx, &cancel_list, clist) {
|
||||
if (!ctx->might_cancel)
|
||||
continue;
|
||||
spin_lock_irqsave(&ctx->wqh.lock, flags);
|
||||
if (ctx->moffs != ktime_mono_to_real(0)) {
|
||||
ctx->moffs = KTIME_MAX;
|
||||
ctx->ticks++;
|
||||
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
|
||||
}
|
||||
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
The key point here is, because RCU-traversal of the ``cancel_list`` happens
|
||||
while objects are being added and removed to the list, sometimes the traversal
|
||||
can step on an object that has been removed from the list. In this example, it
|
||||
is seen that it is better to skip such objects using a flag.
|
||||
|
||||
|
||||
Summary
|
||||
-------
|
||||
|
||||
@ -303,19 +448,21 @@ the most amenable to use of RCU. The simplest case is where entries are
|
||||
either added or deleted from the data structure (or atomically modified
|
||||
in place), but non-atomic in-place modifications can be handled by making
|
||||
a copy, updating the copy, then replacing the original with the copy.
|
||||
If stale data cannot be tolerated, then a "deleted" flag may be used
|
||||
If stale data cannot be tolerated, then a *deleted* flag may be used
|
||||
in conjunction with a per-entry spinlock in order to allow the search
|
||||
function to reject newly deleted data.
|
||||
|
||||
.. _answer_quick_quiz_list:
|
||||
.. _quick_quiz_answer:
|
||||
|
||||
Answer to Quick Quiz:
|
||||
Why does the search function need to return holding the per-entry
|
||||
lock for this deleted-flag technique to be helpful?
|
||||
For the deleted-flag technique to be helpful, why is it necessary
|
||||
to hold the per-entry lock while returning from the search function?
|
||||
|
||||
If the search function drops the per-entry lock before returning,
|
||||
then the caller will be processing stale data in any case. If it
|
||||
is really OK to be processing stale data, then you don't need a
|
||||
"deleted" flag. If processing stale data really is a problem,
|
||||
*deleted* flag. If processing stale data really is a problem,
|
||||
then you need to hold the per-entry lock across all of the code
|
||||
that uses the value that was returned.
|
||||
|
||||
:ref:`Back to Quick Quiz <quick_quiz>`
|
||||
|
@ -11,8 +11,8 @@ must be long enough that any readers accessing the item being deleted have
|
||||
since dropped their references. For example, an RCU-protected deletion
|
||||
from a linked list would first remove the item from the list, wait for
|
||||
a grace period to elapse, then free the element. See the
|
||||
Documentation/RCU/listRCU.rst file for more information on using RCU with
|
||||
linked lists.
|
||||
:ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` for more information on
|
||||
using RCU with linked lists.
|
||||
|
||||
Frequently Asked Questions
|
||||
--------------------------
|
||||
@ -50,7 +50,7 @@ Frequently Asked Questions
|
||||
- If I am running on a uniprocessor kernel, which can only do one
|
||||
thing at a time, why should I wait for a grace period?
|
||||
|
||||
See the Documentation/RCU/UP.rst file for more information.
|
||||
See :ref:`Documentation/RCU/UP.rst <up_doc>` for more information.
|
||||
|
||||
- How can I see where RCU is currently used in the Linux kernel?
|
||||
|
||||
@ -68,18 +68,18 @@ Frequently Asked Questions
|
||||
|
||||
- Why the name "RCU"?
|
||||
|
||||
"RCU" stands for "read-copy update". The file Documentation/RCU/listRCU.rst
|
||||
has more information on where this name came from, search for
|
||||
"read-copy update" to find it.
|
||||
"RCU" stands for "read-copy update".
|
||||
:ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` has more information on where
|
||||
this name came from, search for "read-copy update" to find it.
|
||||
|
||||
- I hear that RCU is patented? What is with that?
|
||||
|
||||
Yes, it is. There are several known patents related to RCU,
|
||||
search for the string "Patent" in RTFP.txt to find them.
|
||||
search for the string "Patent" in Documentation/RCU/RTFP.txt to find them.
|
||||
Of these, one was allowed to lapse by the assignee, and the
|
||||
others have been contributed to the Linux kernel under GPL.
|
||||
There are now also LGPL implementations of user-level RCU
|
||||
available (http://liburcu.org/).
|
||||
available (https://liburcu.org/).
|
||||
|
||||
- I hear that RCU needs work in order to support realtime kernels?
|
||||
|
||||
@ -88,5 +88,5 @@ Frequently Asked Questions
|
||||
|
||||
- Where can I find more information on RCU?
|
||||
|
||||
See the RTFP.txt file in this directory.
|
||||
See the Documentation/RCU/RTFP.txt file.
|
||||
Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/).
|
||||
|
@ -124,9 +124,14 @@ using a dynamically allocated srcu_struct (hence "srcud-" rather than
|
||||
debugging. The final "T" entry contains the totals of the counters.
|
||||
|
||||
|
||||
USAGE
|
||||
USAGE ON SPECIFIC KERNEL BUILDS
|
||||
|
||||
The following script may be used to torture RCU:
|
||||
It is sometimes desirable to torture RCU on a specific kernel build,
|
||||
for example, when preparing to put that kernel build into production.
|
||||
In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m
|
||||
so that the test can be started using modprobe and terminated using rmmod.
|
||||
|
||||
For example, the following script may be used to torture RCU:
|
||||
|
||||
#!/bin/sh
|
||||
|
||||
@ -142,8 +147,136 @@ checked for such errors. The "rmmod" command forces a "SUCCESS",
|
||||
two are self-explanatory, while the last indicates that while there
|
||||
were no RCU failures, CPU-hotplug problems were detected.
|
||||
|
||||
However, the tools/testing/selftests/rcutorture/bin/kvm.sh script
|
||||
provides better automation, including automatic failure analysis.
|
||||
It assumes a qemu/kvm-enabled platform, and runs guest OSes out of initrd.
|
||||
See tools/testing/selftests/rcutorture/doc/initrd.txt for instructions
|
||||
on setting up such an initrd.
|
||||
|
||||
USAGE ON MAINLINE KERNELS
|
||||
|
||||
When using rcutorture to test changes to RCU itself, it is often
|
||||
necessary to build a number of kernels in order to test that change
|
||||
across a broad range of combinations of the relevant Kconfig options
|
||||
and of the relevant kernel boot parameters. In this situation, use
|
||||
of modprobe and rmmod can be quite time-consuming and error-prone.
|
||||
|
||||
Therefore, the tools/testing/selftests/rcutorture/bin/kvm.sh
|
||||
script is available for mainline testing for x86, arm64, and
|
||||
powerpc. By default, it will run the series of tests specified by
|
||||
tools/testing/selftests/rcutorture/configs/rcu/CFLIST, with each test
|
||||
running for 30 minutes within a guest OS using a minimal userspace
|
||||
supplied by an automatically generated initrd. After the tests are
|
||||
complete, the resulting build products and console output are analyzed
|
||||
for errors and the results of the runs are summarized.
|
||||
|
||||
On larger systems, rcutorture testing can be accelerated by passing the
|
||||
--cpus argument to kvm.sh. For example, on a 64-CPU system, "--cpus 43"
|
||||
would use up to 43 CPUs to run tests concurrently, which as of v5.4 would
|
||||
complete all the scenarios in two batches, reducing the time to complete
|
||||
from about eight hours to about one hour (not counting the time to build
|
||||
the sixteen kernels). The "--dryrun sched" argument will not run tests,
|
||||
but rather tell you how the tests would be scheduled into batches. This
|
||||
can be useful when working out how many CPUs to specify in the --cpus
|
||||
argument.
|
||||
|
||||
Not all changes require that all scenarios be run. For example, a change
|
||||
to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the
|
||||
--configs argument to kvm.sh as follows: "--configs 'SRCU-N SRCU-P'".
|
||||
Large systems can run multiple copies of of the full set of scenarios,
|
||||
for example, a system with 448 hardware threads can run five instances
|
||||
of the full set concurrently. To make this happen:
|
||||
|
||||
kvm.sh --cpus 448 --configs '5*CFLIST'
|
||||
|
||||
Alternatively, such a system can run 56 concurrent instances of a single
|
||||
eight-CPU scenario:
|
||||
|
||||
kvm.sh --cpus 448 --configs '56*TREE04'
|
||||
|
||||
Or 28 concurrent instances of each of two eight-CPU scenarios:
|
||||
|
||||
kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04'
|
||||
|
||||
Of course, each concurrent instance will use memory, which can be
|
||||
limited using the --memory argument, which defaults to 512M. Small
|
||||
values for memory may require disabling the callback-flooding tests
|
||||
using the --bootargs parameter discussed below.
|
||||
|
||||
Sometimes additional debugging is useful, and in such cases the --kconfig
|
||||
parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'".
|
||||
|
||||
Kernel boot arguments can also be supplied, for example, to control
|
||||
rcutorture's module parameters. For example, to test a change to RCU's
|
||||
CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'".
|
||||
This will of course result in the scripting reporting a failure, namely
|
||||
the resuling RCU CPU stall warning. As noted above, reducing memory may
|
||||
require disabling rcutorture's callback-flooding tests:
|
||||
|
||||
kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \
|
||||
--bootargs 'rcutorture.fwd_progress=0'
|
||||
|
||||
Sometimes all that is needed is a full set of kernel builds. This is
|
||||
what the --buildonly argument does.
|
||||
|
||||
Finally, the --trust-make argument allows each kernel build to reuse what
|
||||
it can from the previous kernel build.
|
||||
|
||||
There are additional more arcane arguments that are documented in the
|
||||
source code of the kvm.sh script.
|
||||
|
||||
If a run contains failures, the number of buildtime and runtime failures
|
||||
is listed at the end of the kvm.sh output, which you really should redirect
|
||||
to a file. The build products and console output of each run is kept in
|
||||
tools/testing/selftests/rcutorture/res in timestamped directories. A
|
||||
given directory can be supplied to kvm-find-errors.sh in order to have
|
||||
it cycle you through summaries of errors and full error logs. For example:
|
||||
|
||||
tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \
|
||||
tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23
|
||||
|
||||
However, it is often more convenient to access the files directly.
|
||||
Files pertaining to all scenarios in a run reside in the top-level
|
||||
directory (2020.01.20-15.54.23 in the example above), while per-scenario
|
||||
files reside in a subdirectory named after the scenario (for example,
|
||||
"TREE04"). If a given scenario ran more than once (as in "--configs
|
||||
'56*TREE04'" above), the directories corresponding to the second and
|
||||
subsequent runs of that scenario include a sequence number, for example,
|
||||
"TREE04.2", "TREE04.3", and so on.
|
||||
|
||||
The most frequently used file in the top-level directory is testid.txt.
|
||||
If the test ran in a git repository, then this file contains the commit
|
||||
that was tested and any uncommitted changes in diff format.
|
||||
|
||||
The most frequently used files in each per-scenario-run directory are:
|
||||
|
||||
.config: This file contains the Kconfig options.
|
||||
|
||||
Make.out: This contains build output for a specific scenario.
|
||||
|
||||
console.log: This contains the console output for a specific scenario.
|
||||
This file may be examined once the kernel has booted, but
|
||||
it might not exist if the build failed.
|
||||
|
||||
vmlinux: This contains the kernel, which can be useful with tools like
|
||||
objdump and gdb.
|
||||
|
||||
A number of additional files are available, but are less frequently used.
|
||||
Many are intended for debugging of rcutorture itself or of its scripting.
|
||||
|
||||
As of v5.4, a successful run with the default set of scenarios produces
|
||||
the following summary at the end of the run on a 12-CPU system:
|
||||
|
||||
SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ]
|
||||
SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ]
|
||||
SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ]
|
||||
SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ]
|
||||
TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ]
|
||||
TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ]
|
||||
TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ]
|
||||
TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198
|
||||
TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631
|
||||
TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ]
|
||||
TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844
|
||||
TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497
|
||||
CPU count limited from 16 to 12
|
||||
TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961
|
||||
TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997
|
||||
TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
|
||||
CPU count limited from 16 to 12
|
||||
TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _psi:
|
||||
|
||||
================================
|
||||
PSI - Pressure Stall Information
|
||||
================================
|
||||
|
@ -33,6 +33,12 @@ max
|
||||
a per-instance limit. If ``max=<count>`` is set then only ``<count>`` number
|
||||
of binder devices can be allocated in this binderfs instance.
|
||||
|
||||
stats
|
||||
Using ``stats=global`` enables global binder statistics.
|
||||
``stats=global`` is only available for a binderfs instance mounted in the
|
||||
initial user namespace. An attempt to use the option to mount a binderfs
|
||||
instance in another user namespace will return a permission error.
|
||||
|
||||
Allocating binder Devices
|
||||
-------------------------
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
Kernel Support for miscellaneous (your favourite) Binary Formats v1.1
|
||||
=====================================================================
|
||||
Kernel Support for miscellaneous Binary Formats (binfmt_misc)
|
||||
=============================================================
|
||||
|
||||
This Kernel feature allows you to invoke almost (for restrictions see below)
|
||||
every program by simply typing its name in the shell.
|
||||
@ -140,8 +140,8 @@ Hints
|
||||
-----
|
||||
|
||||
If you want to pass special arguments to your interpreter, you can
|
||||
write a wrapper script for it. See Documentation/admin-guide/java.rst for an
|
||||
example.
|
||||
write a wrapper script for it.
|
||||
See :doc:`Documentation/admin-guide/java.rst <./java>` for an example.
|
||||
|
||||
Your interpreter should NOT look in the PATH for the filename; the kernel
|
||||
passes it the full filename (or the file descriptor) to use. Using ``$PATH`` can
|
||||
|
@ -251,8 +251,6 @@ line of text and contains the following stats separated by whitespace:
|
||||
|
||||
================ =============================================================
|
||||
orig_data_size uncompressed size of data stored in this disk.
|
||||
This excludes same-element-filled pages (same_pages) since
|
||||
no memory is allocated for them.
|
||||
Unit: bytes
|
||||
compr_data_size compressed size of data stored in this disk
|
||||
mem_used_total the amount of memory allocated for this disk. This
|
||||
|
@ -23,7 +23,7 @@ of dot-connected-words, and key and value are connected by ``=``. The value
|
||||
has to be terminated by semi-colon (``;``) or newline (``\n``).
|
||||
For array value, array entries are separated by comma (``,``). ::
|
||||
|
||||
KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
|
||||
KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
|
||||
|
||||
Unlike the kernel command line syntax, spaces are OK around the comma and ``=``.
|
||||
|
||||
|
@ -223,6 +223,17 @@ cpu_online_mask using a CPU hotplug notifier, and the mems file
|
||||
automatically tracks the value of node_states[N_MEMORY]--i.e.,
|
||||
nodes with memory--using the cpuset_track_online_nodes() hook.
|
||||
|
||||
The cpuset.effective_cpus and cpuset.effective_mems files are
|
||||
normally read-only copies of cpuset.cpus and cpuset.mems files
|
||||
respectively. If the cpuset cgroup filesystem is mounted with the
|
||||
special "cpuset_v2_mode" option, the behavior of these files will become
|
||||
similar to the corresponding files in cpuset v2. In other words, hotplug
|
||||
events will not change cpuset.cpus and cpuset.mems. Those events will
|
||||
only affect cpuset.effective_cpus and cpuset.effective_mems which show
|
||||
the actual cpus and memory nodes that are currently used by this cpuset.
|
||||
See Documentation/admin-guide/cgroup-v2.rst for more information about
|
||||
cpuset v2 behavior.
|
||||
|
||||
|
||||
1.4 What are exclusive cpusets ?
|
||||
--------------------------------
|
||||
|
@ -2,13 +2,6 @@
|
||||
HugeTLB Controller
|
||||
==================
|
||||
|
||||
The HugeTLB controller allows to limit the HugeTLB usage per control group and
|
||||
enforces the controller limit during page fault. Since HugeTLB doesn't
|
||||
support page reclaim, enforcing the limit at page fault time implies that,
|
||||
the application will get SIGBUS signal if it tries to access HugeTLB pages
|
||||
beyond its limit. This requires the application to know beforehand how much
|
||||
HugeTLB pages it would require for its use.
|
||||
|
||||
HugeTLB controller can be created by first mounting the cgroup filesystem.
|
||||
|
||||
# mount -t cgroup -o hugetlb none /sys/fs/cgroup
|
||||
@ -28,10 +21,14 @@ process (bash) into it.
|
||||
|
||||
Brief summary of control files::
|
||||
|
||||
hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage
|
||||
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
|
||||
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
|
||||
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
|
||||
hugetlb.<hugepagesize>.rsvd.limit_in_bytes # set/show limit of "hugepagesize" hugetlb reservations
|
||||
hugetlb.<hugepagesize>.rsvd.max_usage_in_bytes # show max "hugepagesize" hugetlb reservations and no-reserve faults
|
||||
hugetlb.<hugepagesize>.rsvd.usage_in_bytes # show current reservations and no-reserve faults for "hugepagesize" hugetlb
|
||||
hugetlb.<hugepagesize>.rsvd.failcnt # show the number of allocation failure due to HugeTLB reservation limit
|
||||
hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb faults
|
||||
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
|
||||
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
|
||||
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit
|
||||
|
||||
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
|
||||
files include::
|
||||
@ -40,11 +37,95 @@ files include::
|
||||
hugetlb.1GB.max_usage_in_bytes
|
||||
hugetlb.1GB.usage_in_bytes
|
||||
hugetlb.1GB.failcnt
|
||||
hugetlb.1GB.rsvd.limit_in_bytes
|
||||
hugetlb.1GB.rsvd.max_usage_in_bytes
|
||||
hugetlb.1GB.rsvd.usage_in_bytes
|
||||
hugetlb.1GB.rsvd.failcnt
|
||||
hugetlb.64KB.limit_in_bytes
|
||||
hugetlb.64KB.max_usage_in_bytes
|
||||
hugetlb.64KB.usage_in_bytes
|
||||
hugetlb.64KB.failcnt
|
||||
hugetlb.64KB.rsvd.limit_in_bytes
|
||||
hugetlb.64KB.rsvd.max_usage_in_bytes
|
||||
hugetlb.64KB.rsvd.usage_in_bytes
|
||||
hugetlb.64KB.rsvd.failcnt
|
||||
hugetlb.32MB.limit_in_bytes
|
||||
hugetlb.32MB.max_usage_in_bytes
|
||||
hugetlb.32MB.usage_in_bytes
|
||||
hugetlb.32MB.failcnt
|
||||
hugetlb.32MB.rsvd.limit_in_bytes
|
||||
hugetlb.32MB.rsvd.max_usage_in_bytes
|
||||
hugetlb.32MB.rsvd.usage_in_bytes
|
||||
hugetlb.32MB.rsvd.failcnt
|
||||
|
||||
|
||||
1. Page fault accounting
|
||||
|
||||
hugetlb.<hugepagesize>.limit_in_bytes
|
||||
hugetlb.<hugepagesize>.max_usage_in_bytes
|
||||
hugetlb.<hugepagesize>.usage_in_bytes
|
||||
hugetlb.<hugepagesize>.failcnt
|
||||
|
||||
The HugeTLB controller allows users to limit the HugeTLB usage (page fault) per
|
||||
control group and enforces the limit during page fault. Since HugeTLB
|
||||
doesn't support page reclaim, enforcing the limit at page fault time implies
|
||||
that, the application will get SIGBUS signal if it tries to fault in HugeTLB
|
||||
pages beyond its limit. Therefore the application needs to know exactly how many
|
||||
HugeTLB pages it uses before hand, and the sysadmin needs to make sure that
|
||||
there are enough available on the machine for all the users to avoid processes
|
||||
getting SIGBUS.
|
||||
|
||||
|
||||
2. Reservation accounting
|
||||
|
||||
hugetlb.<hugepagesize>.rsvd.limit_in_bytes
|
||||
hugetlb.<hugepagesize>.rsvd.max_usage_in_bytes
|
||||
hugetlb.<hugepagesize>.rsvd.usage_in_bytes
|
||||
hugetlb.<hugepagesize>.rsvd.failcnt
|
||||
|
||||
The HugeTLB controller allows to limit the HugeTLB reservations per control
|
||||
group and enforces the controller limit at reservation time and at the fault of
|
||||
HugeTLB memory for which no reservation exists. Since reservation limits are
|
||||
enforced at reservation time (on mmap or shget), reservation limits never causes
|
||||
the application to get SIGBUS signal if the memory was reserved before hand. For
|
||||
MAP_NORESERVE allocations, the reservation limit behaves the same as the fault
|
||||
limit, enforcing memory usage at fault time and causing the application to
|
||||
receive a SIGBUS if it's crossing its limit.
|
||||
|
||||
Reservation limits are superior to page fault limits described above, since
|
||||
reservation limits are enforced at reservation time (on mmap or shget), and
|
||||
never causes the application to get SIGBUS signal if the memory was reserved
|
||||
before hand. This allows for easier fallback to alternatives such as
|
||||
non-HugeTLB memory for example. In the case of page fault accounting, it's very
|
||||
hard to avoid processes getting SIGBUS since the sysadmin needs precisely know
|
||||
the HugeTLB usage of all the tasks in the system and make sure there is enough
|
||||
pages to satisfy all requests. Avoiding tasks getting SIGBUS on overcommited
|
||||
systems is practically impossible with page fault accounting.
|
||||
|
||||
|
||||
3. Caveats with shared memory
|
||||
|
||||
For shared HugeTLB memory, both HugeTLB reservation and page faults are charged
|
||||
to the first task that causes the memory to be reserved or faulted, and all
|
||||
subsequent uses of this reserved or faulted memory is done without charging.
|
||||
|
||||
Shared HugeTLB memory is only uncharged when it is unreserved or deallocated.
|
||||
This is usually when the HugeTLB file is deleted, and not when the task that
|
||||
caused the reservation or fault has exited.
|
||||
|
||||
|
||||
4. Caveats with HugeTLB cgroup offline.
|
||||
|
||||
When a HugeTLB cgroup goes offline with some reservations or faults still
|
||||
charged to it, the behavior is as follows:
|
||||
|
||||
- The fault charges are charged to the parent HugeTLB cgroup (reparented),
|
||||
- the reservation charges remain on the offline HugeTLB cgroup.
|
||||
|
||||
This means that if a HugeTLB cgroup gets offlined while there is still HugeTLB
|
||||
reservations charged to it, that cgroup persists as a zombie until all HugeTLB
|
||||
reservations are uncharged. HugeTLB reservations behave in this manner to match
|
||||
the memory controller whose cgroups also persist as zombie until all charged
|
||||
memory is uncharged. Also, the tracking of HugeTLB reservations is a bit more
|
||||
complex compared to the tracking of HugeTLB faults, so it is significantly
|
||||
harder to reparent reservations at offline time.
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _cgroup-v1:
|
||||
|
||||
========================
|
||||
Control Groups version 1
|
||||
========================
|
||||
|
@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and
|
||||
conventions of cgroup v2. It describes all userland-visible aspects
|
||||
of cgroup including core and specific controller behaviors. All
|
||||
future changes must be reflected in this document. Documentation for
|
||||
v1 is available under Documentation/admin-guide/cgroup-v1/.
|
||||
v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgroup-v1>`.
|
||||
|
||||
.. CONTENTS
|
||||
|
||||
@ -188,6 +188,17 @@ cgroup v2 currently supports the following mount options.
|
||||
modified through remount from the init namespace. The mount
|
||||
option is ignored on non-init namespace mounts.
|
||||
|
||||
memory_recursiveprot
|
||||
|
||||
Recursively apply memory.min and memory.low protection to
|
||||
entire subtrees, without requiring explicit downward
|
||||
propagation into leaf cgroups. This allows protecting entire
|
||||
subtrees from one another, while retaining free competition
|
||||
within those subtrees. This should have been the default
|
||||
behavior but is a mount-option to avoid regressing setups
|
||||
relying on the original semantics (e.g. specifying bogusly
|
||||
high 'bypass' protection values at higher tree levels).
|
||||
|
||||
|
||||
Organizing Processes and Threads
|
||||
--------------------------------
|
||||
@ -1023,7 +1034,7 @@ All time durations are in microseconds.
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for CPU. See
|
||||
Documentation/accounting/psi.rst for details.
|
||||
:ref:`Documentation/accounting/psi.rst <psi>` for details.
|
||||
|
||||
cpu.uclamp.min
|
||||
A read-write single value file which exists on non-root cgroups.
|
||||
@ -1103,7 +1114,7 @@ PAGE_SIZE multiple when read back.
|
||||
proportionally to the overage, reducing reclaim pressure for
|
||||
smaller overages.
|
||||
|
||||
Effective min boundary is limited by memory.min values of
|
||||
Effective min boundary is limited by memory.min values of
|
||||
all ancestor cgroups. If there is memory.min overcommitment
|
||||
(child cgroup or cgroups are requiring more protected memory
|
||||
than parent will allow), then each child cgroup will get
|
||||
@ -1313,53 +1324,41 @@ PAGE_SIZE multiple when read back.
|
||||
Number of major page faults incurred
|
||||
|
||||
workingset_refault
|
||||
|
||||
Number of refaults of previously evicted pages
|
||||
|
||||
workingset_activate
|
||||
|
||||
Number of refaulted pages that were immediately activated
|
||||
|
||||
workingset_nodereclaim
|
||||
|
||||
Number of times a shadow node has been reclaimed
|
||||
|
||||
pgrefill
|
||||
|
||||
Amount of scanned pages (in an active LRU list)
|
||||
|
||||
pgscan
|
||||
|
||||
Amount of scanned pages (in an inactive LRU list)
|
||||
|
||||
pgsteal
|
||||
|
||||
Amount of reclaimed pages
|
||||
|
||||
pgactivate
|
||||
|
||||
Amount of pages moved to the active LRU list
|
||||
|
||||
pgdeactivate
|
||||
|
||||
Amount of pages moved to the inactive LRU list
|
||||
|
||||
pglazyfree
|
||||
|
||||
Amount of pages postponed to be freed under memory pressure
|
||||
|
||||
pglazyfreed
|
||||
|
||||
Amount of reclaimed lazyfree pages
|
||||
|
||||
thp_fault_alloc
|
||||
|
||||
Number of transparent hugepages which were allocated to satisfy
|
||||
a page fault, including COW faults. This counter is not present
|
||||
when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
||||
|
||||
thp_collapse_alloc
|
||||
|
||||
Number of transparent hugepages which were allocated to allow
|
||||
collapsing an existing range of pages. This counter is not
|
||||
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
||||
@ -1403,7 +1402,7 @@ PAGE_SIZE multiple when read back.
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for memory. See
|
||||
Documentation/accounting/psi.rst for details.
|
||||
:ref:`Documentation/accounting/psi.rst <psi>` for details.
|
||||
|
||||
|
||||
Usage Guidelines
|
||||
@ -1478,7 +1477,7 @@ IO Interface Files
|
||||
dios Number of discard IOs
|
||||
====== =====================
|
||||
|
||||
An example read output follows:
|
||||
An example read output follows::
|
||||
|
||||
8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
|
||||
8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
|
||||
@ -1643,7 +1642,7 @@ IO Interface Files
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for IO. See
|
||||
Documentation/accounting/psi.rst for details.
|
||||
:ref:`Documentation/accounting/psi.rst <psi>` for details.
|
||||
|
||||
|
||||
Writeback
|
||||
@ -1853,7 +1852,7 @@ Cpuset Interface Files
|
||||
from the requested CPUs.
|
||||
|
||||
The CPU numbers are comma-separated numbers or ranges.
|
||||
For example:
|
||||
For example::
|
||||
|
||||
# cat cpuset.cpus
|
||||
0-4,6,8-10
|
||||
@ -1892,7 +1891,7 @@ Cpuset Interface Files
|
||||
from the requested memory nodes.
|
||||
|
||||
The memory node numbers are comma-separated numbers or ranges.
|
||||
For example:
|
||||
For example::
|
||||
|
||||
# cat cpuset.mems
|
||||
0-1,3
|
||||
|
@ -54,6 +54,9 @@ If you make a mistake with the syntax, the write will fail thus::
|
||||
<debugfs>/dynamic_debug/control
|
||||
-bash: echo: write error: Invalid argument
|
||||
|
||||
Note, for systems without 'debugfs' enabled, the control file can be
|
||||
found in ``/proc/dynamic_debug/control``.
|
||||
|
||||
Viewing Dynamic Debug Behaviour
|
||||
===============================
|
||||
|
||||
|
@ -11,11 +11,13 @@ Today, with the advent of Kernel Mode Setting, a graphics board is
|
||||
either correctly working because all components follow the standards -
|
||||
or the computer is unusable, because the screen remains dark after
|
||||
booting or it displays the wrong area. Cases when this happens are:
|
||||
|
||||
- The graphics board does not recognize the monitor.
|
||||
- The graphics board is unable to detect any EDID data.
|
||||
- The graphics board incorrectly forwards EDID data to the driver.
|
||||
- The monitor sends no or bogus EDID data.
|
||||
- A KVM sends its own EDID data instead of querying the connected monitor.
|
||||
|
||||
Adding the kernel parameter "nomodeset" helps in most cases, but causes
|
||||
restrictions later on.
|
||||
|
||||
@ -32,7 +34,7 @@ individual data for a specific misbehaving monitor, commented sources
|
||||
and a Makefile environment are given here.
|
||||
|
||||
To create binary EDID and C source code files from the existing data
|
||||
material, simply type "make".
|
||||
material, simply type "make" in tools/edid/.
|
||||
|
||||
If you want to create your own EDID file, copy the file 1024x768.S,
|
||||
replace the settings with your own data and add a new target to the
|
@ -136,8 +136,6 @@ enables the mitigation by default.
|
||||
The mitigation can be controlled at boot time via a kernel command line option.
|
||||
See :ref:`taa_mitigation_control_command_line`.
|
||||
|
||||
.. _virt_mechanism:
|
||||
|
||||
Virtualization mitigation
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
@ -75,6 +75,7 @@ configure specific aspects of kernel behavior to your liking.
|
||||
cputopology
|
||||
dell_rbu
|
||||
device-mapper/index
|
||||
edid
|
||||
efi-stub
|
||||
ext4
|
||||
nfs/index
|
||||
|
@ -100,7 +100,7 @@ Field 10 -- # of milliseconds spent doing I/Os (unsigned int)
|
||||
|
||||
Since 5.0 this field counts jiffies when at least one request was
|
||||
started or completed. If request runs more than 2 jiffies then some
|
||||
I/O time will not be accounted unless there are other requests.
|
||||
I/O time might be not accounted in case of concurrent requests.
|
||||
|
||||
Field 11 -- weighted # of milliseconds spent doing I/Os (unsigned int)
|
||||
This field is incremented at each I/O start, I/O completion, I/O
|
||||
@ -143,6 +143,9 @@ are summed (possibly overflowing the unsigned long variable they are
|
||||
summed to) and the result given to the user. There is no convenient
|
||||
user interface for accessing the per-CPU counters themselves.
|
||||
|
||||
Since 4.19 request times are measured with nanoseconds precision and
|
||||
truncated to milliseconds before showing in this interface.
|
||||
|
||||
Disks vs Partitions
|
||||
-------------------
|
||||
|
||||
|
@ -22,11 +22,13 @@
|
||||
default: 0
|
||||
|
||||
acpi_backlight= [HW,ACPI]
|
||||
acpi_backlight=vendor
|
||||
acpi_backlight=video
|
||||
If set to vendor, prefer vendor specific driver
|
||||
{ vendor | video | native | none }
|
||||
If set to vendor, prefer vendor-specific driver
|
||||
(e.g. thinkpad_acpi, sony_acpi, etc.) instead
|
||||
of the ACPI video.ko driver.
|
||||
If set to video, use the ACPI video.ko driver.
|
||||
If set to native, use the device's native backlight mode.
|
||||
If set to none, disable the ACPI backlight interface.
|
||||
|
||||
acpi_force_32bit_fadt_addr
|
||||
force FADT to use 32 bit addresses rather than the
|
||||
@ -450,6 +452,9 @@
|
||||
bert_disable [ACPI]
|
||||
Disable BERT OS support on buggy BIOSes.
|
||||
|
||||
bgrt_disable [ACPI][X86]
|
||||
Disable BGRT to avoid flickering OEM logo.
|
||||
|
||||
bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards)
|
||||
bttv.radio= Most important insmod options are available as
|
||||
kernel args too.
|
||||
@ -522,6 +527,7 @@
|
||||
Default value is set via a kernel config option.
|
||||
Value can be changed at runtime via
|
||||
/sys/fs/selinux/checkreqprot.
|
||||
Setting checkreqprot to 1 is deprecated.
|
||||
|
||||
cio_ignore= [S390]
|
||||
See Documentation/s390/common_io.rst for details.
|
||||
@ -679,7 +685,7 @@
|
||||
coredump_filter=
|
||||
[KNL] Change the default value for
|
||||
/proc/<pid>/coredump_filter.
|
||||
See also Documentation/filesystems/proc.txt.
|
||||
See also Documentation/filesystems/proc.rst.
|
||||
|
||||
coresight_cpu_debug.enable
|
||||
[ARM,ARM64]
|
||||
@ -956,7 +962,7 @@
|
||||
edid/1680x1050.bin, or edid/1920x1080.bin is given
|
||||
and no file with the same name exists. Details and
|
||||
instructions how to build your own EDID data are
|
||||
available in Documentation/driver-api/edid.rst. An EDID
|
||||
available in Documentation/admin-guide/edid.rst. An EDID
|
||||
data set will only be used for a particular connector,
|
||||
if its name and a colon are prepended to the EDID
|
||||
name. Each connector may use a unique EDID data
|
||||
@ -986,10 +992,6 @@
|
||||
Documentation/admin-guide/dynamic-debug-howto.rst
|
||||
for details.
|
||||
|
||||
nompx [X86] Disables Intel Memory Protection Extensions.
|
||||
See Documentation/x86/intel_mpx.rst for more
|
||||
information about the feature.
|
||||
|
||||
nopku [X86] Disable Memory Protection Keys CPU feature found
|
||||
in some Intel CPUs.
|
||||
|
||||
@ -1099,6 +1101,12 @@
|
||||
A valid base address must be provided, and the serial
|
||||
port must already be setup and configured.
|
||||
|
||||
ec_imx21,<addr>
|
||||
ec_imx6q,<addr>
|
||||
Start an early, polled-mode, output-only console on the
|
||||
Freescale i.MX UART at the specified address. The UART
|
||||
must already be setup and configured.
|
||||
|
||||
ar3700_uart,<addr>
|
||||
Start an early, polled-mode console on the
|
||||
Armada 3700 serial port at the specified
|
||||
@ -1354,6 +1362,24 @@
|
||||
can be changed at run time by the max_graph_depth file
|
||||
in the tracefs tracing directory. default: 0 (no limit)
|
||||
|
||||
fw_devlink= [KNL] Create device links between consumer and supplier
|
||||
devices by scanning the firmware to infer the
|
||||
consumer/supplier relationships. This feature is
|
||||
especially useful when drivers are loaded as modules as
|
||||
it ensures proper ordering of tasks like device probing
|
||||
(suppliers first, then consumers), supplier boot state
|
||||
clean up (only after all consumers have probed),
|
||||
suspend/resume & runtime PM (consumers first, then
|
||||
suppliers).
|
||||
Format: { off | permissive | on | rpm }
|
||||
off -- Don't create device links from firmware info.
|
||||
permissive -- Create device links from firmware info
|
||||
but use it only for ordering boot state clean
|
||||
up (sync_state() calls).
|
||||
on -- Create device links from firmware info and use it
|
||||
to enforce probe and suspend/resume ordering.
|
||||
rpm -- Like "on", but also use to order runtime PM.
|
||||
|
||||
gamecon.map[2|3]=
|
||||
[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
|
||||
support via parallel port (up to 5 devices per port)
|
||||
@ -1445,6 +1471,14 @@
|
||||
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
|
||||
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
|
||||
|
||||
hugetlb_cma= [HW] The size of a cma area used for allocation
|
||||
of gigantic hugepages.
|
||||
Format: nn[KMGTPE]
|
||||
|
||||
Reserve a cma area of given size and allocate gigantic
|
||||
hugepages using the cma allocator. If enabled, the
|
||||
boot-time allocation of gigantic hugepages is skipped.
|
||||
|
||||
hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
|
||||
hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
|
||||
On x86-64 and powerpc, this option can be specified
|
||||
@ -1779,7 +1813,7 @@
|
||||
provided by tboot because it makes the system
|
||||
vulnerable to DMA attacks.
|
||||
nobounce [Default off]
|
||||
Disable bounce buffer for unstrusted devices such as
|
||||
Disable bounce buffer for untrusted devices such as
|
||||
the Thunderbolt devices. This will treat the untrusted
|
||||
devices as the trusted ones, hence might expose security
|
||||
risks of DMA attacks.
|
||||
@ -1883,7 +1917,7 @@
|
||||
No delay
|
||||
|
||||
ip= [IP_PNP]
|
||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
||||
See Documentation/admin-guide/nfs/nfsroot.rst.
|
||||
|
||||
ipcmni_extend [KNL] Extend the maximum number of unique System V
|
||||
IPC identifiers from 32,768 to 16,777,216.
|
||||
@ -2543,13 +2577,22 @@
|
||||
For details see: Documentation/admin-guide/hw-vuln/mds.rst
|
||||
|
||||
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
||||
Amount of memory to be used when the kernel is not able
|
||||
to see the whole system memory or for test.
|
||||
Amount of memory to be used in cases as follows:
|
||||
|
||||
1 for test;
|
||||
2 when the kernel is not able to see the whole system memory;
|
||||
3 memory that lies after 'mem=' boundary is excluded from
|
||||
the hypervisor, then assigned to KVM guests.
|
||||
|
||||
[X86] Work as limiting max address. Use together
|
||||
with memmap= to avoid physical address space collisions.
|
||||
Without memmap= PCI devices could be placed at addresses
|
||||
belonging to unused RAM.
|
||||
|
||||
Note that this only takes effects during boot time since
|
||||
in above case 3, memory may need be hot added after boot
|
||||
if system memory of hypervisor is not sufficient.
|
||||
|
||||
mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
|
||||
memory.
|
||||
|
||||
@ -2795,7 +2838,7 @@
|
||||
<name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>]
|
||||
|
||||
mtdparts= [MTD]
|
||||
See drivers/mtd/cmdlinepart.c.
|
||||
See drivers/mtd/parsers/cmdlinepart.c
|
||||
|
||||
multitce=off [PPC] This parameter disables the use of the pSeries
|
||||
firmware feature for updating multiple TCE entries
|
||||
@ -2853,13 +2896,13 @@
|
||||
Default value is 0.
|
||||
|
||||
nfsaddrs= [NFS] Deprecated. Use ip= instead.
|
||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
||||
See Documentation/admin-guide/nfs/nfsroot.rst.
|
||||
|
||||
nfsroot= [NFS] nfs root filesystem for disk-less boxes.
|
||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
||||
See Documentation/admin-guide/nfs/nfsroot.rst.
|
||||
|
||||
nfsrootdebug [NFS] enable nfsroot debugging messages.
|
||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
||||
See Documentation/admin-guide/nfs/nfsroot.rst.
|
||||
|
||||
nfs.callback_nr_threads=
|
||||
[NFSv4] set the total number of threads that the
|
||||
@ -3174,7 +3217,7 @@
|
||||
[X86,PV_OPS] Disable paravirtualized VMware scheduler
|
||||
clock and use the default one.
|
||||
|
||||
no-steal-acc [X86,KVM,ARM64] Disable paravirtualized steal time
|
||||
no-steal-acc [X86,PV_OPS,ARM64] Disable paravirtualized steal time
|
||||
accounting. steal time is computed, but won't
|
||||
influence scheduler behaviour
|
||||
|
||||
@ -3285,12 +3328,6 @@
|
||||
This can be set from sysctl after boot.
|
||||
See Documentation/admin-guide/sysctl/vm.rst for details.
|
||||
|
||||
of_devlink [OF, KNL] Create device links between consumer and
|
||||
supplier devices by scanning the devictree to infer the
|
||||
consumer/supplier relationships. A consumer device
|
||||
will not be probed until all the supplier devices have
|
||||
probed successfully.
|
||||
|
||||
ohci1394_dma=early [HW] enable debugging via the ohci1394 driver.
|
||||
See Documentation/debugging-via-ohci1394.txt for more
|
||||
info.
|
||||
@ -3698,6 +3735,9 @@
|
||||
Override pmtimer IOPort with a hex value.
|
||||
e.g. pmtmr=0x508
|
||||
|
||||
pm_debug_messages [SUSPEND,KNL]
|
||||
Enable suspend/resume debug messages during boot up.
|
||||
|
||||
pnp.debug=1 [PNP]
|
||||
Enable PNP debug messages (depends on the
|
||||
CONFIG_PNP_DEBUG_MESSAGES option). Change at run-time
|
||||
@ -3799,6 +3839,11 @@
|
||||
before loading.
|
||||
See Documentation/admin-guide/blockdev/ramdisk.rst.
|
||||
|
||||
prot_virt= [S390] enable hosting protected virtual machines
|
||||
isolated from the hypervisor (if hardware supports
|
||||
that).
|
||||
Format: <bool>
|
||||
|
||||
psi= [KNL] Enable or disable pressure stall information
|
||||
tracking.
|
||||
Format: <bool>
|
||||
@ -3984,6 +4029,15 @@
|
||||
Set threshold of queued RCU callbacks below which
|
||||
batch limiting is re-enabled.
|
||||
|
||||
rcutree.qovld= [KNL]
|
||||
Set threshold of queued RCU callbacks beyond which
|
||||
RCU's force-quiescent-state scan will aggressively
|
||||
enlist help from cond_resched() and sched IPIs to
|
||||
help CPUs more quickly reach quiescent states.
|
||||
Set to less than zero to make this be set based
|
||||
on rcutree.qhimark at boot time and to zero to
|
||||
disable more aggressive help enlistment.
|
||||
|
||||
rcutree.rcu_idle_gp_delay= [KNL]
|
||||
Set wakeup interval for idle CPUs that have
|
||||
RCU callbacks (RCU_FAST_NO_HZ=y).
|
||||
@ -4199,6 +4253,12 @@
|
||||
rcupdate.rcu_cpu_stall_suppress= [KNL]
|
||||
Suppress RCU CPU stall warning messages.
|
||||
|
||||
rcupdate.rcu_cpu_stall_suppress_at_boot= [KNL]
|
||||
Suppress RCU CPU stall warning messages and
|
||||
rcutorture writer stall warnings that occur
|
||||
during early boot, that is, during the time
|
||||
before the init task is spawned.
|
||||
|
||||
rcupdate.rcu_cpu_stall_timeout= [KNL]
|
||||
Set timeout for RCU CPU stall warning messages.
|
||||
|
||||
@ -4392,6 +4452,22 @@
|
||||
incurs a small amount of overhead in the scheduler
|
||||
but is useful for debugging and performance tuning.
|
||||
|
||||
sched_thermal_decay_shift=
|
||||
[KNL, SMP] Set a decay shift for scheduler thermal
|
||||
pressure signal. Thermal pressure signal follows the
|
||||
default decay period of other scheduler pelt
|
||||
signals(usually 32 ms but configurable). Setting
|
||||
sched_thermal_decay_shift will left shift the decay
|
||||
period for the thermal pressure signal by the shift
|
||||
value.
|
||||
i.e. with the default pelt decay period of 32 ms
|
||||
sched_thermal_decay_shift thermal pressure decay pr
|
||||
1 64 ms
|
||||
2 128 ms
|
||||
and so on.
|
||||
Format: integer between 0 and 10
|
||||
Default is 0.
|
||||
|
||||
skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
|
||||
xtime_lock contention on larger systems, and/or RCU lock
|
||||
contention on all systems with CONFIG_MAXSMP set.
|
||||
@ -4514,10 +4590,10 @@
|
||||
Format: <integer>
|
||||
|
||||
A nonzero value instructs the soft-lockup detector
|
||||
to panic the machine when a soft-lockup occurs. This
|
||||
is also controlled by CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC
|
||||
which is the respective build-time switch to that
|
||||
functionality.
|
||||
to panic the machine when a soft-lockup occurs. It is
|
||||
also controlled by the kernel.softlockup_panic sysctl
|
||||
and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
|
||||
respective build-time switch to that functionality.
|
||||
|
||||
softlockup_all_cpu_backtrace=
|
||||
[KNL] Should the soft-lockup detector generate
|
||||
@ -4659,6 +4735,28 @@
|
||||
spia_pedr=
|
||||
spia_peddr=
|
||||
|
||||
split_lock_detect=
|
||||
[X86] Enable split lock detection
|
||||
|
||||
When enabled (and if hardware support is present), atomic
|
||||
instructions that access data across cache line
|
||||
boundaries will result in an alignment check exception.
|
||||
|
||||
off - not enabled
|
||||
|
||||
warn - the kernel will emit rate limited warnings
|
||||
about applications triggering the #AC
|
||||
exception. This mode is the default on CPUs
|
||||
that supports split lock detection.
|
||||
|
||||
fatal - the kernel will send SIGBUS to applications
|
||||
that trigger the #AC exception.
|
||||
|
||||
If an #AC exception is hit in the kernel or in
|
||||
firmware (i.e. not while executing in user mode)
|
||||
the kernel will oops in either "warn" or "fatal"
|
||||
mode.
|
||||
|
||||
srcutree.counter_wrap_check [KNL]
|
||||
Specifies how frequently to check for
|
||||
grace-period sequence counter wrap for the
|
||||
@ -4871,6 +4969,10 @@
|
||||
topology updates sent by the hypervisor to this
|
||||
LPAR.
|
||||
|
||||
torture.disable_onoff_at_boot= [KNL]
|
||||
Prevent the CPU-hotplug component of torturing
|
||||
until after init has spawned.
|
||||
|
||||
tp720= [HW,PS2]
|
||||
|
||||
tpm_suspend_pcr=[HW,TPM]
|
||||
|
@ -234,7 +234,7 @@ To reduce its OS jitter, do any of the following:
|
||||
Such a workqueue can be confined to a given subset of the
|
||||
CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
|
||||
files. The set of WQ_SYSFS workqueues can be displayed using
|
||||
"ls sys/devices/virtual/workqueue". That said, the workqueues
|
||||
"ls /sys/devices/virtual/workqueue". That said, the workqueues
|
||||
maintainer would like to caution people against indiscriminately
|
||||
sprinkling WQ_SYSFS across all the workqueues. The reason for
|
||||
caution is that it is easy to add WQ_SYSFS, but because sysfs is
|
||||
|
@ -310,6 +310,11 @@ thp_fault_fallback
|
||||
is incremented if a page fault fails to allocate
|
||||
a huge page and instead falls back to using small pages.
|
||||
|
||||
thp_fault_fallback_charge
|
||||
is incremented if a page fault fails to charge a huge page and
|
||||
instead falls back to using small pages even though the
|
||||
allocation was successful.
|
||||
|
||||
thp_collapse_alloc_failed
|
||||
is incremented if khugepaged found a range
|
||||
of pages that should be collapsed into one huge page but failed
|
||||
@ -319,6 +324,15 @@ thp_file_alloc
|
||||
is incremented every time a file huge page is successfully
|
||||
allocated.
|
||||
|
||||
thp_file_fallback
|
||||
is incremented if a file huge page is attempted to be allocated
|
||||
but fails and instead falls back to using small pages.
|
||||
|
||||
thp_file_fallback_charge
|
||||
is incremented if a file huge page cannot be charged and instead
|
||||
falls back to using small pages even though the allocation was
|
||||
successful.
|
||||
|
||||
thp_file_mapped
|
||||
is incremented every time a file huge page is mapped into
|
||||
user address space.
|
||||
|
@ -108,6 +108,57 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
|
||||
half copied page since it'll keep userfaulting until the copy has
|
||||
finished.
|
||||
|
||||
Notes:
|
||||
|
||||
- If you requested UFFDIO_REGISTER_MODE_MISSING when registering then
|
||||
you must provide some kind of page in your thread after reading from
|
||||
the uffd. You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE.
|
||||
The normal behavior of the OS automatically providing a zero page on
|
||||
an annonymous mmaping is not in place.
|
||||
|
||||
- None of the page-delivering ioctls default to the range that you
|
||||
registered with. You must fill in all fields for the appropriate
|
||||
ioctl struct including the range.
|
||||
|
||||
- You get the address of the access that triggered the missing page
|
||||
event out of a struct uffd_msg that you read in the thread from the
|
||||
uffd. You can supply as many pages as you want with UFFDIO_COPY or
|
||||
UFFDIO_ZEROPAGE. Keep in mind that unless you used DONTWAKE then
|
||||
the first of any of those IOCTLs wakes up the faulting thread.
|
||||
|
||||
- Be sure to test for all errors including (pollfd[0].revents &
|
||||
POLLERR). This can happen, e.g. when ranges supplied were
|
||||
incorrect.
|
||||
|
||||
Write Protect Notifications
|
||||
---------------------------
|
||||
|
||||
This is equivalent to (but faster than) using mprotect and a SIGSEGV
|
||||
signal handler.
|
||||
|
||||
Firstly you need to register a range with UFFDIO_REGISTER_MODE_WP.
|
||||
Instead of using mprotect(2) you use ioctl(uffd, UFFDIO_WRITEPROTECT,
|
||||
struct *uffdio_writeprotect) while mode = UFFDIO_WRITEPROTECT_MODE_WP
|
||||
in the struct passed in. The range does not default to and does not
|
||||
have to be identical to the range you registered with. You can write
|
||||
protect as many ranges as you like (inside the registered range).
|
||||
Then, in the thread reading from uffd the struct will have
|
||||
msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send
|
||||
ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again
|
||||
while pagefault.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set.
|
||||
This wakes up the thread which will continue to run with writes. This
|
||||
allows you to do the bookkeeping about the write in the uffd reading
|
||||
thread before the ioctl.
|
||||
|
||||
If you registered with both UFFDIO_REGISTER_MODE_MISSING and
|
||||
UFFDIO_REGISTER_MODE_WP then you need to think about the sequence in
|
||||
which you supply a page and undo write protect. Note that there is a
|
||||
difference between writes into a WP area and into a !WP area. The
|
||||
former will have UFFD_PAGEFAULT_FLAG_WP set, the latter
|
||||
UFFD_PAGEFAULT_FLAG_WRITE. The latter did not fail on protection but
|
||||
you still need to supply a page when UFFDIO_REGISTER_MODE_MISSING was
|
||||
used.
|
||||
|
||||
QEMU/KVM
|
||||
========
|
||||
|
||||
|
@ -43,7 +43,8 @@ value 1 for supported.
|
||||
|
||||
AXI_ID and AXI_MASKING are mapped on DPCR1 register in performance counter.
|
||||
When non-masked bits are matching corresponding AXI_ID bits then counter is
|
||||
incremented. Perf counter is incremented if
|
||||
incremented. Perf counter is incremented if::
|
||||
|
||||
AxID && AXI_MASKING == AXI_ID && AXI_MASKING
|
||||
|
||||
This filter doesn't support filter different AXI ID for axid-read and axid-write
|
||||
|
274
Documentation/admin-guide/pm/cpufreq_drivers.rst
Normal file
274
Documentation/admin-guide/pm/cpufreq_drivers.rst
Normal file
@ -0,0 +1,274 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=======================================================
|
||||
Legacy Documentation of CPU Performance Scaling Drivers
|
||||
=======================================================
|
||||
|
||||
Included below are historic documents describing assorted
|
||||
:doc:`CPU performance scaling <cpufreq>` drivers. They are reproduced verbatim,
|
||||
with the original white space formatting and indentation preserved, except for
|
||||
the added leading space character in every line of text.
|
||||
|
||||
|
||||
AMD PowerNow! Drivers
|
||||
=====================
|
||||
|
||||
::
|
||||
|
||||
PowerNow! and Cool'n'Quiet are AMD names for frequency
|
||||
management capabilities in AMD processors. As the hardware
|
||||
implementation changes in new generations of the processors,
|
||||
there is a different cpu-freq driver for each generation.
|
||||
|
||||
Note that the driver's will not load on the "wrong" hardware,
|
||||
so it is safe to try each driver in turn when in doubt as to
|
||||
which is the correct driver.
|
||||
|
||||
Note that the functionality to change frequency (and voltage)
|
||||
is not available in all processors. The drivers will refuse
|
||||
to load on processors without this capability. The capability
|
||||
is detected with the cpuid instruction.
|
||||
|
||||
The drivers use BIOS supplied tables to obtain frequency and
|
||||
voltage information appropriate for a particular platform.
|
||||
Frequency transitions will be unavailable if the BIOS does
|
||||
not supply these tables.
|
||||
|
||||
6th Generation: powernow-k6
|
||||
|
||||
7th Generation: powernow-k7: Athlon, Duron, Geode.
|
||||
|
||||
8th Generation: powernow-k8: Athlon, Athlon 64, Opteron, Sempron.
|
||||
Documentation on this functionality in 8th generation processors
|
||||
is available in the "BIOS and Kernel Developer's Guide", publication
|
||||
26094, in chapter 9, available for download from www.amd.com.
|
||||
|
||||
BIOS supplied data, for powernow-k7 and for powernow-k8, may be
|
||||
from either the PSB table or from ACPI objects. The ACPI support
|
||||
is only available if the kernel config sets CONFIG_ACPI_PROCESSOR.
|
||||
The powernow-k8 driver will attempt to use ACPI if so configured,
|
||||
and fall back to PST if that fails.
|
||||
The powernow-k7 driver will try to use the PSB support first, and
|
||||
fall back to ACPI if the PSB support fails. A module parameter,
|
||||
acpi_force, is provided to force ACPI support to be used instead
|
||||
of PSB support.
|
||||
|
||||
|
||||
``cpufreq-nforce2``
|
||||
===================
|
||||
|
||||
::
|
||||
|
||||
The cpufreq-nforce2 driver changes the FSB on nVidia nForce2 platforms.
|
||||
|
||||
This works better than on other platforms, because the FSB of the CPU
|
||||
can be controlled independently from the PCI/AGP clock.
|
||||
|
||||
The module has two options:
|
||||
|
||||
fid: multiplier * 10 (for example 8.5 = 85)
|
||||
min_fsb: minimum FSB
|
||||
|
||||
If not set, fid is calculated from the current CPU speed and the FSB.
|
||||
min_fsb defaults to FSB at boot time - 50 MHz.
|
||||
|
||||
IMPORTANT: The available range is limited downwards!
|
||||
Also the minimum available FSB can differ, for systems
|
||||
booting with 200 MHz, 150 should always work.
|
||||
|
||||
|
||||
``pcc-cpufreq``
|
||||
===============
|
||||
|
||||
::
|
||||
|
||||
/*
|
||||
* pcc-cpufreq.txt - PCC interface documentation
|
||||
*
|
||||
* Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
|
||||
* Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
|
||||
* Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
|
||||
*/
|
||||
|
||||
|
||||
Processor Clocking Control Driver
|
||||
---------------------------------
|
||||
|
||||
Contents:
|
||||
---------
|
||||
1. Introduction
|
||||
1.1 PCC interface
|
||||
1.1.1 Get Average Frequency
|
||||
1.1.2 Set Desired Frequency
|
||||
1.2 Platforms affected
|
||||
2. Driver and /sys details
|
||||
2.1 scaling_available_frequencies
|
||||
2.2 cpuinfo_transition_latency
|
||||
2.3 cpuinfo_cur_freq
|
||||
2.4 related_cpus
|
||||
3. Caveats
|
||||
|
||||
1. Introduction:
|
||||
----------------
|
||||
Processor Clocking Control (PCC) is an interface between the platform
|
||||
firmware and OSPM. It is a mechanism for coordinating processor
|
||||
performance (ie: frequency) between the platform firmware and the OS.
|
||||
|
||||
The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC
|
||||
interface.
|
||||
|
||||
OS utilizes the PCC interface to inform platform firmware what frequency the
|
||||
OS wants for a logical processor. The platform firmware attempts to achieve
|
||||
the requested frequency. If the request for the target frequency could not be
|
||||
satisfied by platform firmware, then it usually means that power budget
|
||||
conditions are in place, and "power capping" is taking place.
|
||||
|
||||
1.1 PCC interface:
|
||||
------------------
|
||||
The complete PCC specification is available here:
|
||||
https://acpica.org/sites/acpica/files/Processor-Clocking-Control-v1p0.pdf
|
||||
|
||||
PCC relies on a shared memory region that provides a channel for communication
|
||||
between the OS and platform firmware. PCC also implements a "doorbell" that
|
||||
is used by the OS to inform the platform firmware that a command has been
|
||||
sent.
|
||||
|
||||
The ACPI PCCH() method is used to discover the location of the PCC shared
|
||||
memory region. The shared memory region header contains the "command" and
|
||||
"status" interface. PCCH() also contains details on how to access the platform
|
||||
doorbell.
|
||||
|
||||
The following commands are supported by the PCC interface:
|
||||
* Get Average Frequency
|
||||
* Set Desired Frequency
|
||||
|
||||
The ACPI PCCP() method is implemented for each logical processor and is
|
||||
used to discover the offsets for the input and output buffers in the shared
|
||||
memory region.
|
||||
|
||||
When PCC mode is enabled, the platform will not expose processor performance
|
||||
or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore,
|
||||
the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for
|
||||
AMD) will not load.
|
||||
|
||||
However, OSPM remains in control of policy. The governor (eg: "ondemand")
|
||||
computes the required performance for each processor based on server workload.
|
||||
The PCC driver fills in the command interface, and the input buffer and
|
||||
communicates the request to the platform firmware. The platform firmware is
|
||||
responsible for delivering the requested performance.
|
||||
|
||||
Each PCC command is "global" in scope and can affect all the logical CPUs in
|
||||
the system. Therefore, PCC is capable of performing "group" updates. With PCC
|
||||
the OS is capable of getting/setting the frequency of all the logical CPUs in
|
||||
the system with a single call to the BIOS.
|
||||
|
||||
1.1.1 Get Average Frequency:
|
||||
----------------------------
|
||||
This command is used by the OSPM to query the running frequency of the
|
||||
processor since the last time this command was completed. The output buffer
|
||||
indicates the average unhalted frequency of the logical processor expressed as
|
||||
a percentage of the nominal (ie: maximum) CPU frequency. The output buffer
|
||||
also signifies if the CPU frequency is limited by a power budget condition.
|
||||
|
||||
1.1.2 Set Desired Frequency:
|
||||
----------------------------
|
||||
This command is used by the OSPM to communicate to the platform firmware the
|
||||
desired frequency for a logical processor. The output buffer is currently
|
||||
ignored by OSPM. The next invocation of "Get Average Frequency" will inform
|
||||
OSPM if the desired frequency was achieved or not.
|
||||
|
||||
1.2 Platforms affected:
|
||||
-----------------------
|
||||
The PCC driver will load on any system where the platform firmware:
|
||||
* supports the PCC interface, and the associated PCCH() and PCCP() methods
|
||||
* assumes responsibility for managing the hardware clocking controls in order
|
||||
to deliver the requested processor performance
|
||||
|
||||
Currently, certain HP ProLiant platforms implement the PCC interface. On those
|
||||
platforms PCC is the "default" choice.
|
||||
|
||||
However, it is possible to disable this interface via a BIOS setting. In
|
||||
such an instance, as is also the case on platforms where the PCC interface
|
||||
is not implemented, the PCC driver will fail to load silently.
|
||||
|
||||
2. Driver and /sys details:
|
||||
---------------------------
|
||||
When the driver loads, it merely prints the lowest and the highest CPU
|
||||
frequencies supported by the platform firmware.
|
||||
|
||||
The PCC driver loads with a message such as:
|
||||
pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933
|
||||
MHz
|
||||
|
||||
This means that the OPSM can request the CPU to run at any frequency in
|
||||
between the limits (1600 MHz, and 2933 MHz) specified in the message.
|
||||
|
||||
Internally, there is no need for the driver to convert the "target" frequency
|
||||
to a corresponding P-state.
|
||||
|
||||
The VERSION number for the driver will be of the format v.xy.ab.
|
||||
eg: 1.00.02
|
||||
----- --
|
||||
| |
|
||||
| -- this will increase with bug fixes/enhancements to the driver
|
||||
|-- this is the version of the PCC specification the driver adheres to
|
||||
|
||||
|
||||
The following is a brief discussion on some of the fields exported via the
|
||||
/sys filesystem and how their values are affected by the PCC driver:
|
||||
|
||||
2.1 scaling_available_frequencies:
|
||||
----------------------------------
|
||||
scaling_available_frequencies is not created in /sys. No intermediate
|
||||
frequencies need to be listed because the BIOS will try to achieve any
|
||||
frequency, within limits, requested by the governor. A frequency does not have
|
||||
to be strictly associated with a P-state.
|
||||
|
||||
2.2 cpuinfo_transition_latency:
|
||||
-------------------------------
|
||||
The cpuinfo_transition_latency field is 0. The PCC specification does
|
||||
not include a field to expose this value currently.
|
||||
|
||||
2.3 cpuinfo_cur_freq:
|
||||
---------------------
|
||||
A) Often cpuinfo_cur_freq will show a value different than what is declared
|
||||
in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq.
|
||||
This is due to "turbo boost" available on recent Intel processors. If certain
|
||||
conditions are met the BIOS can achieve a slightly higher speed than requested
|
||||
by OSPM. An example:
|
||||
|
||||
scaling_cur_freq : 2933000
|
||||
cpuinfo_cur_freq : 3196000
|
||||
|
||||
B) There is a round-off error associated with the cpuinfo_cur_freq value.
|
||||
Since the driver obtains the current frequency as a "percentage" (%) of the
|
||||
nominal frequency from the BIOS, sometimes, the values displayed by
|
||||
scaling_cur_freq and cpuinfo_cur_freq may not match. An example:
|
||||
|
||||
scaling_cur_freq : 1600000
|
||||
cpuinfo_cur_freq : 1583000
|
||||
|
||||
In this example, the nominal frequency is 2933 MHz. The driver obtains the
|
||||
current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency:
|
||||
|
||||
54% of 2933 MHz = 1583 MHz
|
||||
|
||||
Nominal frequency is the maximum frequency of the processor, and it usually
|
||||
corresponds to the frequency of the P0 P-state.
|
||||
|
||||
2.4 related_cpus:
|
||||
-----------------
|
||||
The related_cpus field is identical to affected_cpus.
|
||||
|
||||
affected_cpus : 4
|
||||
related_cpus : 4
|
||||
|
||||
Currently, the PCC driver does not evaluate _PSD. The platforms that support
|
||||
PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination
|
||||
to ensure that the same frequency is requested of all dependent CPUs.
|
||||
|
||||
3. Caveats:
|
||||
-----------
|
||||
The "cpufreq_stats" module in its present form cannot be loaded and
|
||||
expected to work with the PCC driver. Since the "cpufreq_stats" module
|
||||
provides information wrt each P-state, it is not applicable to the PCC driver.
|
@ -583,20 +583,17 @@ Power Management Quality of Service for CPUs
|
||||
The power management quality of service (PM QoS) framework in the Linux kernel
|
||||
allows kernel code and user space processes to set constraints on various
|
||||
energy-efficiency features of the kernel to prevent performance from dropping
|
||||
below a required level. The PM QoS constraints can be set globally, in
|
||||
predefined categories referred to as PM QoS classes, or against individual
|
||||
devices.
|
||||
below a required level.
|
||||
|
||||
CPU idle time management can be affected by PM QoS in two ways, through the
|
||||
global constraint in the ``PM_QOS_CPU_DMA_LATENCY`` class and through the
|
||||
resume latency constraints for individual CPUs. Kernel code (e.g. device
|
||||
drivers) can set both of them with the help of special internal interfaces
|
||||
provided by the PM QoS framework. User space can modify the former by opening
|
||||
the :file:`cpu_dma_latency` special device file under :file:`/dev/` and writing
|
||||
a binary value (interpreted as a signed 32-bit integer) to it. In turn, the
|
||||
resume latency constraint for a CPU can be modified by user space by writing a
|
||||
string (representing a signed 32-bit integer) to the
|
||||
:file:`power/pm_qos_resume_latency_us` file under
|
||||
global CPU latency limit and through the resume latency constraints for
|
||||
individual CPUs. Kernel code (e.g. device drivers) can set both of them with
|
||||
the help of special internal interfaces provided by the PM QoS framework. User
|
||||
space can modify the former by opening the :file:`cpu_dma_latency` special
|
||||
device file under :file:`/dev/` and writing a binary value (interpreted as a
|
||||
signed 32-bit integer) to it. In turn, the resume latency constraint for a CPU
|
||||
can be modified from user space by writing a string (representing a signed
|
||||
32-bit integer) to the :file:`power/pm_qos_resume_latency_us` file under
|
||||
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs``, where the CPU number
|
||||
``<N>`` is allocated at the system initialization time. Negative values
|
||||
will be rejected in both cases and, also in both cases, the written integer
|
||||
@ -605,32 +602,34 @@ number will be interpreted as a requested PM QoS constraint in microseconds.
|
||||
The requested value is not automatically applied as a new constraint, however,
|
||||
as it may be less restrictive (greater in this particular case) than another
|
||||
constraint previously requested by someone else. For this reason, the PM QoS
|
||||
framework maintains a list of requests that have been made so far in each
|
||||
global class and for each device, aggregates them and applies the effective
|
||||
(minimum in this particular case) value as the new constraint.
|
||||
framework maintains a list of requests that have been made so far for the
|
||||
global CPU latency limit and for each individual CPU, aggregates them and
|
||||
applies the effective (minimum in this particular case) value as the new
|
||||
constraint.
|
||||
|
||||
In fact, opening the :file:`cpu_dma_latency` special device file causes a new
|
||||
PM QoS request to be created and added to the priority list of requests in the
|
||||
``PM_QOS_CPU_DMA_LATENCY`` class and the file descriptor coming from the
|
||||
"open" operation represents that request. If that file descriptor is then
|
||||
used for writing, the number written to it will be associated with the PM QoS
|
||||
request represented by it as a new requested constraint value. Next, the
|
||||
priority list mechanism will be used to determine the new effective value of
|
||||
the entire list of requests and that effective value will be set as a new
|
||||
constraint. Thus setting a new requested constraint value will only change the
|
||||
real constraint if the effective "list" value is affected by it. In particular,
|
||||
for the ``PM_QOS_CPU_DMA_LATENCY`` class it only affects the real constraint if
|
||||
it is the minimum of the requested constraints in the list. The process holding
|
||||
a file descriptor obtained by opening the :file:`cpu_dma_latency` special device
|
||||
file controls the PM QoS request associated with that file descriptor, but it
|
||||
controls this particular PM QoS request only.
|
||||
PM QoS request to be created and added to a global priority list of CPU latency
|
||||
limit requests and the file descriptor coming from the "open" operation
|
||||
represents that request. If that file descriptor is then used for writing, the
|
||||
number written to it will be associated with the PM QoS request represented by
|
||||
it as a new requested limit value. Next, the priority list mechanism will be
|
||||
used to determine the new effective value of the entire list of requests and
|
||||
that effective value will be set as a new CPU latency limit. Thus requesting a
|
||||
new limit value will only change the real limit if the effective "list" value is
|
||||
affected by it, which is the case if it is the minimum of the requested values
|
||||
in the list.
|
||||
|
||||
The process holding a file descriptor obtained by opening the
|
||||
:file:`cpu_dma_latency` special device file controls the PM QoS request
|
||||
associated with that file descriptor, but it controls this particular PM QoS
|
||||
request only.
|
||||
|
||||
Closing the :file:`cpu_dma_latency` special device file or, more precisely, the
|
||||
file descriptor obtained while opening it, causes the PM QoS request associated
|
||||
with that file descriptor to be removed from the ``PM_QOS_CPU_DMA_LATENCY``
|
||||
class priority list and destroyed. If that happens, the priority list mechanism
|
||||
will be used, again, to determine the new effective value for the whole list
|
||||
and that value will become the new real constraint.
|
||||
with that file descriptor to be removed from the global priority list of CPU
|
||||
latency limit requests and destroyed. If that happens, the priority list
|
||||
mechanism will be used again, to determine the new effective value for the whole
|
||||
list and that value will become the new limit.
|
||||
|
||||
In turn, for each CPU there is one resume latency PM QoS request associated with
|
||||
the :file:`power/pm_qos_resume_latency_us` file under
|
||||
@ -647,10 +646,10 @@ CPU in question every time the list of requests is updated this way or another
|
||||
(there may be other requests coming from kernel code in that list).
|
||||
|
||||
CPU idle time governors are expected to regard the minimum of the global
|
||||
effective ``PM_QOS_CPU_DMA_LATENCY`` class constraint and the effective
|
||||
resume latency constraint for the given CPU as the upper limit for the exit
|
||||
latency of the idle states they can select for that CPU. They should never
|
||||
select any idle states with exit latency beyond that limit.
|
||||
(effective) CPU latency limit and the effective resume latency constraint for
|
||||
the given CPU as the upper limit for the exit latency of the idle states that
|
||||
they are allowed to select for that CPU. They should never select any idle
|
||||
states with exit latency beyond that limit.
|
||||
|
||||
|
||||
Idle States Control Via Kernel Command Line
|
||||
|
@ -734,10 +734,10 @@ References
|
||||
==========
|
||||
|
||||
.. [1] Kristen Accardi, *Balancing Power and Performance in the Linux Kernel*,
|
||||
http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||
https://events.static.linuxfound.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||
|
||||
.. [2] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3: System Programming Guide*,
|
||||
http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||
https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||
|
||||
.. [3] *Advanced Configuration and Power Interface Specification*,
|
||||
https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf
|
||||
|
270
Documentation/admin-guide/pm/suspend-flows.rst
Normal file
270
Documentation/admin-guide/pm/suspend-flows.rst
Normal file
@ -0,0 +1,270 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
=========================
|
||||
System Suspend Code Flows
|
||||
=========================
|
||||
|
||||
:Copyright: |copy| 2020 Intel Corporation
|
||||
|
||||
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
At least one global system-wide transition needs to be carried out for the
|
||||
system to get from the working state into one of the supported
|
||||
:doc:`sleep states <sleep-states>`. Hibernation requires more than one
|
||||
transition to occur for this purpose, but the other sleep states, commonly
|
||||
referred to as *system-wide suspend* (or simply *system suspend*) states, need
|
||||
only one.
|
||||
|
||||
For those sleep states, the transition from the working state of the system into
|
||||
the target sleep state is referred to as *system suspend* too (in the majority
|
||||
of cases, whether this means a transition or a sleep state of the system should
|
||||
be clear from the context) and the transition back from the sleep state into the
|
||||
working state is referred to as *system resume*.
|
||||
|
||||
The kernel code flows associated with the suspend and resume transitions for
|
||||
different sleep states of the system are quite similar, but there are some
|
||||
significant differences between the :ref:`suspend-to-idle <s2idle>` code flows
|
||||
and the code flows related to the :ref:`suspend-to-RAM <s2ram>` and
|
||||
:ref:`standby <standby>` sleep states.
|
||||
|
||||
The :ref:`suspend-to-RAM <s2ram>` and :ref:`standby <standby>` sleep states
|
||||
cannot be implemented without platform support and the difference between them
|
||||
boils down to the platform-specific actions carried out by the suspend and
|
||||
resume hooks that need to be provided by the platform driver to make them
|
||||
available. Apart from that, the suspend and resume code flows for these sleep
|
||||
states are mostly identical, so they both together will be referred to as
|
||||
*platform-dependent suspend* states in what follows.
|
||||
|
||||
|
||||
.. _s2idle_suspend:
|
||||
|
||||
Suspend-to-idle Suspend Code Flow
|
||||
=================================
|
||||
|
||||
The following steps are taken in order to transition the system from the working
|
||||
state to the :ref:`suspend-to-idle <s2idle>` sleep state:
|
||||
|
||||
1. Invoking system-wide suspend notifiers.
|
||||
|
||||
Kernel subsystems can register callbacks to be invoked when the suspend
|
||||
transition is about to occur and when the resume transition has finished.
|
||||
|
||||
That allows them to prepare for the change of the system state and to clean
|
||||
up after getting back to the working state.
|
||||
|
||||
2. Freezing tasks.
|
||||
|
||||
Tasks are frozen primarily in order to avoid unchecked hardware accesses
|
||||
from user space through MMIO regions or I/O registers exposed directly to
|
||||
it and to prevent user space from entering the kernel while the next step
|
||||
of the transition is in progress (which might have been problematic for
|
||||
various reasons).
|
||||
|
||||
All user space tasks are intercepted as though they were sent a signal and
|
||||
put into uninterruptible sleep until the end of the subsequent system resume
|
||||
transition.
|
||||
|
||||
The kernel threads that choose to be frozen during system suspend for
|
||||
specific reasons are frozen subsequently, but they are not intercepted.
|
||||
Instead, they are expected to periodically check whether or not they need
|
||||
to be frozen and to put themselves into uninterruptible sleep if so. [Note,
|
||||
however, that kernel threads can use locking and other concurrency controls
|
||||
available in kernel space to synchronize themselves with system suspend and
|
||||
resume, which can be much more precise than the freezing, so the latter is
|
||||
not a recommended option for kernel threads.]
|
||||
|
||||
3. Suspending devices and reconfiguring IRQs.
|
||||
|
||||
Devices are suspended in four phases called *prepare*, *suspend*,
|
||||
*late suspend* and *noirq suspend* (see :ref:`driverapi_pm_devices` for more
|
||||
information on what exactly happens in each phase).
|
||||
|
||||
Every device is visited in each phase, but typically it is not physically
|
||||
accessed in more than two of them.
|
||||
|
||||
The runtime PM API is disabled for every device during the *late* suspend
|
||||
phase and high-level ("action") interrupt handlers are prevented from being
|
||||
invoked before the *noirq* suspend phase.
|
||||
|
||||
Interrupts are still handled after that, but they are only acknowledged to
|
||||
interrupt controllers without performing any device-specific actions that
|
||||
would be triggered in the working state of the system (those actions are
|
||||
deferred till the subsequent system resume transition as described
|
||||
`below <s2idle_resume_>`_).
|
||||
|
||||
IRQs associated with system wakeup devices are "armed" so that the resume
|
||||
transition of the system is started when one of them signals an event.
|
||||
|
||||
4. Freezing the scheduler tick and suspending timekeeping.
|
||||
|
||||
When all devices have been suspended, CPUs enter the idle loop and are put
|
||||
into the deepest available idle state. While doing that, each of them
|
||||
"freezes" its own scheduler tick so that the timer events associated with
|
||||
the tick do not occur until the CPU is woken up by another interrupt source.
|
||||
|
||||
The last CPU to enter the idle state also stops the timekeeping which
|
||||
(among other things) prevents high resolution timers from triggering going
|
||||
forward until the first CPU that is woken up restarts the timekeeping.
|
||||
That allows the CPUs to stay in the deep idle state relatively long in one
|
||||
go.
|
||||
|
||||
From this point on, the CPUs can only be woken up by non-timer hardware
|
||||
interrupts. If that happens, they go back to the idle state unless the
|
||||
interrupt that woke up one of them comes from an IRQ that has been armed for
|
||||
system wakeup, in which case the system resume transition is started.
|
||||
|
||||
|
||||
.. _s2idle_resume:
|
||||
|
||||
Suspend-to-idle Resume Code Flow
|
||||
================================
|
||||
|
||||
The following steps are taken in order to transition the system from the
|
||||
:ref:`suspend-to-idle <s2idle>` sleep state into the working state:
|
||||
|
||||
1. Resuming timekeeping and unfreezing the scheduler tick.
|
||||
|
||||
When one of the CPUs is woken up (by a non-timer hardware interrupt), it
|
||||
leaves the idle state entered in the last step of the preceding suspend
|
||||
transition, restarts the timekeeping (unless it has been restarted already
|
||||
by another CPU that woke up earlier) and the scheduler tick on that CPU is
|
||||
unfrozen.
|
||||
|
||||
If the interrupt that has woken up the CPU was armed for system wakeup,
|
||||
the system resume transition begins.
|
||||
|
||||
2. Resuming devices and restoring the working-state configuration of IRQs.
|
||||
|
||||
Devices are resumed in four phases called *noirq resume*, *early resume*,
|
||||
*resume* and *complete* (see :ref:`driverapi_pm_devices` for more
|
||||
information on what exactly happens in each phase).
|
||||
|
||||
Every device is visited in each phase, but typically it is not physically
|
||||
accessed in more than two of them.
|
||||
|
||||
The working-state configuration of IRQs is restored after the *noirq* resume
|
||||
phase and the runtime PM API is re-enabled for every device whose driver
|
||||
supports it during the *early* resume phase.
|
||||
|
||||
3. Thawing tasks.
|
||||
|
||||
Tasks frozen in step 2 of the preceding `suspend <s2idle_suspend_>`_
|
||||
transition are "thawed", which means that they are woken up from the
|
||||
uninterruptible sleep that they went into at that time and user space tasks
|
||||
are allowed to exit the kernel.
|
||||
|
||||
4. Invoking system-wide resume notifiers.
|
||||
|
||||
This is analogous to step 1 of the `suspend <s2idle_suspend_>`_ transition
|
||||
and the same set of callbacks is invoked at this point, but a different
|
||||
"notification type" parameter value is passed to them.
|
||||
|
||||
|
||||
Platform-dependent Suspend Code Flow
|
||||
====================================
|
||||
|
||||
The following steps are taken in order to transition the system from the working
|
||||
state to platform-dependent suspend state:
|
||||
|
||||
1. Invoking system-wide suspend notifiers.
|
||||
|
||||
This step is the same as step 1 of the suspend-to-idle suspend transition
|
||||
described `above <s2idle_suspend_>`_.
|
||||
|
||||
2. Freezing tasks.
|
||||
|
||||
This step is the same as step 2 of the suspend-to-idle suspend transition
|
||||
described `above <s2idle_suspend_>`_.
|
||||
|
||||
3. Suspending devices and reconfiguring IRQs.
|
||||
|
||||
This step is analogous to step 3 of the suspend-to-idle suspend transition
|
||||
described `above <s2idle_suspend_>`_, but the arming of IRQs for system
|
||||
wakeup generally does not have any effect on the platform.
|
||||
|
||||
There are platforms that can go into a very deep low-power state internally
|
||||
when all CPUs in them are in sufficiently deep idle states and all I/O
|
||||
devices have been put into low-power states. On those platforms,
|
||||
suspend-to-idle can reduce system power very effectively.
|
||||
|
||||
On the other platforms, however, low-level components (like interrupt
|
||||
controllers) need to be turned off in a platform-specific way (implemented
|
||||
in the hooks provided by the platform driver) to achieve comparable power
|
||||
reduction.
|
||||
|
||||
That usually prevents in-band hardware interrupts from waking up the system,
|
||||
which must be done in a special platform-dependent way. Then, the
|
||||
configuration of system wakeup sources usually starts when system wakeup
|
||||
devices are suspended and is finalized by the platform suspend hooks later
|
||||
on.
|
||||
|
||||
4. Disabling non-boot CPUs.
|
||||
|
||||
On some platforms the suspend hooks mentioned above must run in a one-CPU
|
||||
configuration of the system (in particular, the hardware cannot be accessed
|
||||
by any code running in parallel with the platform suspend hooks that may,
|
||||
and often do, trap into the platform firmware in order to finalize the
|
||||
suspend transition).
|
||||
|
||||
For this reason, the CPU offline/online (CPU hotplug) framework is used
|
||||
to take all of the CPUs in the system, except for one (the boot CPU),
|
||||
offline (typically, the CPUs that have been taken offline go into deep idle
|
||||
states).
|
||||
|
||||
This means that all tasks are migrated away from those CPUs and all IRQs are
|
||||
rerouted to the only CPU that remains online.
|
||||
|
||||
5. Suspending core system components.
|
||||
|
||||
This prepares the core system components for (possibly) losing power going
|
||||
forward and suspends the timekeeping.
|
||||
|
||||
6. Platform-specific power removal.
|
||||
|
||||
This is expected to remove power from all of the system components except
|
||||
for the memory controller and RAM (in order to preserve the contents of the
|
||||
latter) and some devices designated for system wakeup.
|
||||
|
||||
In many cases control is passed to the platform firmware which is expected
|
||||
to finalize the suspend transition as needed.
|
||||
|
||||
|
||||
Platform-dependent Resume Code Flow
|
||||
===================================
|
||||
|
||||
The following steps are taken in order to transition the system from a
|
||||
platform-dependent suspend state into the working state:
|
||||
|
||||
1. Platform-specific system wakeup.
|
||||
|
||||
The platform is woken up by a signal from one of the designated system
|
||||
wakeup devices (which need not be an in-band hardware interrupt) and
|
||||
control is passed back to the kernel (the working configuration of the
|
||||
platform may need to be restored by the platform firmware before the
|
||||
kernel gets control again).
|
||||
|
||||
2. Resuming core system components.
|
||||
|
||||
The suspend-time configuration of the core system components is restored and
|
||||
the timekeeping is resumed.
|
||||
|
||||
3. Re-enabling non-boot CPUs.
|
||||
|
||||
The CPUs disabled in step 4 of the preceding suspend transition are taken
|
||||
back online and their suspend-time configuration is restored.
|
||||
|
||||
4. Resuming devices and restoring the working-state configuration of IRQs.
|
||||
|
||||
This step is the same as step 2 of the suspend-to-idle suspend transition
|
||||
described `above <s2idle_resume_>`_.
|
||||
|
||||
5. Thawing tasks.
|
||||
|
||||
This step is the same as step 3 of the suspend-to-idle suspend transition
|
||||
described `above <s2idle_resume_>`_.
|
||||
|
||||
6. Invoking system-wide resume notifiers.
|
||||
|
||||
This step is the same as step 4 of the suspend-to-idle suspend transition
|
||||
described `above <s2idle_resume_>`_.
|
@ -8,3 +8,4 @@ System-Wide Power Management
|
||||
:maxdepth: 2
|
||||
|
||||
sleep-states
|
||||
suspend-flows
|
||||
|
@ -11,4 +11,5 @@ Working-State Power Management
|
||||
intel_idle
|
||||
cpufreq
|
||||
intel_pstate
|
||||
cpufreq_drivers
|
||||
intel_epb
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -67,7 +67,8 @@ two flavors of JITs, the newer eBPF JIT currently supported on:
|
||||
- sparc64
|
||||
- mips64
|
||||
- s390x
|
||||
- riscv
|
||||
- riscv64
|
||||
- riscv32
|
||||
|
||||
And the older cBPF JIT supported on the following archs:
|
||||
|
||||
|
@ -65,6 +65,12 @@ max_pid_namespaces
|
||||
The maximum number of pid namespaces that any user in the current
|
||||
user namespace may create.
|
||||
|
||||
max_time_namespaces
|
||||
===================
|
||||
|
||||
The maximum number of time namespaces that any user in the current
|
||||
user namespace may create.
|
||||
|
||||
max_user_namespaces
|
||||
===================
|
||||
|
||||
|
@ -128,6 +128,9 @@ allowed to examine the unevictable lru (mlocked pages) for pages to compact.
|
||||
This should be used on systems where stalls for minor page faults are an
|
||||
acceptable trade for large contiguous free memory. Set to 0 to prevent
|
||||
compaction from moving pages that are unevictable. Default value is 1.
|
||||
On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due
|
||||
to compaction, which would block the task from becomming active until the fault
|
||||
is resolved.
|
||||
|
||||
|
||||
dirty_background_bytes
|
||||
|
@ -48,9 +48,10 @@ always allowed (by a user with admin privileges).
|
||||
How do I use the magic SysRq key?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
On x86 - You press the key combo :kbd:`ALT-SysRq-<command key>`.
|
||||
On x86
|
||||
You press the key combo :kbd:`ALT-SysRq-<command key>`.
|
||||
|
||||
.. note::
|
||||
.. note::
|
||||
Some
|
||||
keyboards may not have a key labeled 'SysRq'. The 'SysRq' key is
|
||||
also known as the 'Print Screen' key. Also some keyboards cannot
|
||||
@ -58,14 +59,15 @@ On x86 - You press the key combo :kbd:`ALT-SysRq-<command key>`.
|
||||
have better luck with press :kbd:`Alt`, press :kbd:`SysRq`,
|
||||
release :kbd:`SysRq`, press :kbd:`<command key>`, release everything.
|
||||
|
||||
On SPARC - You press :kbd:`ALT-STOP-<command key>`, I believe.
|
||||
On SPARC
|
||||
You press :kbd:`ALT-STOP-<command key>`, I believe.
|
||||
|
||||
On the serial console (PC style standard serial ports only)
|
||||
You send a ``BREAK``, then within 5 seconds a command key. Sending
|
||||
``BREAK`` twice is interpreted as a normal BREAK.
|
||||
|
||||
On PowerPC
|
||||
Press :kbd:`ALT - Print Screen` (or :kbd:`F13`) - :kbd:`<command key>`,
|
||||
Press :kbd:`ALT - Print Screen` (or :kbd:`F13`) - :kbd:`<command key>`.
|
||||
:kbd:`Print Screen` (or :kbd:`F13`) - :kbd:`<command key>` may suffice.
|
||||
|
||||
On other
|
||||
@ -73,7 +75,7 @@ On other
|
||||
let me know so I can add them to this section.
|
||||
|
||||
On all
|
||||
write a character to /proc/sysrq-trigger. e.g.::
|
||||
Write a character to /proc/sysrq-trigger. e.g.::
|
||||
|
||||
echo t > /proc/sysrq-trigger
|
||||
|
||||
@ -282,7 +284,7 @@ Just ask them on the linux-kernel mailing list:
|
||||
Credits
|
||||
~~~~~~~
|
||||
|
||||
Written by Mydraal <vulpyne@vulpyne.net>
|
||||
Updated by Adam Sulmicki <adam@cfar.umd.edu>
|
||||
Updated by Jeremy M. Dolan <jmd@turbogeek.org> 2001/01/28 10:15:59
|
||||
Added to by Crutcher Dunnavant <crutcher+kernel@datastacks.com>
|
||||
- Written by Mydraal <vulpyne@vulpyne.net>
|
||||
- Updated by Adam Sulmicki <adam@cfar.umd.edu>
|
||||
- Updated by Jeremy M. Dolan <jmd@turbogeek.org> 2001/01/28 10:15:59
|
||||
- Added to by Crutcher Dunnavant <crutcher+kernel@datastacks.com>
|
||||
|
@ -4,18 +4,18 @@ ARM TCM (Tightly-Coupled Memory) handling in Linux
|
||||
|
||||
Written by Linus Walleij <linus.walleij@stericsson.com>
|
||||
|
||||
Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory).
|
||||
Some ARM SoCs have a so-called TCM (Tightly-Coupled Memory).
|
||||
This is usually just a few (4-64) KiB of RAM inside the ARM
|
||||
processor.
|
||||
|
||||
Due to being embedded inside the CPU The TCM has a
|
||||
Due to being embedded inside the CPU, the TCM has a
|
||||
Harvard-architecture, so there is an ITCM (instruction TCM)
|
||||
and a DTCM (data TCM). The DTCM can not contain any
|
||||
instructions, but the ITCM can actually contain data.
|
||||
The size of DTCM or ITCM is minimum 4KiB so the typical
|
||||
minimum configuration is 4KiB ITCM and 4KiB DTCM.
|
||||
|
||||
ARM CPU:s have special registers to read out status, physical
|
||||
ARM CPUs have special registers to read out status, physical
|
||||
location and size of TCM memories. arch/arm/include/asm/cputype.h
|
||||
defines a CPUID_TCM register that you can read out from the
|
||||
system control coprocessor. Documentation from ARM can be found
|
||||
|
112
Documentation/arm64/amu.rst
Normal file
112
Documentation/arm64/amu.rst
Normal file
@ -0,0 +1,112 @@
|
||||
=======================================================
|
||||
Activity Monitors Unit (AMU) extension in AArch64 Linux
|
||||
=======================================================
|
||||
|
||||
Author: Ionela Voinescu <ionela.voinescu@arm.com>
|
||||
|
||||
Date: 2019-09-10
|
||||
|
||||
This document briefly describes the provision of Activity Monitors Unit
|
||||
support in AArch64 Linux.
|
||||
|
||||
|
||||
Architecture overview
|
||||
---------------------
|
||||
|
||||
The activity monitors extension is an optional extension introduced by the
|
||||
ARMv8.4 CPU architecture.
|
||||
|
||||
The activity monitors unit, implemented in each CPU, provides performance
|
||||
counters intended for system management use. The AMU extension provides a
|
||||
system register interface to the counter registers and also supports an
|
||||
optional external memory-mapped interface.
|
||||
|
||||
Version 1 of the Activity Monitors architecture implements a counter group
|
||||
of four fixed and architecturally defined 64-bit event counters.
|
||||
- CPU cycle counter: increments at the frequency of the CPU.
|
||||
- Constant counter: increments at the fixed frequency of the system
|
||||
clock.
|
||||
- Instructions retired: increments with every architecturally executed
|
||||
instruction.
|
||||
- Memory stall cycles: counts instruction dispatch stall cycles caused by
|
||||
misses in the last level cache within the clock domain.
|
||||
|
||||
When in WFI or WFE these counters do not increment.
|
||||
|
||||
The Activity Monitors architecture provides space for up to 16 architected
|
||||
event counters. Future versions of the architecture may use this space to
|
||||
implement additional architected event counters.
|
||||
|
||||
Additionally, version 1 implements a counter group of up to 16 auxiliary
|
||||
64-bit event counters.
|
||||
|
||||
On cold reset all counters reset to 0.
|
||||
|
||||
|
||||
Basic support
|
||||
-------------
|
||||
|
||||
The kernel can safely run a mix of CPUs with and without support for the
|
||||
activity monitors extension. Therefore, when CONFIG_ARM64_AMU_EXTN is
|
||||
selected we unconditionally enable the capability to allow any late CPU
|
||||
(secondary or hotplugged) to detect and use the feature.
|
||||
|
||||
When the feature is detected on a CPU, we flag the availability of the
|
||||
feature but this does not guarantee the correct functionality of the
|
||||
counters, only the presence of the extension.
|
||||
|
||||
Firmware (code running at higher exception levels, e.g. arm-tf) support is
|
||||
needed to:
|
||||
- Enable access for lower exception levels (EL2 and EL1) to the AMU
|
||||
registers.
|
||||
- Enable the counters. If not enabled these will read as 0.
|
||||
- Save/restore the counters before/after the CPU is being put/brought up
|
||||
from the 'off' power state.
|
||||
|
||||
When using kernels that have this feature enabled but boot with broken
|
||||
firmware the user may experience panics or lockups when accessing the
|
||||
counter registers. Even if these symptoms are not observed, the values
|
||||
returned by the register reads might not correctly reflect reality. Most
|
||||
commonly, the counters will read as 0, indicating that they are not
|
||||
enabled.
|
||||
|
||||
If proper support is not provided in firmware it's best to disable
|
||||
CONFIG_ARM64_AMU_EXTN. To be noted that for security reasons, this does not
|
||||
bypass the setting of AMUSERENR_EL0 to trap accesses from EL0 (userspace) to
|
||||
EL1 (kernel). Therefore, firmware should still ensure accesses to AMU registers
|
||||
are not trapped in EL2/EL3.
|
||||
|
||||
The fixed counters of AMUv1 are accessible though the following system
|
||||
register definitions:
|
||||
- SYS_AMEVCNTR0_CORE_EL0
|
||||
- SYS_AMEVCNTR0_CONST_EL0
|
||||
- SYS_AMEVCNTR0_INST_RET_EL0
|
||||
- SYS_AMEVCNTR0_MEM_STALL_EL0
|
||||
|
||||
Auxiliary platform specific counters can be accessed using
|
||||
SYS_AMEVCNTR1_EL0(n), where n is a value between 0 and 15.
|
||||
|
||||
Details can be found in: arch/arm64/include/asm/sysreg.h.
|
||||
|
||||
|
||||
Userspace access
|
||||
----------------
|
||||
|
||||
Currently, access from userspace to the AMU registers is disabled due to:
|
||||
- Security reasons: they might expose information about code executed in
|
||||
secure mode.
|
||||
- Purpose: AMU counters are intended for system management use.
|
||||
|
||||
Also, the presence of the feature is not visible to userspace.
|
||||
|
||||
|
||||
Virtualization
|
||||
--------------
|
||||
|
||||
Currently, access from userspace (EL0) and kernelspace (EL1) on the KVM
|
||||
guest side is disabled due to:
|
||||
- Security reasons: they might expose information about code executed
|
||||
by other guests or the host.
|
||||
|
||||
Any attempt to access the AMU registers will result in an UNDEFINED
|
||||
exception being injected into the guest.
|
@ -248,6 +248,20 @@ Before jumping into the kernel, the following conditions must be met:
|
||||
- HCR_EL2.APK (bit 40) must be initialised to 0b1
|
||||
- HCR_EL2.API (bit 41) must be initialised to 0b1
|
||||
|
||||
For CPUs with Activity Monitors Unit v1 (AMUv1) extension present:
|
||||
- If EL3 is present:
|
||||
CPTR_EL3.TAM (bit 30) must be initialised to 0b0
|
||||
CPTR_EL2.TAM (bit 30) must be initialised to 0b0
|
||||
AMCNTENSET0_EL0 must be initialised to 0b1111
|
||||
AMCNTENSET1_EL0 must be initialised to a platform specific value
|
||||
having 0b1 set for the corresponding bit for each of the auxiliary
|
||||
counters present.
|
||||
- If the kernel is entered at EL1:
|
||||
AMCNTENSET0_EL0 must be initialised to 0b1111
|
||||
AMCNTENSET1_EL0 must be initialised to a platform specific value
|
||||
having 0b1 set for the corresponding bit for each of the auxiliary
|
||||
counters present.
|
||||
|
||||
The requirements described above for CPU mode, caches, MMUs, architected
|
||||
timers, coherency and system registers apply to all CPUs. All CPUs must
|
||||
enter the kernel in the same exception level.
|
||||
|
@ -6,6 +6,7 @@ ARM64 Architecture
|
||||
:maxdepth: 1
|
||||
|
||||
acpi_object_usage
|
||||
amu
|
||||
arm-acpi
|
||||
booting
|
||||
cpu-feature-registers
|
||||
|
@ -110,6 +110,8 @@ stable kernels.
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| Cavium | ThunderX GICv3 | #38539 | N/A |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 |
|
||||
|
@ -2,17 +2,9 @@
|
||||
Generic Block Device Capability
|
||||
===============================
|
||||
|
||||
This file documents the sysfs file block/<disk>/capability
|
||||
This file documents the sysfs file ``block/<disk>/capability``.
|
||||
|
||||
capability is a hex word indicating which capabilities a specific disk
|
||||
supports. For more information on bits not listed here, see
|
||||
include/linux/genhd.h
|
||||
``capability`` is a bitfield, printed in hexadecimal, indicating which
|
||||
capabilities a specific block device supports:
|
||||
|
||||
GENHD_FL_MEDIA_CHANGE_NOTIFY
|
||||
----------------------------
|
||||
|
||||
Value: 4
|
||||
|
||||
When this bit is set, the disk supports Asynchronous Notification
|
||||
of media change events. These events will be broadcast to user
|
||||
space via kernel uevent.
|
||||
.. kernel-doc:: include/linux/genhd.h
|
||||
|
@ -20,11 +20,11 @@ Reporting bugs
|
||||
Q: How do I report bugs for BPF kernel code?
|
||||
--------------------------------------------
|
||||
A: Since all BPF kernel development as well as bpftool and iproute2 BPF
|
||||
loader development happens through the netdev kernel mailing list,
|
||||
loader development happens through the bpf kernel mailing list,
|
||||
please report any found issues around BPF to the following mailing
|
||||
list:
|
||||
|
||||
netdev@vger.kernel.org
|
||||
bpf@vger.kernel.org
|
||||
|
||||
This may also include issues related to XDP, BPF tracing, etc.
|
||||
|
||||
@ -46,17 +46,12 @@ Submitting patches
|
||||
|
||||
Q: To which mailing list do I need to submit my BPF patches?
|
||||
------------------------------------------------------------
|
||||
A: Please submit your BPF patches to the netdev kernel mailing list:
|
||||
A: Please submit your BPF patches to the bpf kernel mailing list:
|
||||
|
||||
netdev@vger.kernel.org
|
||||
|
||||
Historically, BPF came out of networking and has always been maintained
|
||||
by the kernel networking community. Although these days BPF touches
|
||||
many other subsystems as well, the patches are still routed mainly
|
||||
through the networking community.
|
||||
bpf@vger.kernel.org
|
||||
|
||||
In case your patch has changes in various different subsystems (e.g.
|
||||
tracing, security, etc), make sure to Cc the related kernel mailing
|
||||
networking, tracing, security, etc), make sure to Cc the related kernel mailing
|
||||
lists and maintainers from there as well, so they are able to review
|
||||
the changes and provide their Acked-by's to the patches.
|
||||
|
||||
@ -168,7 +163,7 @@ a BPF point of view.
|
||||
Be aware that this is not a final verdict that the patch will
|
||||
automatically get accepted into net or net-next trees eventually:
|
||||
|
||||
On the netdev kernel mailing list reviews can come in at any point
|
||||
On the bpf kernel mailing list reviews can come in at any point
|
||||
in time. If discussions around a patch conclude that they cannot
|
||||
get included as-is, we will either apply a follow-up fix or drop
|
||||
them from the trees entirely. Therefore, we also reserve to rebase
|
||||
@ -494,15 +489,15 @@ A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
|
||||
that set up, proceed with building the latest LLVM and clang version
|
||||
from the git repositories::
|
||||
|
||||
$ git clone http://llvm.org/git/llvm.git
|
||||
$ cd llvm/tools
|
||||
$ git clone --depth 1 http://llvm.org/git/clang.git
|
||||
$ cd ..; mkdir build; cd build
|
||||
$ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
|
||||
$ git clone https://github.com/llvm/llvm-project.git
|
||||
$ mkdir -p llvm-project/llvm/build/install
|
||||
$ cd llvm-project/llvm/build
|
||||
$ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
|
||||
-DLLVM_ENABLE_PROJECTS="clang" \
|
||||
-DBUILD_SHARED_LIBS=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DLLVM_BUILD_RUNTIME=OFF
|
||||
$ make -j $(getconf _NPROCESSORS_ONLN)
|
||||
$ ninja
|
||||
|
||||
The built binaries can then be found in the build/bin/ directory, where
|
||||
you can point the PATH variable to.
|
||||
|
142
Documentation/bpf/bpf_lsm.rst
Normal file
142
Documentation/bpf/bpf_lsm.rst
Normal file
@ -0,0 +1,142 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0+
|
||||
.. Copyright (C) 2020 Google LLC.
|
||||
|
||||
================
|
||||
LSM BPF Programs
|
||||
================
|
||||
|
||||
These BPF programs allow runtime instrumentation of the LSM hooks by privileged
|
||||
users to implement system-wide MAC (Mandatory Access Control) and Audit
|
||||
policies using eBPF.
|
||||
|
||||
Structure
|
||||
---------
|
||||
|
||||
The example shows an eBPF program that can be attached to the ``file_mprotect``
|
||||
LSM hook:
|
||||
|
||||
.. c:function:: int file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot);
|
||||
|
||||
Other LSM hooks which can be instrumented can be found in
|
||||
``include/linux/lsm_hooks.h``.
|
||||
|
||||
eBPF programs that use :doc:`/bpf/btf` do not need to include kernel headers
|
||||
for accessing information from the attached eBPF program's context. They can
|
||||
simply declare the structures in the eBPF program and only specify the fields
|
||||
that need to be accessed.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct mm_struct {
|
||||
unsigned long start_brk, brk, start_stack;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
struct vm_area_struct {
|
||||
unsigned long start_brk, brk, start_stack;
|
||||
unsigned long vm_start, vm_end;
|
||||
struct mm_struct *vm_mm;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
|
||||
.. note:: The order of the fields is irrelevant.
|
||||
|
||||
This can be further simplified (if one has access to the BTF information at
|
||||
build time) by generating the ``vmlinux.h`` with:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# bpftool btf dump file <path-to-btf-vmlinux> format c > vmlinux.h
|
||||
|
||||
.. note:: ``path-to-btf-vmlinux`` can be ``/sys/kernel/btf/vmlinux`` if the
|
||||
build environment matches the environment the BPF programs are
|
||||
deployed in.
|
||||
|
||||
The ``vmlinux.h`` can then simply be included in the BPF programs without
|
||||
requiring the definition of the types.
|
||||
|
||||
The eBPF programs can be declared using the``BPF_PROG``
|
||||
macros defined in `tools/lib/bpf/bpf_tracing.h`_. In this
|
||||
example:
|
||||
|
||||
* ``"lsm/file_mprotect"`` indicates the LSM hook that the program must
|
||||
be attached to
|
||||
* ``mprotect_audit`` is the name of the eBPF program
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
SEC("lsm/file_mprotect")
|
||||
int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
|
||||
unsigned long reqprot, unsigned long prot, int ret)
|
||||
{
|
||||
/* ret is the return value from the previous BPF program
|
||||
* or 0 if it's the first hook.
|
||||
*/
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
int is_heap;
|
||||
|
||||
is_heap = (vma->vm_start >= vma->vm_mm->start_brk &&
|
||||
vma->vm_end <= vma->vm_mm->brk);
|
||||
|
||||
/* Return an -EPERM or write information to the perf events buffer
|
||||
* for auditing
|
||||
*/
|
||||
if (is_heap)
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
The ``__attribute__((preserve_access_index))`` is a clang feature that allows
|
||||
the BPF verifier to update the offsets for the access at runtime using the
|
||||
:doc:`/bpf/btf` information. Since the BPF verifier is aware of the types, it
|
||||
also validates all the accesses made to the various types in the eBPF program.
|
||||
|
||||
Loading
|
||||
-------
|
||||
|
||||
eBPF programs can be loaded with the :manpage:`bpf(2)` syscall's
|
||||
``BPF_PROG_LOAD`` operation:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct bpf_object *obj;
|
||||
|
||||
obj = bpf_object__open("./my_prog.o");
|
||||
bpf_object__load(obj);
|
||||
|
||||
This can be simplified by using a skeleton header generated by ``bpftool``:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# bpftool gen skeleton my_prog.o > my_prog.skel.h
|
||||
|
||||
and the program can be loaded by including ``my_prog.skel.h`` and using
|
||||
the generated helper, ``my_prog__open_and_load``.
|
||||
|
||||
Attachment to LSM Hooks
|
||||
-----------------------
|
||||
|
||||
The LSM allows attachment of eBPF programs as LSM hooks using :manpage:`bpf(2)`
|
||||
syscall's ``BPF_RAW_TRACEPOINT_OPEN`` operation or more simply by
|
||||
using the libbpf helper ``bpf_program__attach_lsm``.
|
||||
|
||||
The program can be detached from the LSM hook by *destroying* the ``link``
|
||||
link returned by ``bpf_program__attach_lsm`` using ``bpf_link__destroy``.
|
||||
|
||||
One can also use the helpers generated in ``my_prog.skel.h`` i.e.
|
||||
``my_prog__attach`` for attachment and ``my_prog__destroy`` for cleaning up.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
An example eBPF program can be found in
|
||||
`tools/testing/selftests/bpf/progs/lsm.c`_ and the corresponding
|
||||
userspace code in `tools/testing/selftests/bpf/prog_tests/test_lsm.c`_
|
||||
|
||||
.. Links
|
||||
.. _tools/lib/bpf/bpf_tracing.h:
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h
|
||||
.. _tools/testing/selftests/bpf/progs/lsm.c:
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/lsm.c
|
||||
.. _tools/testing/selftests/bpf/prog_tests/test_lsm.c:
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/prog_tests/test_lsm.c
|
213
Documentation/bpf/drgn.rst
Normal file
213
Documentation/bpf/drgn.rst
Normal file
@ -0,0 +1,213 @@
|
||||
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||
|
||||
==============
|
||||
BPF drgn tools
|
||||
==============
|
||||
|
||||
drgn scripts is a convenient and easy to use mechanism to retrieve arbitrary
|
||||
kernel data structures. drgn is not relying on kernel UAPI to read the data.
|
||||
Instead it's reading directly from ``/proc/kcore`` or vmcore and pretty prints
|
||||
the data based on DWARF debug information from vmlinux.
|
||||
|
||||
This document describes BPF related drgn tools.
|
||||
|
||||
See `drgn/tools`_ for all tools available at the moment and `drgn/doc`_ for
|
||||
more details on drgn itself.
|
||||
|
||||
bpf_inspect.py
|
||||
--------------
|
||||
|
||||
Description
|
||||
===========
|
||||
|
||||
`bpf_inspect.py`_ is a tool intended to inspect BPF programs and maps. It can
|
||||
iterate over all programs and maps in the system and print basic information
|
||||
about these objects, including id, type and name.
|
||||
|
||||
The main use-case `bpf_inspect.py`_ covers is to show BPF programs of types
|
||||
``BPF_PROG_TYPE_EXT`` and ``BPF_PROG_TYPE_TRACING`` attached to other BPF
|
||||
programs via ``freplace``/``fentry``/``fexit`` mechanisms, since there is no
|
||||
user-space API to get this information.
|
||||
|
||||
Getting started
|
||||
===============
|
||||
|
||||
List BPF programs (full names are obtained from BTF)::
|
||||
|
||||
% sudo bpf_inspect.py prog
|
||||
27: BPF_PROG_TYPE_TRACEPOINT tracepoint__tcp__tcp_send_reset
|
||||
4632: BPF_PROG_TYPE_CGROUP_SOCK_ADDR tw_ipt_bind
|
||||
49464: BPF_PROG_TYPE_RAW_TRACEPOINT raw_tracepoint__sched_process_exit
|
||||
|
||||
List BPF maps::
|
||||
|
||||
% sudo bpf_inspect.py map
|
||||
2577: BPF_MAP_TYPE_HASH tw_ipt_vips
|
||||
4050: BPF_MAP_TYPE_STACK_TRACE stack_traces
|
||||
4069: BPF_MAP_TYPE_PERCPU_ARRAY ned_dctcp_cntr
|
||||
|
||||
Find BPF programs attached to BPF program ``test_pkt_access``::
|
||||
|
||||
% sudo bpf_inspect.py p | grep test_pkt_access
|
||||
650: BPF_PROG_TYPE_SCHED_CLS test_pkt_access
|
||||
654: BPF_PROG_TYPE_TRACING test_main linked:[650->25: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access()]
|
||||
655: BPF_PROG_TYPE_TRACING test_subprog1 linked:[650->29: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog1()]
|
||||
656: BPF_PROG_TYPE_TRACING test_subprog2 linked:[650->31: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog2()]
|
||||
657: BPF_PROG_TYPE_TRACING test_subprog3 linked:[650->21: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog3()]
|
||||
658: BPF_PROG_TYPE_EXT new_get_skb_len linked:[650->16: BPF_TRAMP_REPLACE test_pkt_access->get_skb_len()]
|
||||
659: BPF_PROG_TYPE_EXT new_get_skb_ifindex linked:[650->23: BPF_TRAMP_REPLACE test_pkt_access->get_skb_ifindex()]
|
||||
660: BPF_PROG_TYPE_EXT new_get_constant linked:[650->19: BPF_TRAMP_REPLACE test_pkt_access->get_constant()]
|
||||
|
||||
It can be seen that there is a program ``test_pkt_access``, id 650 and there
|
||||
are multiple other tracing and ext programs attached to functions in
|
||||
``test_pkt_access``.
|
||||
|
||||
For example the line::
|
||||
|
||||
658: BPF_PROG_TYPE_EXT new_get_skb_len linked:[650->16: BPF_TRAMP_REPLACE test_pkt_access->get_skb_len()]
|
||||
|
||||
, means that BPF program id 658, type ``BPF_PROG_TYPE_EXT``, name
|
||||
``new_get_skb_len`` replaces (``BPF_TRAMP_REPLACE``) function ``get_skb_len()``
|
||||
that has BTF id 16 in BPF program id 650, name ``test_pkt_access``.
|
||||
|
||||
Getting help:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
% sudo bpf_inspect.py
|
||||
usage: bpf_inspect.py [-h] {prog,p,map,m} ...
|
||||
|
||||
drgn script to list BPF programs or maps and their properties
|
||||
unavailable via kernel API.
|
||||
|
||||
See https://github.com/osandov/drgn/ for more details on drgn.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
|
||||
subcommands:
|
||||
{prog,p,map,m}
|
||||
prog (p) list BPF programs
|
||||
map (m) list BPF maps
|
||||
|
||||
Customization
|
||||
=============
|
||||
|
||||
The script is intended to be customized by developers to print relevant
|
||||
information about BPF programs, maps and other objects.
|
||||
|
||||
For example, to print ``struct bpf_prog_aux`` for BPF program id 53077:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
% git diff
|
||||
diff --git a/tools/bpf_inspect.py b/tools/bpf_inspect.py
|
||||
index 650e228..aea2357 100755
|
||||
--- a/tools/bpf_inspect.py
|
||||
+++ b/tools/bpf_inspect.py
|
||||
@@ -112,7 +112,9 @@ def list_bpf_progs(args):
|
||||
if linked:
|
||||
linked = f" linked:[{linked}]"
|
||||
|
||||
- print(f"{id_:>6}: {type_:32} {name:32} {linked}")
|
||||
+ if id_ == 53077:
|
||||
+ print(f"{id_:>6}: {type_:32} {name:32}")
|
||||
+ print(f"{bpf_prog.aux}")
|
||||
|
||||
|
||||
def list_bpf_maps(args):
|
||||
|
||||
It produces the output::
|
||||
|
||||
% sudo bpf_inspect.py p
|
||||
53077: BPF_PROG_TYPE_XDP tw_xdp_policer
|
||||
*(struct bpf_prog_aux *)0xffff8893fad4b400 = {
|
||||
.refcnt = (atomic64_t){
|
||||
.counter = (long)58,
|
||||
},
|
||||
.used_map_cnt = (u32)1,
|
||||
.max_ctx_offset = (u32)8,
|
||||
.max_pkt_offset = (u32)15,
|
||||
.max_tp_access = (u32)0,
|
||||
.stack_depth = (u32)8,
|
||||
.id = (u32)53077,
|
||||
.func_cnt = (u32)0,
|
||||
.func_idx = (u32)0,
|
||||
.attach_btf_id = (u32)0,
|
||||
.linked_prog = (struct bpf_prog *)0x0,
|
||||
.verifier_zext = (bool)0,
|
||||
.offload_requested = (bool)0,
|
||||
.attach_btf_trace = (bool)0,
|
||||
.func_proto_unreliable = (bool)0,
|
||||
.trampoline_prog_type = (enum bpf_tramp_prog_type)BPF_TRAMP_FENTRY,
|
||||
.trampoline = (struct bpf_trampoline *)0x0,
|
||||
.tramp_hlist = (struct hlist_node){
|
||||
.next = (struct hlist_node *)0x0,
|
||||
.pprev = (struct hlist_node **)0x0,
|
||||
},
|
||||
.attach_func_proto = (const struct btf_type *)0x0,
|
||||
.attach_func_name = (const char *)0x0,
|
||||
.func = (struct bpf_prog **)0x0,
|
||||
.jit_data = (void *)0x0,
|
||||
.poke_tab = (struct bpf_jit_poke_descriptor *)0x0,
|
||||
.size_poke_tab = (u32)0,
|
||||
.ksym_tnode = (struct latch_tree_node){
|
||||
.node = (struct rb_node [2]){
|
||||
{
|
||||
.__rb_parent_color = (unsigned long)18446612956263126665,
|
||||
.rb_right = (struct rb_node *)0x0,
|
||||
.rb_left = (struct rb_node *)0xffff88a0be3d0088,
|
||||
},
|
||||
{
|
||||
.__rb_parent_color = (unsigned long)18446612956263126689,
|
||||
.rb_right = (struct rb_node *)0x0,
|
||||
.rb_left = (struct rb_node *)0xffff88a0be3d00a0,
|
||||
},
|
||||
},
|
||||
},
|
||||
.ksym_lnode = (struct list_head){
|
||||
.next = (struct list_head *)0xffff88bf481830b8,
|
||||
.prev = (struct list_head *)0xffff888309f536b8,
|
||||
},
|
||||
.ops = (const struct bpf_prog_ops *)xdp_prog_ops+0x0 = 0xffffffff820fa350,
|
||||
.used_maps = (struct bpf_map **)0xffff889ff795de98,
|
||||
.prog = (struct bpf_prog *)0xffffc9000cf2d000,
|
||||
.user = (struct user_struct *)root_user+0x0 = 0xffffffff82444820,
|
||||
.load_time = (u64)2408348759285319,
|
||||
.cgroup_storage = (struct bpf_map *[2]){},
|
||||
.name = (char [16])"tw_xdp_policer",
|
||||
.security = (void *)0xffff889ff795d548,
|
||||
.offload = (struct bpf_prog_offload *)0x0,
|
||||
.btf = (struct btf *)0xffff8890ce6d0580,
|
||||
.func_info = (struct bpf_func_info *)0xffff889ff795d240,
|
||||
.func_info_aux = (struct bpf_func_info_aux *)0xffff889ff795de20,
|
||||
.linfo = (struct bpf_line_info *)0xffff888a707afc00,
|
||||
.jited_linfo = (void **)0xffff8893fad48600,
|
||||
.func_info_cnt = (u32)1,
|
||||
.nr_linfo = (u32)37,
|
||||
.linfo_idx = (u32)0,
|
||||
.num_exentries = (u32)0,
|
||||
.extable = (struct exception_table_entry *)0xffffffffa032d950,
|
||||
.stats = (struct bpf_prog_stats *)0x603fe3a1f6d0,
|
||||
.work = (struct work_struct){
|
||||
.data = (atomic_long_t){
|
||||
.counter = (long)0,
|
||||
},
|
||||
.entry = (struct list_head){
|
||||
.next = (struct list_head *)0x0,
|
||||
.prev = (struct list_head *)0x0,
|
||||
},
|
||||
.func = (work_func_t)0x0,
|
||||
},
|
||||
.rcu = (struct callback_head){
|
||||
.next = (struct callback_head *)0x0,
|
||||
.func = (void (*)(struct callback_head *))0x0,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
.. Links
|
||||
.. _drgn/doc: https://drgn.readthedocs.io/en/latest/
|
||||
.. _drgn/tools: https://github.com/osandov/drgn/tree/master/tools
|
||||
.. _bpf_inspect.py:
|
||||
https://github.com/osandov/drgn/blob/master/tools/bpf_inspect.py
|
@ -45,14 +45,16 @@ Program types
|
||||
prog_cgroup_sockopt
|
||||
prog_cgroup_sysctl
|
||||
prog_flow_dissector
|
||||
bpf_lsm
|
||||
|
||||
|
||||
Testing BPF
|
||||
===========
|
||||
Testing and debugging BPF
|
||||
=========================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
drgn
|
||||
s390
|
||||
|
||||
|
||||
|
@ -38,7 +38,11 @@ needs_sphinx = '1.3'
|
||||
# ones.
|
||||
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',
|
||||
'kfigure', 'sphinx.ext.ifconfig', 'automarkup',
|
||||
'maintainers_include']
|
||||
'maintainers_include', 'sphinx.ext.autosectionlabel' ]
|
||||
|
||||
# Ensure that autosectionlabel will produce unique names
|
||||
autosectionlabel_prefix_document = True
|
||||
autosectionlabel_maxdepth = 2
|
||||
|
||||
# The name of the math extension changed on Sphinx 1.4
|
||||
if (major == 1 and minor > 3) or (major > 1):
|
||||
|
@ -8,41 +8,81 @@ This is the beginning of a manual for core kernel APIs. The conversion
|
||||
Core utilities
|
||||
==============
|
||||
|
||||
This section has general and "core core" documentation. The first is a
|
||||
massive grab-bag of kerneldoc info left over from the docbook days; it
|
||||
should really be broken up someday when somebody finds the energy to do
|
||||
it.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
kernel-api
|
||||
assoc_array
|
||||
atomic_ops
|
||||
cachetlb
|
||||
refcount-vs-atomic
|
||||
cpu_hotplug
|
||||
idr
|
||||
local_ops
|
||||
workqueue
|
||||
genericirq
|
||||
xarray
|
||||
librs
|
||||
genalloc
|
||||
errseq
|
||||
packing
|
||||
printk-formats
|
||||
symbol-namespaces
|
||||
|
||||
Data structures and low-level utilities
|
||||
=======================================
|
||||
|
||||
Library functionality that is used throughout the kernel.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
kobject
|
||||
assoc_array
|
||||
xarray
|
||||
idr
|
||||
circular-buffers
|
||||
generic-radix-tree
|
||||
packing
|
||||
timekeeping
|
||||
errseq
|
||||
|
||||
Concurrency primitives
|
||||
======================
|
||||
|
||||
How Linux keeps everything from happening at the same time. See
|
||||
:doc:`/locking/index` for more related documentation.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
atomic_ops
|
||||
refcount-vs-atomic
|
||||
local_ops
|
||||
padata
|
||||
../RCU/index
|
||||
|
||||
Low-level hardware management
|
||||
=============================
|
||||
|
||||
Cache management, managing CPU hotplug, etc.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
cachetlb
|
||||
cpu_hotplug
|
||||
memory-hotplug
|
||||
genericirq
|
||||
protection-keys
|
||||
|
||||
Memory management
|
||||
=================
|
||||
|
||||
How to allocate and use memory in the kernel. Note that there is a lot
|
||||
more memory-management documentation in :doc:`/vm/index`.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
memory-allocation
|
||||
mm-api
|
||||
genalloc
|
||||
pin_user_pages
|
||||
gfp_mask-from-fs-io
|
||||
timekeeping
|
||||
boot-time-mm
|
||||
memory-hotplug
|
||||
protection-keys
|
||||
../RCU/index
|
||||
gcc-plugins
|
||||
symbol-namespaces
|
||||
padata
|
||||
ioctl
|
||||
|
||||
gfp_mask-from-fs-io
|
||||
|
||||
Interfaces for kernel debugging
|
||||
===============================
|
||||
@ -53,6 +93,16 @@ Interfaces for kernel debugging
|
||||
debug-objects
|
||||
tracepoint
|
||||
|
||||
Everything else
|
||||
===============
|
||||
|
||||
Documents that don't fit elsewhere or which have yet to be categorized.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
librs
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
Indices
|
||||
|
@ -25,7 +25,7 @@ some terms we will be working with.
|
||||
usually embedded within some other structure which contains the stuff
|
||||
the code is really interested in.
|
||||
|
||||
No structure should EVER have more than one kobject embedded within it.
|
||||
No structure should **EVER** have more than one kobject embedded within it.
|
||||
If it does, the reference counting for the object is sure to be messed
|
||||
up and incorrect, and your code will be buggy. So do not do this.
|
||||
|
||||
@ -55,7 +55,7 @@ a larger, domain-specific object. To this end, kobjects will be found
|
||||
embedded in other structures. If you are used to thinking of things in
|
||||
object-oriented terms, kobjects can be seen as a top-level, abstract class
|
||||
from which other classes are derived. A kobject implements a set of
|
||||
capabilities which are not particularly useful by themselves, but which are
|
||||
capabilities which are not particularly useful by themselves, but are
|
||||
nice to have in other objects. The C language does not allow for the
|
||||
direct expression of inheritance, so other techniques - such as structure
|
||||
embedding - must be used.
|
||||
@ -65,12 +65,12 @@ this is analogous as to how "list_head" structs are rarely useful on
|
||||
their own, but are invariably found embedded in the larger objects of
|
||||
interest.)
|
||||
|
||||
So, for example, the UIO code in drivers/uio/uio.c has a structure that
|
||||
So, for example, the UIO code in ``drivers/uio/uio.c`` has a structure that
|
||||
defines the memory region associated with a uio device::
|
||||
|
||||
struct uio_map {
|
||||
struct kobject kobj;
|
||||
struct uio_mem *mem;
|
||||
struct kobject kobj;
|
||||
struct uio_mem *mem;
|
||||
};
|
||||
|
||||
If you have a struct uio_map structure, finding its embedded kobject is
|
||||
@ -78,30 +78,30 @@ just a matter of using the kobj member. Code that works with kobjects will
|
||||
often have the opposite problem, however: given a struct kobject pointer,
|
||||
what is the pointer to the containing structure? You must avoid tricks
|
||||
(such as assuming that the kobject is at the beginning of the structure)
|
||||
and, instead, use the container_of() macro, found in <linux/kernel.h>::
|
||||
and, instead, use the container_of() macro, found in ``<linux/kernel.h>``::
|
||||
|
||||
container_of(pointer, type, member)
|
||||
|
||||
where:
|
||||
|
||||
* "pointer" is the pointer to the embedded kobject,
|
||||
* "type" is the type of the containing structure, and
|
||||
* "member" is the name of the structure field to which "pointer" points.
|
||||
* ``pointer`` is the pointer to the embedded kobject,
|
||||
* ``type`` is the type of the containing structure, and
|
||||
* ``member`` is the name of the structure field to which ``pointer`` points.
|
||||
|
||||
The return value from container_of() is a pointer to the corresponding
|
||||
container type. So, for example, a pointer "kp" to a struct kobject
|
||||
embedded *within* a struct uio_map could be converted to a pointer to the
|
||||
*containing* uio_map structure with::
|
||||
container type. So, for example, a pointer ``kp`` to a struct kobject
|
||||
embedded **within** a struct uio_map could be converted to a pointer to the
|
||||
**containing** uio_map structure with::
|
||||
|
||||
struct uio_map *u_map = container_of(kp, struct uio_map, kobj);
|
||||
|
||||
For convenience, programmers often define a simple macro for "back-casting"
|
||||
For convenience, programmers often define a simple macro for **back-casting**
|
||||
kobject pointers to the containing type. Exactly this happens in the
|
||||
earlier drivers/uio/uio.c, as you can see here::
|
||||
earlier ``drivers/uio/uio.c``, as you can see here::
|
||||
|
||||
struct uio_map {
|
||||
struct kobject kobj;
|
||||
struct uio_mem *mem;
|
||||
struct kobject kobj;
|
||||
struct uio_mem *mem;
|
||||
};
|
||||
|
||||
#define to_map(map) container_of(map, struct uio_map, kobj)
|
||||
@ -125,7 +125,7 @@ must have an associated kobj_type. After calling kobject_init(), to
|
||||
register the kobject with sysfs, the function kobject_add() must be called::
|
||||
|
||||
int kobject_add(struct kobject *kobj, struct kobject *parent,
|
||||
const char *fmt, ...);
|
||||
const char *fmt, ...);
|
||||
|
||||
This sets up the parent of the kobject and the name for the kobject
|
||||
properly. If the kobject is to be associated with a specific kset,
|
||||
@ -172,13 +172,13 @@ call to kobject_uevent()::
|
||||
|
||||
int kobject_uevent(struct kobject *kobj, enum kobject_action action);
|
||||
|
||||
Use the KOBJ_ADD action for when the kobject is first added to the kernel.
|
||||
Use the **KOBJ_ADD** action for when the kobject is first added to the kernel.
|
||||
This should be done only after any attributes or children of the kobject
|
||||
have been initialized properly, as userspace will instantly start to look
|
||||
for them when this call happens.
|
||||
|
||||
When the kobject is removed from the kernel (details on how to do that are
|
||||
below), the uevent for KOBJ_REMOVE will be automatically created by the
|
||||
below), the uevent for **KOBJ_REMOVE** will be automatically created by the
|
||||
kobject core, so the caller does not have to worry about doing that by
|
||||
hand.
|
||||
|
||||
@ -238,7 +238,7 @@ Both types of attributes used here, with a kobject that has been created
|
||||
with the kobject_create_and_add(), can be of type kobj_attribute, so no
|
||||
special custom attribute is needed to be created.
|
||||
|
||||
See the example module, samples/kobject/kobject-example.c for an
|
||||
See the example module, ``samples/kobject/kobject-example.c`` for an
|
||||
implementation of a simple kobject and attributes.
|
||||
|
||||
|
||||
@ -270,10 +270,10 @@ such a method has a form like::
|
||||
|
||||
void my_object_release(struct kobject *kobj)
|
||||
{
|
||||
struct my_object *mine = container_of(kobj, struct my_object, kobj);
|
||||
struct my_object *mine = container_of(kobj, struct my_object, kobj);
|
||||
|
||||
/* Perform any additional cleanup on this object, then... */
|
||||
kfree(mine);
|
||||
/* Perform any additional cleanup on this object, then... */
|
||||
kfree(mine);
|
||||
}
|
||||
|
||||
One important point cannot be overstated: every kobject must have a
|
||||
@ -297,11 +297,11 @@ instead, it is associated with the ktype. So let us introduce struct
|
||||
kobj_type::
|
||||
|
||||
struct kobj_type {
|
||||
void (*release)(struct kobject *kobj);
|
||||
const struct sysfs_ops *sysfs_ops;
|
||||
struct attribute **default_attrs;
|
||||
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
|
||||
const void *(*namespace)(struct kobject *kobj);
|
||||
void (*release)(struct kobject *kobj);
|
||||
const struct sysfs_ops *sysfs_ops;
|
||||
struct attribute **default_attrs;
|
||||
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
|
||||
const void *(*namespace)(struct kobject *kobj);
|
||||
};
|
||||
|
||||
This structure is used to describe a particular type of kobject (or, more
|
||||
@ -352,8 +352,8 @@ created and never declared statically or on the stack. To create a new
|
||||
kset use::
|
||||
|
||||
struct kset *kset_create_and_add(const char *name,
|
||||
struct kset_uevent_ops *u,
|
||||
struct kobject *parent);
|
||||
struct kset_uevent_ops *u,
|
||||
struct kobject *parent);
|
||||
|
||||
When you are finished with the kset, call::
|
||||
|
||||
@ -365,16 +365,16 @@ Because other references to the kset may still exist, the release may happen
|
||||
after kset_unregister() returns.
|
||||
|
||||
An example of using a kset can be seen in the
|
||||
samples/kobject/kset-example.c file in the kernel tree.
|
||||
``samples/kobject/kset-example.c`` file in the kernel tree.
|
||||
|
||||
If a kset wishes to control the uevent operations of the kobjects
|
||||
associated with it, it can use the struct kset_uevent_ops to handle it::
|
||||
|
||||
struct kset_uevent_ops {
|
||||
int (*filter)(struct kset *kset, struct kobject *kobj);
|
||||
const char *(*name)(struct kset *kset, struct kobject *kobj);
|
||||
int (*uevent)(struct kset *kset, struct kobject *kobj,
|
||||
struct kobj_uevent_env *env);
|
||||
int (*filter)(struct kset *kset, struct kobject *kobj);
|
||||
const char *(*name)(struct kset *kset, struct kobject *kobj);
|
||||
int (*uevent)(struct kset *kset, struct kobject *kobj,
|
||||
struct kobj_uevent_env *env);
|
||||
};
|
||||
|
||||
|
||||
@ -408,8 +408,8 @@ Kobject removal
|
||||
After a kobject has been registered with the kobject core successfully, it
|
||||
must be cleaned up when the code is finished with it. To do that, call
|
||||
kobject_put(). By doing this, the kobject core will automatically clean up
|
||||
all of the memory allocated by this kobject. If a KOBJ_ADD uevent has been
|
||||
sent for the object, a corresponding KOBJ_REMOVE uevent will be sent, and
|
||||
all of the memory allocated by this kobject. If a ``KOBJ_ADD`` uevent has been
|
||||
sent for the object, a corresponding ``KOBJ_REMOVE`` uevent will be sent, and
|
||||
any other sysfs housekeeping will be handled for the caller properly.
|
||||
|
||||
If you need to do a two-stage delete of the kobject (say you are not
|
||||
@ -430,5 +430,5 @@ Example code to copy from
|
||||
=========================
|
||||
|
||||
For a more complete example of using ksets and kobjects properly, see the
|
||||
example programs samples/kobject/{kobject-example.c,kset-example.c},
|
||||
which will be built as loadable modules if you select CONFIG_SAMPLE_KOBJECT.
|
||||
example programs ``samples/kobject/{kobject-example.c,kset-example.c}``,
|
||||
which will be built as loadable modules if you select ``CONFIG_SAMPLE_KOBJECT``.
|
@ -73,6 +73,9 @@ File Mapping and Page Cache
|
||||
.. kernel-doc:: mm/truncate.c
|
||||
:export:
|
||||
|
||||
.. kernel-doc:: include/linux/pagemap.h
|
||||
:internal:
|
||||
|
||||
Memory pools
|
||||
============
|
||||
|
||||
|
@ -52,8 +52,22 @@ Which flags are set by each wrapper
|
||||
|
||||
For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
|
||||
flags the caller provides. The caller is required to pass in a non-null struct
|
||||
pages* array, and the function then pin pages by incrementing each by a special
|
||||
value. For now, that value is +1, just like get_user_pages*().::
|
||||
pages* array, and the function then pins pages by incrementing each by a special
|
||||
value: GUP_PIN_COUNTING_BIAS.
|
||||
|
||||
For huge pages (and in fact, any compound page of more than 2 pages), the
|
||||
GUP_PIN_COUNTING_BIAS scheme is not used. Instead, an exact form of pin counting
|
||||
is achieved, by using the 3rd struct page in the compound page. A new struct
|
||||
page field, hpage_pinned_refcount, has been added in order to support this.
|
||||
|
||||
This approach for compound pages avoids the counting upper limit problems that
|
||||
are discussed below. Those limitations would have been aggravated severely by
|
||||
huge pages, because each tail page adds a refcount to the head page. And in
|
||||
fact, testing revealed that, without a separate hpage_pinned_refcount field,
|
||||
page overflows were seen in some huge page stress tests.
|
||||
|
||||
This also means that huge pages and compound pages (of order > 1) do not suffer
|
||||
from the false positives problem that is mentioned below.::
|
||||
|
||||
Function
|
||||
--------
|
||||
@ -99,27 +113,6 @@ pages:
|
||||
This also leads to limitations: there are only 31-10==21 bits available for a
|
||||
counter that increments 10 bits at a time.
|
||||
|
||||
TODO: for 1GB and larger huge pages, this is cutting it close. That's because
|
||||
when pin_user_pages() follows such pages, it increments the head page by "1"
|
||||
(where "1" used to mean "+1" for get_user_pages(), but now means "+1024" for
|
||||
pin_user_pages()) for each tail page. So if you have a 1GB huge page:
|
||||
|
||||
* There are 256K (18 bits) worth of 4 KB tail pages.
|
||||
* There are 21 bits available to count up via GUP_PIN_COUNTING_BIAS (that is,
|
||||
10 bits at a time)
|
||||
* There are 21 - 18 == 3 bits available to count. Except that there aren't,
|
||||
because you need to allow for a few normal get_page() calls on the head page,
|
||||
as well. Fortunately, the approach of using addition, rather than "hard"
|
||||
bitfields, within page->_refcount, allows for sharing these bits gracefully.
|
||||
But we're still looking at about 8 references.
|
||||
|
||||
This, however, is a missing feature more than anything else, because it's easily
|
||||
solved by addressing an obvious inefficiency in the original get_user_pages()
|
||||
approach of retrieving pages: stop treating all the pages as if they were
|
||||
PAGE_SIZE. Retrieve huge pages as huge pages. The callers need to be aware of
|
||||
this, so some work is required. Once that's in place, this limitation mostly
|
||||
disappears from view, because there will be ample refcounting range available.
|
||||
|
||||
* Callers must specifically request "dma-pinned tracking of pages". In other
|
||||
words, just calling get_user_pages() will not suffice; a new set of functions,
|
||||
pin_user_page() and related, must be used.
|
||||
@ -173,8 +166,8 @@ CASE 4: Pinning for struct page manipulation only
|
||||
-------------------------------------------------
|
||||
Here, normal GUP calls are sufficient, so neither flag needs to be set.
|
||||
|
||||
page_dma_pinned(): the whole point of pinning
|
||||
=============================================
|
||||
page_maybe_dma_pinned(): the whole point of pinning
|
||||
===================================================
|
||||
|
||||
The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
|
||||
to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
|
||||
@ -186,7 +179,7 @@ and debates (see the References at the end of this document). It's a TODO item
|
||||
here: fill in the details once that's worked out. Meanwhile, it's safe to say
|
||||
that having this available: ::
|
||||
|
||||
static inline bool page_dma_pinned(struct page *page)
|
||||
static inline bool page_maybe_dma_pinned(struct page *page)
|
||||
|
||||
...is a prerequisite to solving the long-running gup+DMA problem.
|
||||
|
||||
@ -215,12 +208,42 @@ has the following new calls to exercise the new pin*() wrapper functions:
|
||||
You can monitor how many total dma-pinned pages have been acquired and released
|
||||
since the system was booted, via two new /proc/vmstat entries: ::
|
||||
|
||||
/proc/vmstat/nr_foll_pin_requested
|
||||
/proc/vmstat/nr_foll_pin_requested
|
||||
/proc/vmstat/nr_foll_pin_acquired
|
||||
/proc/vmstat/nr_foll_pin_released
|
||||
|
||||
Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is
|
||||
because there is a noticeable performance drop in unpin_user_page(), when they
|
||||
are activated.
|
||||
Under normal conditions, these two values will be equal unless there are any
|
||||
long-term [R]DMA pins in place, or during pin/unpin transitions.
|
||||
|
||||
* nr_foll_pin_acquired: This is the number of logical pins that have been
|
||||
acquired since the system was powered on. For huge pages, the head page is
|
||||
pinned once for each page (head page and each tail page) within the huge page.
|
||||
This follows the same sort of behavior that get_user_pages() uses for huge
|
||||
pages: the head page is refcounted once for each tail or head page in the huge
|
||||
page, when get_user_pages() is applied to a huge page.
|
||||
|
||||
* nr_foll_pin_released: The number of logical pins that have been released since
|
||||
the system was powered on. Note that pages are released (unpinned) on a
|
||||
PAGE_SIZE granularity, even if the original pin was applied to a huge page.
|
||||
Becaused of the pin count behavior described above in "nr_foll_pin_acquired",
|
||||
the accounting balances out, so that after doing this::
|
||||
|
||||
pin_user_pages(huge_page);
|
||||
for (each page in huge_page)
|
||||
unpin_user_page(page);
|
||||
|
||||
...the following is expected::
|
||||
|
||||
nr_foll_pin_released == nr_foll_pin_acquired
|
||||
|
||||
(...unless it was already out of balance due to a long-term RDMA pin being in
|
||||
place.)
|
||||
|
||||
Other diagnostics
|
||||
=================
|
||||
|
||||
dump_page() has been enhanced slightly, to handle these new counting fields, and
|
||||
to better report on compound pages in general. Specifically, for compound pages
|
||||
with order > 1, the exact (hpage_pinned_refcount) pincount is reported.
|
||||
|
||||
References
|
||||
==========
|
||||
@ -228,5 +251,6 @@ References
|
||||
* `Some slow progress on get_user_pages() (Apr 2, 2019) <https://lwn.net/Articles/784574/>`_
|
||||
* `DMA and get_user_pages() (LPC: Dec 12, 2018) <https://lwn.net/Articles/774411/>`_
|
||||
* `The trouble with get_user_pages() (Apr 30, 2018) <https://lwn.net/Articles/753027/>`_
|
||||
* `LWN kernel index: get_user_pages() <https://lwn.net/Kernel/Index/#Memory_management-get_user_pages>`_
|
||||
|
||||
John Hubbard, October, 2019
|
||||
|
@ -154,9 +154,9 @@ architectures. These are the recommended replacements:
|
||||
|
||||
Use ktime_get() or ktime_get_ts64() instead.
|
||||
|
||||
.. c:function:: struct timeval do_gettimeofday( void )
|
||||
struct timespec getnstimeofday( void )
|
||||
struct timespec64 getnstimeofday64( void )
|
||||
.. c:function:: void do_gettimeofday( struct timeval * )
|
||||
void getnstimeofday( struct timespec * )
|
||||
void getnstimeofday64( struct timespec64 * )
|
||||
void ktime_get_real_ts( struct timespec * )
|
||||
|
||||
ktime_get_real_ts64() is a direct replacement, but consider using
|
||||
|
@ -1,38 +0,0 @@
|
||||
|
||||
PowerNow! and Cool'n'Quiet are AMD names for frequency
|
||||
management capabilities in AMD processors. As the hardware
|
||||
implementation changes in new generations of the processors,
|
||||
there is a different cpu-freq driver for each generation.
|
||||
|
||||
Note that the driver's will not load on the "wrong" hardware,
|
||||
so it is safe to try each driver in turn when in doubt as to
|
||||
which is the correct driver.
|
||||
|
||||
Note that the functionality to change frequency (and voltage)
|
||||
is not available in all processors. The drivers will refuse
|
||||
to load on processors without this capability. The capability
|
||||
is detected with the cpuid instruction.
|
||||
|
||||
The drivers use BIOS supplied tables to obtain frequency and
|
||||
voltage information appropriate for a particular platform.
|
||||
Frequency transitions will be unavailable if the BIOS does
|
||||
not supply these tables.
|
||||
|
||||
6th Generation: powernow-k6
|
||||
|
||||
7th Generation: powernow-k7: Athlon, Duron, Geode.
|
||||
|
||||
8th Generation: powernow-k8: Athlon, Athlon 64, Opteron, Sempron.
|
||||
Documentation on this functionality in 8th generation processors
|
||||
is available in the "BIOS and Kernel Developer's Guide", publication
|
||||
26094, in chapter 9, available for download from www.amd.com.
|
||||
|
||||
BIOS supplied data, for powernow-k7 and for powernow-k8, may be
|
||||
from either the PSB table or from ACPI objects. The ACPI support
|
||||
is only available if the kernel config sets CONFIG_ACPI_PROCESSOR.
|
||||
The powernow-k8 driver will attempt to use ACPI if so configured,
|
||||
and fall back to PST if that fails.
|
||||
The powernow-k7 driver will try to use the PSB support first, and
|
||||
fall back to ACPI if the PSB support fails. A module parameter,
|
||||
acpi_force, is provided to force ACPI support to be used instead
|
||||
of PSB support.
|
@ -1,31 +1,23 @@
|
||||
CPU frequency and voltage scaling code in the Linux(TM) kernel
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=============================================================
|
||||
General description of the CPUFreq core and CPUFreq notifiers
|
||||
=============================================================
|
||||
|
||||
L i n u x C P U F r e q
|
||||
Authors:
|
||||
- Dominik Brodowski <linux@brodo.de>
|
||||
- David Kimdon <dwhedon@debian.org>
|
||||
- Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
- Viresh Kumar <viresh.kumar@linaro.org>
|
||||
|
||||
C P U F r e q C o r e
|
||||
.. Contents:
|
||||
|
||||
|
||||
Dominik Brodowski <linux@brodo.de>
|
||||
David Kimdon <dwhedon@debian.org>
|
||||
Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
Viresh Kumar <viresh.kumar@linaro.org>
|
||||
|
||||
|
||||
|
||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
||||
fly. This is a nice method to save battery power, because the lower
|
||||
the clock speed, the less power the CPU consumes.
|
||||
|
||||
|
||||
Contents:
|
||||
---------
|
||||
1. CPUFreq core and interfaces
|
||||
2. CPUFreq notifiers
|
||||
3. CPUFreq Table Generation with Operating Performance Point (OPP)
|
||||
1. CPUFreq core and interfaces
|
||||
2. CPUFreq notifiers
|
||||
3. CPUFreq Table Generation with Operating Performance Point (OPP)
|
||||
|
||||
1. General Information
|
||||
=======================
|
||||
======================
|
||||
|
||||
The CPUFreq core code is located in drivers/cpufreq/cpufreq.c. This
|
||||
cpufreq code offers a standardized interface for the CPUFreq
|
||||
@ -63,7 +55,7 @@ The phase is specified in the second argument to the notifier. The phase is
|
||||
CPUFREQ_CREATE_POLICY when the policy is first created and it is
|
||||
CPUFREQ_REMOVE_POLICY when the policy is removed.
|
||||
|
||||
The third argument, a void *pointer, points to a struct cpufreq_policy
|
||||
The third argument, a ``void *pointer``, points to a struct cpufreq_policy
|
||||
consisting of several values, including min, max (the lower and upper
|
||||
frequencies (in kHz) of the new policy).
|
||||
|
||||
@ -80,10 +72,13 @@ CPUFREQ_POSTCHANGE.
|
||||
|
||||
The third argument is a struct cpufreq_freqs with the following
|
||||
values:
|
||||
cpu - number of the affected CPU
|
||||
old - old frequency
|
||||
new - new frequency
|
||||
flags - flags of the cpufreq driver
|
||||
|
||||
===== ===========================
|
||||
cpu number of the affected CPU
|
||||
old old frequency
|
||||
new new frequency
|
||||
flags flags of the cpufreq driver
|
||||
===== ===========================
|
||||
|
||||
3. CPUFreq Table Generation with Operating Performance Point (OPP)
|
||||
==================================================================
|
||||
@ -94,9 +89,12 @@ dev_pm_opp_init_cpufreq_table -
|
||||
the OPP layer's internal information about the available frequencies
|
||||
into a format readily providable to cpufreq.
|
||||
|
||||
WARNING: Do not use this function in interrupt context.
|
||||
.. Warning::
|
||||
|
||||
Do not use this function in interrupt context.
|
||||
|
||||
Example::
|
||||
|
||||
Example:
|
||||
soc_pm_init()
|
||||
{
|
||||
/* Do things */
|
||||
@ -106,7 +104,10 @@ dev_pm_opp_init_cpufreq_table -
|
||||
/* Do other things */
|
||||
}
|
||||
|
||||
NOTE: This function is available only if CONFIG_CPU_FREQ is enabled in
|
||||
addition to CONFIG_PM_OPP.
|
||||
.. note::
|
||||
|
||||
dev_pm_opp_free_cpufreq_table - Free up the table allocated by dev_pm_opp_init_cpufreq_table
|
||||
This function is available only if CONFIG_CPU_FREQ is enabled in
|
||||
addition to CONFIG_PM_OPP.
|
||||
|
||||
dev_pm_opp_free_cpufreq_table
|
||||
Free up the table allocated by dev_pm_opp_init_cpufreq_table
|
@ -1,35 +1,27 @@
|
||||
CPU frequency and voltage scaling code in the Linux(TM) kernel
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===============================================
|
||||
How to Implement a new CPUFreq Processor Driver
|
||||
===============================================
|
||||
|
||||
Authors:
|
||||
|
||||
|
||||
L i n u x C P U F r e q
|
||||
- Dominik Brodowski <linux@brodo.de>
|
||||
- Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
- Viresh Kumar <viresh.kumar@linaro.org>
|
||||
|
||||
C P U D r i v e r s
|
||||
.. Contents
|
||||
|
||||
- information for developers -
|
||||
|
||||
|
||||
Dominik Brodowski <linux@brodo.de>
|
||||
Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
Viresh Kumar <viresh.kumar@linaro.org>
|
||||
|
||||
|
||||
|
||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
||||
fly. This is a nice method to save battery power, because the lower
|
||||
the clock speed, the less power the CPU consumes.
|
||||
|
||||
|
||||
Contents:
|
||||
---------
|
||||
1. What To Do?
|
||||
1.1 Initialization
|
||||
1.2 Per-CPU Initialization
|
||||
1.3 verify
|
||||
1.4 target/target_index or setpolicy?
|
||||
1.5 target/target_index
|
||||
1.6 setpolicy
|
||||
1.7 get_intermediate and target_intermediate
|
||||
2. Frequency Table Helpers
|
||||
1. What To Do?
|
||||
1.1 Initialization
|
||||
1.2 Per-CPU Initialization
|
||||
1.3 verify
|
||||
1.4 target/target_index or setpolicy?
|
||||
1.5 target/target_index
|
||||
1.6 setpolicy
|
||||
1.7 get_intermediate and target_intermediate
|
||||
2. Frequency Table Helpers
|
||||
|
||||
|
||||
|
||||
@ -49,7 +41,7 @@ function check whether this kernel runs on the right CPU and the right
|
||||
chipset. If so, register a struct cpufreq_driver with the CPUfreq core
|
||||
using cpufreq_register_driver()
|
||||
|
||||
What shall this struct cpufreq_driver contain?
|
||||
What shall this struct cpufreq_driver contain?
|
||||
|
||||
.name - The name of this driver.
|
||||
|
||||
@ -108,37 +100,42 @@ Whenever a new CPU is registered with the device model, or after the
|
||||
cpufreq driver registers itself, the per-policy initialization function
|
||||
cpufreq_driver.init is called if no cpufreq policy existed for the CPU.
|
||||
Note that the .init() and .exit() routines are called only once for the
|
||||
policy and not for each CPU managed by the policy. It takes a struct
|
||||
cpufreq_policy *policy as argument. What to do now?
|
||||
policy and not for each CPU managed by the policy. It takes a ``struct
|
||||
cpufreq_policy *policy`` as argument. What to do now?
|
||||
|
||||
If necessary, activate the CPUfreq support on your CPU.
|
||||
|
||||
Then, the driver must fill in the following values:
|
||||
|
||||
policy->cpuinfo.min_freq _and_
|
||||
policy->cpuinfo.max_freq - the minimum and maximum frequency
|
||||
(in kHz) which is supported by
|
||||
this CPU
|
||||
policy->cpuinfo.transition_latency the time it takes on this CPU to
|
||||
switch between two frequencies in
|
||||
nanoseconds (if appropriate, else
|
||||
specify CPUFREQ_ETERNAL)
|
||||
|
||||
policy->cur The current operating frequency of
|
||||
this CPU (if appropriate)
|
||||
policy->min,
|
||||
policy->max,
|
||||
policy->policy and, if necessary,
|
||||
policy->governor must contain the "default policy" for
|
||||
this CPU. A few moments later,
|
||||
cpufreq_driver.verify and either
|
||||
cpufreq_driver.setpolicy or
|
||||
cpufreq_driver.target/target_index is called
|
||||
with these values.
|
||||
policy->cpus Update this with the masks of the
|
||||
(online + offline) CPUs that do DVFS
|
||||
along with this CPU (i.e. that share
|
||||
clock/voltage rails with it).
|
||||
+-----------------------------------+--------------------------------------+
|
||||
|policy->cpuinfo.min_freq _and_ | |
|
||||
|policy->cpuinfo.max_freq | the minimum and maximum frequency |
|
||||
| | (in kHz) which is supported by |
|
||||
| | this CPU |
|
||||
+-----------------------------------+--------------------------------------+
|
||||
|policy->cpuinfo.transition_latency | the time it takes on this CPU to |
|
||||
| | switch between two frequencies in |
|
||||
| | nanoseconds (if appropriate, else |
|
||||
| | specify CPUFREQ_ETERNAL) |
|
||||
+-----------------------------------+--------------------------------------+
|
||||
|policy->cur | The current operating frequency of |
|
||||
| | this CPU (if appropriate) |
|
||||
+-----------------------------------+--------------------------------------+
|
||||
|policy->min, | |
|
||||
|policy->max, | |
|
||||
|policy->policy and, if necessary, | |
|
||||
|policy->governor | must contain the "default policy" for|
|
||||
| | this CPU. A few moments later, |
|
||||
| | cpufreq_driver.verify and either |
|
||||
| | cpufreq_driver.setpolicy or |
|
||||
| | cpufreq_driver.target/target_index is|
|
||||
| | called with these values. |
|
||||
+-----------------------------------+--------------------------------------+
|
||||
|policy->cpus | Update this with the masks of the |
|
||||
| | (online + offline) CPUs that do DVFS |
|
||||
| | along with this CPU (i.e. that share|
|
||||
| | clock/voltage rails with it). |
|
||||
+-----------------------------------+--------------------------------------+
|
||||
|
||||
For setting some of these values (cpuinfo.min[max]_freq, policy->min[max]), the
|
||||
frequency table helpers might be helpful. See the section 2 for more information
|
||||
@ -151,8 +148,8 @@ on them.
|
||||
When the user decides a new policy (consisting of
|
||||
"policy,governor,min,max") shall be set, this policy must be validated
|
||||
so that incompatible values can be corrected. For verifying these
|
||||
values cpufreq_verify_within_limits(struct cpufreq_policy *policy,
|
||||
unsigned int min_freq, unsigned int max_freq) function might be helpful.
|
||||
values cpufreq_verify_within_limits(``struct cpufreq_policy *policy``,
|
||||
``unsigned int min_freq``, ``unsigned int max_freq``) function might be helpful.
|
||||
See section 2 for details on frequency table helpers.
|
||||
|
||||
You need to make sure that at least one valid frequency (or operating
|
||||
@ -163,7 +160,7 @@ policy->max first, and only if this is no solution, decrease policy->min.
|
||||
1.4 target or target_index or setpolicy or fast_switch?
|
||||
-------------------------------------------------------
|
||||
|
||||
Most cpufreq drivers or even most cpu frequency scaling algorithms
|
||||
Most cpufreq drivers or even most cpu frequency scaling algorithms
|
||||
only allow the CPU frequency to be set to predefined fixed values. For
|
||||
these, you use the ->target(), ->target_index() or ->fast_switch()
|
||||
callbacks.
|
||||
@ -175,8 +172,8 @@ limits on their own. These shall use the ->setpolicy() callback.
|
||||
1.5. target/target_index
|
||||
------------------------
|
||||
|
||||
The target_index call has two arguments: struct cpufreq_policy *policy,
|
||||
and unsigned int index (into the exposed frequency table).
|
||||
The target_index call has two arguments: ``struct cpufreq_policy *policy``,
|
||||
and ``unsigned int`` index (into the exposed frequency table).
|
||||
|
||||
The CPUfreq driver must set the new frequency when called here. The
|
||||
actual frequency must be determined by freq_table[index].frequency.
|
||||
@ -184,9 +181,9 @@ actual frequency must be determined by freq_table[index].frequency.
|
||||
It should always restore to earlier frequency (i.e. policy->restore_freq) in
|
||||
case of errors, even if we switched to intermediate frequency earlier.
|
||||
|
||||
Deprecated:
|
||||
Deprecated
|
||||
----------
|
||||
The target call has three arguments: struct cpufreq_policy *policy,
|
||||
The target call has three arguments: ``struct cpufreq_policy *policy``,
|
||||
unsigned int target_frequency, unsigned int relation.
|
||||
|
||||
The CPUfreq driver must set the new frequency when called here. The
|
||||
@ -210,14 +207,14 @@ Not all drivers are expected to implement it, as sleeping from within
|
||||
this callback isn't allowed. This callback must be highly optimized to
|
||||
do switching as fast as possible.
|
||||
|
||||
This function has two arguments: struct cpufreq_policy *policy and
|
||||
unsigned int target_frequency.
|
||||
This function has two arguments: ``struct cpufreq_policy *policy`` and
|
||||
``unsigned int target_frequency``.
|
||||
|
||||
|
||||
1.7 setpolicy
|
||||
-------------
|
||||
|
||||
The setpolicy call only takes a struct cpufreq_policy *policy as
|
||||
The setpolicy call only takes a ``struct cpufreq_policy *policy`` as
|
||||
argument. You need to set the lower limit of the in-processor or
|
||||
in-chipset dynamic frequency switching to policy->min, the upper limit
|
||||
to policy->max, and -if supported- select a performance-oriented
|
||||
@ -278,10 +275,10 @@ table.
|
||||
|
||||
cpufreq_for_each_valid_entry(pos, table) - iterates over all entries,
|
||||
excluding CPUFREQ_ENTRY_INVALID frequencies.
|
||||
Use arguments "pos" - a cpufreq_frequency_table * as a loop cursor and
|
||||
"table" - the cpufreq_frequency_table * you want to iterate over.
|
||||
Use arguments "pos" - a ``cpufreq_frequency_table *`` as a loop cursor and
|
||||
"table" - the ``cpufreq_frequency_table *`` you want to iterate over.
|
||||
|
||||
For example:
|
||||
For example::
|
||||
|
||||
struct cpufreq_frequency_table *pos, *driver_freq_table;
|
||||
|
@ -1,19 +0,0 @@
|
||||
|
||||
The cpufreq-nforce2 driver changes the FSB on nVidia nForce2 platforms.
|
||||
|
||||
This works better than on other platforms, because the FSB of the CPU
|
||||
can be controlled independently from the PCI/AGP clock.
|
||||
|
||||
The module has two options:
|
||||
|
||||
fid: multiplier * 10 (for example 8.5 = 85)
|
||||
min_fsb: minimum FSB
|
||||
|
||||
If not set, fid is calculated from the current CPU speed and the FSB.
|
||||
min_fsb defaults to FSB at boot time - 50 MHz.
|
||||
|
||||
IMPORTANT: The available range is limited downwards!
|
||||
Also the minimum available FSB can differ, for systems
|
||||
booting with 200 MHz, 150 should always work.
|
||||
|
||||
|
@ -1,21 +1,23 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
CPU frequency and voltage scaling statistics in the Linux(TM) kernel
|
||||
==========================================
|
||||
General Description of sysfs CPUFreq Stats
|
||||
==========================================
|
||||
|
||||
information for users
|
||||
|
||||
|
||||
L i n u x c p u f r e q - s t a t s d r i v e r
|
||||
Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
|
||||
|
||||
- information for users -
|
||||
.. Contents
|
||||
|
||||
|
||||
Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
|
||||
|
||||
Contents
|
||||
1. Introduction
|
||||
2. Statistics Provided (with example)
|
||||
3. Configuring cpufreq-stats
|
||||
1. Introduction
|
||||
2. Statistics Provided (with example)
|
||||
3. Configuring cpufreq-stats
|
||||
|
||||
|
||||
1. Introduction
|
||||
===============
|
||||
|
||||
cpufreq-stats is a driver that provides CPU frequency statistics for each CPU.
|
||||
These statistics are provided in /sysfs as a bunch of read_only interfaces. This
|
||||
@ -28,8 +30,10 @@ that may be running on your CPU. So, it will work with any cpufreq_driver.
|
||||
|
||||
|
||||
2. Statistics Provided (with example)
|
||||
=====================================
|
||||
|
||||
cpufreq stats provides following statistics (explained in detail below).
|
||||
|
||||
- time_in_state
|
||||
- total_trans
|
||||
- trans_table
|
||||
@ -39,53 +43,57 @@ All the statistics will be from the time the stats driver has been inserted
|
||||
statistic is done. Obviously, stats driver will not have any information
|
||||
about the frequency transitions before the stats driver insertion.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # ls -l
|
||||
total 0
|
||||
drwxr-xr-x 2 root root 0 May 14 16:06 .
|
||||
drwxr-xr-x 3 root root 0 May 14 15:58 ..
|
||||
--w------- 1 root root 4096 May 14 16:06 reset
|
||||
-r--r--r-- 1 root root 4096 May 14 16:06 time_in_state
|
||||
-r--r--r-- 1 root root 4096 May 14 16:06 total_trans
|
||||
-r--r--r-- 1 root root 4096 May 14 16:06 trans_table
|
||||
--------------------------------------------------------------------------------
|
||||
::
|
||||
|
||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # ls -l
|
||||
total 0
|
||||
drwxr-xr-x 2 root root 0 May 14 16:06 .
|
||||
drwxr-xr-x 3 root root 0 May 14 15:58 ..
|
||||
--w------- 1 root root 4096 May 14 16:06 reset
|
||||
-r--r--r-- 1 root root 4096 May 14 16:06 time_in_state
|
||||
-r--r--r-- 1 root root 4096 May 14 16:06 total_trans
|
||||
-r--r--r-- 1 root root 4096 May 14 16:06 trans_table
|
||||
|
||||
- **reset**
|
||||
|
||||
- reset
|
||||
Write-only attribute that can be used to reset the stat counters. This can be
|
||||
useful for evaluating system behaviour under different governors without the
|
||||
need for a reboot.
|
||||
|
||||
- time_in_state
|
||||
- **time_in_state**
|
||||
|
||||
This gives the amount of time spent in each of the frequencies supported by
|
||||
this CPU. The cat output will have "<frequency> <time>" pair in each line, which
|
||||
will mean this CPU spent <time> usertime units of time at <frequency>. Output
|
||||
will have one line for each of the supported frequencies. usertime units here
|
||||
will have one line for each of the supported frequencies. usertime units here
|
||||
is 10mS (similar to other time exported in /proc).
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat time_in_state
|
||||
3600000 2089
|
||||
3400000 136
|
||||
3200000 34
|
||||
3000000 67
|
||||
2800000 172488
|
||||
--------------------------------------------------------------------------------
|
||||
::
|
||||
|
||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat time_in_state
|
||||
3600000 2089
|
||||
3400000 136
|
||||
3200000 34
|
||||
3000000 67
|
||||
2800000 172488
|
||||
|
||||
|
||||
- total_trans
|
||||
This gives the total number of frequency transitions on this CPU. The cat
|
||||
- **total_trans**
|
||||
|
||||
This gives the total number of frequency transitions on this CPU. The cat
|
||||
output will have a single count which is the total number of frequency
|
||||
transitions.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat total_trans
|
||||
20
|
||||
--------------------------------------------------------------------------------
|
||||
::
|
||||
|
||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat total_trans
|
||||
20
|
||||
|
||||
- **trans_table**
|
||||
|
||||
- trans_table
|
||||
This will give a fine grained information about all the CPU frequency
|
||||
transitions. The cat output here is a two dimensional matrix, where an entry
|
||||
<i,j> (row i, column j) represents the count of number of transitions from
|
||||
<i,j> (row i, column j) represents the count of number of transitions from
|
||||
Freq_i to Freq_j. Freq_i rows and Freq_j columns follow the sorting order in
|
||||
which the driver has provided the frequency table initially to the cpufreq core
|
||||
and so can be sorted (ascending or descending) or unsorted. The output here
|
||||
@ -95,26 +103,27 @@ readability.
|
||||
If the transition table is bigger than PAGE_SIZE, reading this will
|
||||
return an -EFBIG error.
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat trans_table
|
||||
From : To
|
||||
: 3600000 3400000 3200000 3000000 2800000
|
||||
3600000: 0 5 0 0 0
|
||||
3400000: 4 0 2 0 0
|
||||
3200000: 0 1 0 2 0
|
||||
3000000: 0 0 1 0 3
|
||||
2800000: 0 0 0 2 0
|
||||
--------------------------------------------------------------------------------
|
||||
::
|
||||
|
||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat trans_table
|
||||
From : To
|
||||
: 3600000 3400000 3200000 3000000 2800000
|
||||
3600000: 0 5 0 0 0
|
||||
3400000: 4 0 2 0 0
|
||||
3200000: 0 1 0 2 0
|
||||
3000000: 0 0 1 0 3
|
||||
2800000: 0 0 0 2 0
|
||||
|
||||
3. Configuring cpufreq-stats
|
||||
============================
|
||||
|
||||
To configure cpufreq-stats in your kernel
|
||||
Config Main Menu
|
||||
Power management options (ACPI, APM) --->
|
||||
CPU Frequency scaling --->
|
||||
[*] CPU Frequency scaling
|
||||
[*] CPU frequency translation statistics
|
||||
To configure cpufreq-stats in your kernel::
|
||||
|
||||
Config Main Menu
|
||||
Power management options (ACPI, APM) --->
|
||||
CPU Frequency scaling --->
|
||||
[*] CPU Frequency scaling
|
||||
[*] CPU frequency translation statistics
|
||||
|
||||
|
||||
"CPU Frequency scaling" (CONFIG_CPU_FREQ) should be enabled to configure
|
39
Documentation/cpu-freq/index.rst
Normal file
39
Documentation/cpu-freq/index.rst
Normal file
@ -0,0 +1,39 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============================================================================
|
||||
Linux CPUFreq - CPU frequency and voltage scaling code in the Linux(TM) kernel
|
||||
==============================================================================
|
||||
|
||||
Author: Dominik Brodowski <linux@brodo.de>
|
||||
|
||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
||||
fly. This is a nice method to save battery power, because the lower
|
||||
the clock speed, the less power the CPU consumes.
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
core
|
||||
cpu-drivers
|
||||
cpufreq-stats
|
||||
|
||||
Mailing List
|
||||
------------
|
||||
There is a CPU frequency changing CVS commit and general list where
|
||||
you can report bugs, problems or submit patches. To post a message,
|
||||
send an email to linux-pm@vger.kernel.org.
|
||||
|
||||
Links
|
||||
-----
|
||||
the FTP archives:
|
||||
* ftp://ftp.linux.org.uk/pub/linux/cpufreq/
|
||||
|
||||
how to access the CVS repository:
|
||||
* http://cvs.arm.linux.org.uk/
|
||||
|
||||
the CPUFreq Mailing list:
|
||||
* http://vger.kernel.org/vger-lists.html#linux-pm
|
||||
|
||||
Clock and voltage scaling for the SA-1100:
|
||||
* http://www.lartmaker.nl/projects/scaling
|
@ -1,56 +0,0 @@
|
||||
CPU frequency and voltage scaling code in the Linux(TM) kernel
|
||||
|
||||
|
||||
L i n u x C P U F r e q
|
||||
|
||||
|
||||
|
||||
|
||||
Dominik Brodowski <linux@brodo.de>
|
||||
|
||||
|
||||
|
||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
||||
fly. This is a nice method to save battery power, because the lower
|
||||
the clock speed, the less power the CPU consumes.
|
||||
|
||||
|
||||
|
||||
Documents in this directory:
|
||||
----------------------------
|
||||
|
||||
amd-powernow.txt - AMD powernow driver specific file.
|
||||
|
||||
core.txt - General description of the CPUFreq core and
|
||||
of CPUFreq notifiers.
|
||||
|
||||
cpu-drivers.txt - How to implement a new cpufreq processor driver.
|
||||
|
||||
cpufreq-nforce2.txt - nVidia nForce2 platform specific file.
|
||||
|
||||
cpufreq-stats.txt - General description of sysfs cpufreq stats.
|
||||
|
||||
index.txt - File index, Mailing list and Links (this document)
|
||||
|
||||
pcc-cpufreq.txt - PCC cpufreq driver specific file.
|
||||
|
||||
|
||||
Mailing List
|
||||
------------
|
||||
There is a CPU frequency changing CVS commit and general list where
|
||||
you can report bugs, problems or submit patches. To post a message,
|
||||
send an email to linux-pm@vger.kernel.org.
|
||||
|
||||
Links
|
||||
-----
|
||||
the FTP archives:
|
||||
* ftp://ftp.linux.org.uk/pub/linux/cpufreq/
|
||||
|
||||
how to access the CVS repository:
|
||||
* http://cvs.arm.linux.org.uk/
|
||||
|
||||
the CPUFreq Mailing list:
|
||||
* http://vger.kernel.org/vger-lists.html#linux-pm
|
||||
|
||||
Clock and voltage scaling for the SA-1100:
|
||||
* http://www.lartmaker.nl/projects/scaling
|
@ -1,207 +0,0 @@
|
||||
/*
|
||||
* pcc-cpufreq.txt - PCC interface documentation
|
||||
*
|
||||
* Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
|
||||
* Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
|
||||
* Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
|
||||
*
|
||||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; version 2 of the License.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
|
||||
* INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
*/
|
||||
|
||||
|
||||
Processor Clocking Control Driver
|
||||
---------------------------------
|
||||
|
||||
Contents:
|
||||
---------
|
||||
1. Introduction
|
||||
1.1 PCC interface
|
||||
1.1.1 Get Average Frequency
|
||||
1.1.2 Set Desired Frequency
|
||||
1.2 Platforms affected
|
||||
2. Driver and /sys details
|
||||
2.1 scaling_available_frequencies
|
||||
2.2 cpuinfo_transition_latency
|
||||
2.3 cpuinfo_cur_freq
|
||||
2.4 related_cpus
|
||||
3. Caveats
|
||||
|
||||
1. Introduction:
|
||||
----------------
|
||||
Processor Clocking Control (PCC) is an interface between the platform
|
||||
firmware and OSPM. It is a mechanism for coordinating processor
|
||||
performance (ie: frequency) between the platform firmware and the OS.
|
||||
|
||||
The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC
|
||||
interface.
|
||||
|
||||
OS utilizes the PCC interface to inform platform firmware what frequency the
|
||||
OS wants for a logical processor. The platform firmware attempts to achieve
|
||||
the requested frequency. If the request for the target frequency could not be
|
||||
satisfied by platform firmware, then it usually means that power budget
|
||||
conditions are in place, and "power capping" is taking place.
|
||||
|
||||
1.1 PCC interface:
|
||||
------------------
|
||||
The complete PCC specification is available here:
|
||||
http://www.acpica.org/download/Processor-Clocking-Control-v1p0.pdf
|
||||
|
||||
PCC relies on a shared memory region that provides a channel for communication
|
||||
between the OS and platform firmware. PCC also implements a "doorbell" that
|
||||
is used by the OS to inform the platform firmware that a command has been
|
||||
sent.
|
||||
|
||||
The ACPI PCCH() method is used to discover the location of the PCC shared
|
||||
memory region. The shared memory region header contains the "command" and
|
||||
"status" interface. PCCH() also contains details on how to access the platform
|
||||
doorbell.
|
||||
|
||||
The following commands are supported by the PCC interface:
|
||||
* Get Average Frequency
|
||||
* Set Desired Frequency
|
||||
|
||||
The ACPI PCCP() method is implemented for each logical processor and is
|
||||
used to discover the offsets for the input and output buffers in the shared
|
||||
memory region.
|
||||
|
||||
When PCC mode is enabled, the platform will not expose processor performance
|
||||
or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore,
|
||||
the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for
|
||||
AMD) will not load.
|
||||
|
||||
However, OSPM remains in control of policy. The governor (eg: "ondemand")
|
||||
computes the required performance for each processor based on server workload.
|
||||
The PCC driver fills in the command interface, and the input buffer and
|
||||
communicates the request to the platform firmware. The platform firmware is
|
||||
responsible for delivering the requested performance.
|
||||
|
||||
Each PCC command is "global" in scope and can affect all the logical CPUs in
|
||||
the system. Therefore, PCC is capable of performing "group" updates. With PCC
|
||||
the OS is capable of getting/setting the frequency of all the logical CPUs in
|
||||
the system with a single call to the BIOS.
|
||||
|
||||
1.1.1 Get Average Frequency:
|
||||
----------------------------
|
||||
This command is used by the OSPM to query the running frequency of the
|
||||
processor since the last time this command was completed. The output buffer
|
||||
indicates the average unhalted frequency of the logical processor expressed as
|
||||
a percentage of the nominal (ie: maximum) CPU frequency. The output buffer
|
||||
also signifies if the CPU frequency is limited by a power budget condition.
|
||||
|
||||
1.1.2 Set Desired Frequency:
|
||||
----------------------------
|
||||
This command is used by the OSPM to communicate to the platform firmware the
|
||||
desired frequency for a logical processor. The output buffer is currently
|
||||
ignored by OSPM. The next invocation of "Get Average Frequency" will inform
|
||||
OSPM if the desired frequency was achieved or not.
|
||||
|
||||
1.2 Platforms affected:
|
||||
-----------------------
|
||||
The PCC driver will load on any system where the platform firmware:
|
||||
* supports the PCC interface, and the associated PCCH() and PCCP() methods
|
||||
* assumes responsibility for managing the hardware clocking controls in order
|
||||
to deliver the requested processor performance
|
||||
|
||||
Currently, certain HP ProLiant platforms implement the PCC interface. On those
|
||||
platforms PCC is the "default" choice.
|
||||
|
||||
However, it is possible to disable this interface via a BIOS setting. In
|
||||
such an instance, as is also the case on platforms where the PCC interface
|
||||
is not implemented, the PCC driver will fail to load silently.
|
||||
|
||||
2. Driver and /sys details:
|
||||
---------------------------
|
||||
When the driver loads, it merely prints the lowest and the highest CPU
|
||||
frequencies supported by the platform firmware.
|
||||
|
||||
The PCC driver loads with a message such as:
|
||||
pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933
|
||||
MHz
|
||||
|
||||
This means that the OPSM can request the CPU to run at any frequency in
|
||||
between the limits (1600 MHz, and 2933 MHz) specified in the message.
|
||||
|
||||
Internally, there is no need for the driver to convert the "target" frequency
|
||||
to a corresponding P-state.
|
||||
|
||||
The VERSION number for the driver will be of the format v.xy.ab.
|
||||
eg: 1.00.02
|
||||
----- --
|
||||
| |
|
||||
| -- this will increase with bug fixes/enhancements to the driver
|
||||
|-- this is the version of the PCC specification the driver adheres to
|
||||
|
||||
|
||||
The following is a brief discussion on some of the fields exported via the
|
||||
/sys filesystem and how their values are affected by the PCC driver:
|
||||
|
||||
2.1 scaling_available_frequencies:
|
||||
----------------------------------
|
||||
scaling_available_frequencies is not created in /sys. No intermediate
|
||||
frequencies need to be listed because the BIOS will try to achieve any
|
||||
frequency, within limits, requested by the governor. A frequency does not have
|
||||
to be strictly associated with a P-state.
|
||||
|
||||
2.2 cpuinfo_transition_latency:
|
||||
-------------------------------
|
||||
The cpuinfo_transition_latency field is 0. The PCC specification does
|
||||
not include a field to expose this value currently.
|
||||
|
||||
2.3 cpuinfo_cur_freq:
|
||||
---------------------
|
||||
A) Often cpuinfo_cur_freq will show a value different than what is declared
|
||||
in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq.
|
||||
This is due to "turbo boost" available on recent Intel processors. If certain
|
||||
conditions are met the BIOS can achieve a slightly higher speed than requested
|
||||
by OSPM. An example:
|
||||
|
||||
scaling_cur_freq : 2933000
|
||||
cpuinfo_cur_freq : 3196000
|
||||
|
||||
B) There is a round-off error associated with the cpuinfo_cur_freq value.
|
||||
Since the driver obtains the current frequency as a "percentage" (%) of the
|
||||
nominal frequency from the BIOS, sometimes, the values displayed by
|
||||
scaling_cur_freq and cpuinfo_cur_freq may not match. An example:
|
||||
|
||||
scaling_cur_freq : 1600000
|
||||
cpuinfo_cur_freq : 1583000
|
||||
|
||||
In this example, the nominal frequency is 2933 MHz. The driver obtains the
|
||||
current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency:
|
||||
|
||||
54% of 2933 MHz = 1583 MHz
|
||||
|
||||
Nominal frequency is the maximum frequency of the processor, and it usually
|
||||
corresponds to the frequency of the P0 P-state.
|
||||
|
||||
2.4 related_cpus:
|
||||
-----------------
|
||||
The related_cpus field is identical to affected_cpus.
|
||||
|
||||
affected_cpus : 4
|
||||
related_cpus : 4
|
||||
|
||||
Currently, the PCC driver does not evaluate _PSD. The platforms that support
|
||||
PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination
|
||||
to ensure that the same frequency is requested of all dependent CPUs.
|
||||
|
||||
3. Caveats:
|
||||
-----------
|
||||
The "cpufreq_stats" module in its present form cannot be loaded and
|
||||
expected to work with the PCC driver. Since the "cpufreq_stats" module
|
||||
provides information wrt each P-state, it is not applicable to the PCC driver.
|
@ -1,22 +0,0 @@
|
||||
Debugging Modules after 2.6.3
|
||||
-----------------------------
|
||||
|
||||
In almost all distributions, the kernel asks for modules which don't
|
||||
exist, such as "net-pf-10" or whatever. Changing "modprobe -q" to
|
||||
"succeed" in this case is hacky and breaks some setups, and also we
|
||||
want to know if it failed for the fallback code for old aliases in
|
||||
fs/char_dev.c, for example.
|
||||
|
||||
In the past a debugging message which would fill people's logs was
|
||||
emitted. This debugging message has been removed. The correct way
|
||||
of debugging module problems is something like this:
|
||||
|
||||
echo '#! /bin/sh' > /tmp/modprobe
|
||||
echo 'echo "$@" >> /tmp/modprobe.log' >> /tmp/modprobe
|
||||
echo 'exec /sbin/modprobe "$@"' >> /tmp/modprobe
|
||||
chmod a+x /tmp/modprobe
|
||||
echo /tmp/modprobe > /proc/sys/kernel/modprobe
|
||||
|
||||
Note that the above applies only when the *kernel* is requesting
|
||||
that the module be loaded -- it won't have any effect if that module
|
||||
is being loaded explicitly using "modprobe" from userspace.
|
@ -203,7 +203,7 @@ Cause
|
||||
may not correctly copy files from sysfs.
|
||||
|
||||
Solution
|
||||
Use ``cat``' to read ``.gcda`` files and ``cp -d`` to copy links.
|
||||
Use ``cat`` to read ``.gcda`` files and ``cp -d`` to copy links.
|
||||
Alternatively use the mechanism shown in Appendix B.
|
||||
|
||||
|
||||
|
@ -8,7 +8,8 @@ with the difference that the orphan objects are not freed but only
|
||||
reported via /sys/kernel/debug/kmemleak. A similar method is used by the
|
||||
Valgrind tool (``memcheck --leak-check``) to detect the memory leaks in
|
||||
user-space applications.
|
||||
Kmemleak is supported on x86, arm, powerpc, sparc, sh, microblaze, ppc, mips, s390 and tile.
|
||||
Kmemleak is supported on x86, arm, arm64, powerpc, sparc, sh, microblaze, mips,
|
||||
s390, nds32, arc and xtensa.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
@ -17,14 +17,23 @@ What is KUnit?
|
||||
==============
|
||||
|
||||
KUnit is a lightweight unit testing and mocking framework for the Linux kernel.
|
||||
These tests are able to be run locally on a developer's workstation without a VM
|
||||
or special hardware.
|
||||
|
||||
KUnit is heavily inspired by JUnit, Python's unittest.mock, and
|
||||
Googletest/Googlemock for C++. KUnit provides facilities for defining unit test
|
||||
cases, grouping related test cases into test suites, providing common
|
||||
infrastructure for running tests, and much more.
|
||||
|
||||
KUnit consists of a kernel component, which provides a set of macros for easily
|
||||
writing unit tests. Tests written against KUnit will run on kernel boot if
|
||||
built-in, or when loaded if built as a module. These tests write out results to
|
||||
the kernel log in `TAP <https://testanything.org/>`_ format.
|
||||
|
||||
To make running these tests (and reading the results) easier, KUnit offers
|
||||
:doc:`kunit_tool <kunit-tool>`, which builds a `User Mode Linux
|
||||
<http://user-mode-linux.sourceforge.net>`_ kernel, runs it, and parses the test
|
||||
results. This provides a quick way of running KUnit tests during development,
|
||||
without requiring a virtual machine or separate hardware.
|
||||
|
||||
Get started now: :doc:`start`
|
||||
|
||||
Why KUnit?
|
||||
@ -36,21 +45,20 @@ allow all possible code paths to be tested in the code under test; this is only
|
||||
possible if the code under test is very small and does not have any external
|
||||
dependencies outside of the test's control like hardware.
|
||||
|
||||
Outside of KUnit, there are no testing frameworks currently
|
||||
available for the kernel that do not require installing the kernel on a test
|
||||
machine or in a VM and all require tests to be written in userspace running on
|
||||
the kernel; this is true for Autotest, and kselftest, disqualifying
|
||||
any of them from being considered unit testing frameworks.
|
||||
KUnit provides a common framework for unit tests within the kernel.
|
||||
|
||||
KUnit addresses the problem of being able to run tests without needing a virtual
|
||||
machine or actual hardware with User Mode Linux. User Mode Linux is a Linux
|
||||
architecture, like ARM or x86; however, unlike other architectures it compiles
|
||||
to a standalone program that can be run like any other program directly inside
|
||||
of a host operating system; to be clear, it does not require any virtualization
|
||||
support; it is just a regular program.
|
||||
KUnit tests can be run on most architectures, and most tests are architecture
|
||||
independent. All built-in KUnit tests run on kernel startup. Alternatively,
|
||||
KUnit and KUnit tests can be built as modules and tests will run when the test
|
||||
module is loaded.
|
||||
|
||||
Alternatively, kunit and kunit tests can be built as modules and tests will
|
||||
run when the test module is loaded.
|
||||
.. note::
|
||||
|
||||
KUnit can also run tests without needing a virtual machine or actual
|
||||
hardware under User Mode Linux. User Mode Linux is a Linux architecture,
|
||||
like ARM or x86, which compiles the kernel as a Linux executable. KUnit
|
||||
can be used with UML either by building with ``ARCH=um`` (like any other
|
||||
architecture), or by using :doc:`kunit_tool <kunit-tool>`.
|
||||
|
||||
KUnit is fast. Excluding build time, from invocation to completion KUnit can run
|
||||
several dozen tests in only 10 to 20 seconds; this might not sound like a big
|
||||
@ -81,3 +89,5 @@ How do I use it?
|
||||
* :doc:`start` - for new users of KUnit
|
||||
* :doc:`usage` - for a more detailed explanation of KUnit features
|
||||
* :doc:`api/index` - for the list of KUnit APIs used for testing
|
||||
* :doc:`kunit-tool` - for more information on the kunit_tool helper script
|
||||
* :doc:`faq` - for answers to some common questions about KUnit
|
||||
|
@ -12,6 +12,13 @@ the Linux kernel as UML (`User Mode Linux
|
||||
<http://user-mode-linux.sourceforge.net/>`_), running KUnit tests, parsing
|
||||
the test results and displaying them in a user friendly manner.
|
||||
|
||||
kunit_tool addresses the problem of being able to run tests without needing a
|
||||
virtual machine or actual hardware with User Mode Linux. User Mode Linux is a
|
||||
Linux architecture, like ARM or x86; however, unlike other architectures it
|
||||
compiles the kernel as a standalone Linux executable that can be run like any
|
||||
other program directly inside of a host operating system. To be clear, it does
|
||||
not require any virtualization support: it is just a regular program.
|
||||
|
||||
What is a kunitconfig?
|
||||
======================
|
||||
|
||||
|
@ -9,11 +9,10 @@ Installing dependencies
|
||||
KUnit has the same dependencies as the Linux kernel. As long as you can build
|
||||
the kernel, you can run KUnit.
|
||||
|
||||
KUnit Wrapper
|
||||
=============
|
||||
Included with KUnit is a simple Python wrapper that helps format the output to
|
||||
easily use and read KUnit output. It handles building and running the kernel, as
|
||||
well as formatting the output.
|
||||
Running tests with the KUnit Wrapper
|
||||
====================================
|
||||
Included with KUnit is a simple Python wrapper which runs tests under User Mode
|
||||
Linux, and formats the test results.
|
||||
|
||||
The wrapper can be run with:
|
||||
|
||||
@ -21,22 +20,42 @@ The wrapper can be run with:
|
||||
|
||||
./tools/testing/kunit/kunit.py run --defconfig
|
||||
|
||||
For more information on this wrapper (also called kunit_tool) checkout the
|
||||
For more information on this wrapper (also called kunit_tool) check out the
|
||||
:doc:`kunit-tool` page.
|
||||
|
||||
Creating a .kunitconfig
|
||||
=======================
|
||||
The Python script is a thin wrapper around Kbuild. As such, it needs to be
|
||||
configured with a ``.kunitconfig`` file. This file essentially contains the
|
||||
regular Kernel config, with the specific test targets as well.
|
||||
-----------------------
|
||||
If you want to run a specific set of tests (rather than those listed in the
|
||||
KUnit defconfig), you can provide Kconfig options in the ``.kunitconfig`` file.
|
||||
This file essentially contains the regular Kernel config, with the specific
|
||||
test targets as well. The ``.kunitconfig`` should also contain any other config
|
||||
options required by the tests.
|
||||
|
||||
A good starting point for a ``.kunitconfig`` is the KUnit defconfig:
|
||||
.. code-block:: bash
|
||||
|
||||
cd $PATH_TO_LINUX_REPO
|
||||
cp arch/um/configs/kunit_defconfig .kunitconfig
|
||||
|
||||
Verifying KUnit Works
|
||||
---------------------
|
||||
You can then add any other Kconfig options you wish, e.g.:
|
||||
.. code-block:: none
|
||||
|
||||
CONFIG_LIST_KUNIT_TEST=y
|
||||
|
||||
:doc:`kunit_tool <kunit-tool>` will ensure that all config options set in
|
||||
``.kunitconfig`` are set in the kernel ``.config`` before running the tests.
|
||||
It'll warn you if you haven't included the dependencies of the options you're
|
||||
using.
|
||||
|
||||
.. note::
|
||||
Note that removing something from the ``.kunitconfig`` will not trigger a
|
||||
rebuild of the ``.config`` file: the configuration is only updated if the
|
||||
``.kunitconfig`` is not a subset of ``.config``. This means that you can use
|
||||
other tools (such as make menuconfig) to adjust other config options.
|
||||
|
||||
|
||||
Running the tests
|
||||
-----------------
|
||||
|
||||
To make sure that everything is set up correctly, simply invoke the Python
|
||||
wrapper from your kernel repo:
|
||||
@ -62,6 +81,41 @@ followed by a list of tests that are run. All of them should be passing.
|
||||
Because it is building a lot of sources for the first time, the
|
||||
``Building KUnit kernel`` step may take a while.
|
||||
|
||||
Running tests without the KUnit Wrapper
|
||||
=======================================
|
||||
|
||||
If you'd rather not use the KUnit Wrapper (if, for example, you need to
|
||||
integrate with other systems, or use an architecture other than UML), KUnit can
|
||||
be included in any kernel, and the results read out and parsed manually.
|
||||
|
||||
.. note::
|
||||
KUnit is not designed for use in a production system, and it's possible that
|
||||
tests may reduce the stability or security of the system.
|
||||
|
||||
|
||||
|
||||
Configuring the kernel
|
||||
----------------------
|
||||
|
||||
In order to enable KUnit itself, you simply need to enable the ``CONFIG_KUNIT``
|
||||
Kconfig option (it's under Kernel Hacking/Kernel Testing and Coverage in
|
||||
menuconfig). From there, you can enable any KUnit tests you want: they usually
|
||||
have config options ending in ``_KUNIT_TEST``.
|
||||
|
||||
KUnit and KUnit tests can be compiled as modules: in this case the tests in a
|
||||
module will be run when the module is loaded.
|
||||
|
||||
Running the tests
|
||||
-----------------
|
||||
|
||||
Build and run your kernel as usual. Test output will be written to the kernel
|
||||
log in `TAP <https://testanything.org/>`_ format.
|
||||
|
||||
.. note::
|
||||
It's possible that there will be other lines and/or data interspersed in the
|
||||
TAP output.
|
||||
|
||||
|
||||
Writing your first test
|
||||
=======================
|
||||
|
||||
|
@ -591,3 +591,17 @@ able to run one test case per invocation.
|
||||
|
||||
.. TODO(brendanhiggins@google.com): Add an actual example of an architecture
|
||||
dependent KUnit test.
|
||||
|
||||
KUnit debugfs representation
|
||||
============================
|
||||
When kunit test suites are initialized, they create an associated directory
|
||||
in /sys/kernel/debug/kunit/<test-suite>. The directory contains one file
|
||||
|
||||
- results: "cat results" displays results of each test case and the results
|
||||
of the entire suite for the last test run.
|
||||
|
||||
The debugfs representation is primarily of use when kunit test suites are
|
||||
run in a native environment, either as modules or builtin. Having a way
|
||||
to display results like this is valuable as otherwise results can be
|
||||
intermixed with other events in dmesg output. The maximum size of each
|
||||
results file is KUNIT_LOG_SIZE bytes (defined in include/kunit/test.h).
|
||||
|
3
Documentation/devicetree/bindings/.gitignore
vendored
3
Documentation/devicetree/bindings/.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
*.example.dts
|
||||
processed-schema.yaml
|
||||
processed-schema*.yaml
|
||||
|
@ -2,7 +2,6 @@
|
||||
DT_DOC_CHECKER ?= dt-doc-validate
|
||||
DT_EXTRACT_EX ?= dt-extract-example
|
||||
DT_MK_SCHEMA ?= dt-mk-schema
|
||||
DT_MK_SCHEMA_FLAGS := $(if $(DT_SCHEMA_FILES), -u)
|
||||
|
||||
quiet_cmd_chk_binding = CHKDT $(patsubst $(srctree)/%,%,$<)
|
||||
cmd_chk_binding = $(DT_DOC_CHECKER) -u $(srctree)/$(src) $< ; \
|
||||
@ -11,26 +10,35 @@ quiet_cmd_chk_binding = CHKDT $(patsubst $(srctree)/%,%,$<)
|
||||
$(obj)/%.example.dts: $(src)/%.yaml FORCE
|
||||
$(call if_changed,chk_binding)
|
||||
|
||||
DT_TMP_SCHEMA := processed-schema.yaml
|
||||
# Use full schemas when checking %.example.dts
|
||||
DT_TMP_SCHEMA := $(obj)/processed-schema-examples.yaml
|
||||
|
||||
quiet_cmd_mk_schema = SCHEMA $@
|
||||
cmd_mk_schema = $(DT_MK_SCHEMA) $(DT_MK_SCHEMA_FLAGS) -o $@ $(real-prereqs)
|
||||
|
||||
DT_DOCS = $(shell \
|
||||
DT_DOCS = $(addprefix $(src)/, \
|
||||
$(shell \
|
||||
cd $(srctree)/$(src) && \
|
||||
find * \( -name '*.yaml' ! \
|
||||
-name $(DT_TMP_SCHEMA) ! \
|
||||
-name 'processed-schema*' ! \
|
||||
-name '*.example.dt.yaml' \) \
|
||||
)
|
||||
))
|
||||
|
||||
DT_SCHEMA_FILES ?= $(addprefix $(src)/,$(DT_DOCS))
|
||||
DT_SCHEMA_FILES ?= $(DT_DOCS)
|
||||
|
||||
ifeq ($(CHECK_DTBS),)
|
||||
extra-y += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES))
|
||||
extra-y += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES))
|
||||
endif
|
||||
extra-$(CHECK_DT_BINDING) += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES))
|
||||
extra-$(CHECK_DT_BINDING) += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES))
|
||||
extra-$(CHECK_DT_BINDING) += processed-schema-examples.yaml
|
||||
|
||||
$(obj)/$(DT_TMP_SCHEMA): $(DT_SCHEMA_FILES) FORCE
|
||||
override DTC_FLAGS := \
|
||||
-Wno-avoid_unnecessary_addr_size \
|
||||
-Wno-graph_child_address
|
||||
|
||||
$(obj)/processed-schema-examples.yaml: $(DT_DOCS) FORCE
|
||||
$(call if_changed,mk_schema)
|
||||
|
||||
extra-y += $(DT_TMP_SCHEMA)
|
||||
$(obj)/processed-schema.yaml: DT_MK_SCHEMA_FLAGS := -u
|
||||
$(obj)/processed-schema.yaml: $(DT_SCHEMA_FILES) FORCE
|
||||
$(call if_changed,mk_schema)
|
||||
|
||||
extra-y += processed-schema.yaml
|
||||
|
@ -21,6 +21,8 @@ properties:
|
||||
required:
|
||||
- compatible
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
clkmgr@ffd04000 {
|
||||
|
@ -43,6 +43,8 @@ required:
|
||||
- compatible
|
||||
- reg
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
ao-secure@140 {
|
||||
|
86
Documentation/devicetree/bindings/arm/arm,integrator.yaml
Normal file
86
Documentation/devicetree/bindings/arm/arm,integrator.yaml
Normal file
@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/arm/arm,integrator.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: ARM Integrator Boards Device Tree Bindings
|
||||
|
||||
maintainers:
|
||||
- Linus Walleij <linus.walleij@linaro.org>
|
||||
|
||||
description: |+
|
||||
These were the first ARM platforms officially supported by ARM Ltd.
|
||||
They are ARMv4, ARMv5 and ARMv6-capable using different core tiles,
|
||||
so the system is modular and can host a variety of CPU tiles called
|
||||
"core tiles" and referred to in the device tree as "core modules".
|
||||
|
||||
properties:
|
||||
$nodename:
|
||||
const: '/'
|
||||
compatible:
|
||||
oneOf:
|
||||
- description: ARM Integrator Application Platform, this board has a PCI
|
||||
host and several PCI slots, as well as a number of slots for logical
|
||||
expansion modules, it is referred to as an "ASIC Development
|
||||
Motherboard" and is extended with custom FPGA and is intended for
|
||||
rapid prototyping. See ARM DUI 0098B. This board can physically come
|
||||
pre-packaged in a PC Tower form factor called Integrator/PP1 or a
|
||||
special metal fixture called Integrator/PP2, see ARM DUI 0169A.
|
||||
items:
|
||||
- const: arm,integrator-ap
|
||||
- description: ARM Integrator Compact Platform (HBI-0086), this board has
|
||||
a compact form factor and mainly consists of the bare minimum
|
||||
peripherals to make use of the core module. See ARM DUI 0159B.
|
||||
items:
|
||||
- const: arm,integrator-cp
|
||||
- description: ARM Integrator Standard Development Board (SDB) Platform,
|
||||
this board is a PCI-based board conforming to the Microsoft SDB
|
||||
(HARP) specification. See ARM DUI 0099A.
|
||||
items:
|
||||
- const: arm,integrator-sp
|
||||
|
||||
core-module@10000000:
|
||||
type: object
|
||||
description: the root node in the Integrator platforms must contain
|
||||
a core module child node. They are always at physical address
|
||||
0x10000000 in all the Integrator variants.
|
||||
properties:
|
||||
compatible:
|
||||
items:
|
||||
- const: arm,core-module-integrator
|
||||
- const: syscon
|
||||
- const: simple-mfd
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
|
||||
patternProperties:
|
||||
"^syscon@[0-9a-f]+$":
|
||||
description: All Integrator boards must provide a system controller as a
|
||||
node in the root of the device tree.
|
||||
type: object
|
||||
properties:
|
||||
compatible:
|
||||
items:
|
||||
- enum:
|
||||
- arm,integrator-ap-syscon
|
||||
- arm,integrator-cp-syscon
|
||||
- arm,integrator-sp-syscon
|
||||
- const: syscon
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- core-module@10000000
|
||||
|
||||
...
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user