Merge remote-tracking branch 'torvalds/master' into perf/core
This resolves a trivial merge conflict with c302378bc157f6a7 ("libbpf: Hashmap interface update to allow both long and void* keys/values"), where a function present upstream was removed in the perf tools development tree.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
commit 1a931707ad
@@ -222,6 +222,7 @@ ForEachMacros:
- 'for_each_component_dais'
- 'for_each_component_dais_safe'
- 'for_each_console'
- 'for_each_console_srcu'
- 'for_each_cpu'
- 'for_each_cpu_and'
- 'for_each_cpu_not'
@@ -440,8 +441,11 @@ ForEachMacros:
- 'inet_lhash2_for_each_icsk'
- 'inet_lhash2_for_each_icsk_continue'
- 'inet_lhash2_for_each_icsk_rcu'
- 'interval_tree_for_each_double_span'
- 'interval_tree_for_each_span'
- 'intlist__for_each_entry'
- 'intlist__for_each_entry_safe'
- 'iopt_for_each_contig_area'
- 'kcore_copy__for_each_phdr'
- 'key_for_each'
- 'key_for_each_safe'
@@ -535,6 +539,7 @@ ForEachMacros:
- 'perf_hpp_list__for_each_sort_list_safe'
- 'perf_pmu__for_each_hybrid_pmu'
- 'ping_portaddr_for_each_entry'
- 'ping_portaddr_for_each_entry_rcu'
- 'plist_for_each'
- 'plist_for_each_continue'
- 'plist_for_each_entry'
.gitignore (vendored) | 1
@@ -20,6 +20,7 @@
*.dtb
*.dtbo
*.dtb.S
*.dtbo.S
*.dwo
*.elf
*.gcno
.mailmap | 7
@@ -29,6 +29,7 @@ Alexandre Belloni <alexandre.belloni@bootlin.com> <alexandre.belloni@free-electr
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
Alex Hung <alexhung@gmail.com> <alex.hung@canonical.com>
Alex Shi <alexs@kernel.org> <alex.shi@intel.com>
Alex Shi <alexs@kernel.org> <alex.shi@linaro.org>
Alex Shi <alexs@kernel.org> <alex.shi@linux.alibaba.com>
@@ -227,6 +228,7 @@ Juha Yrjola <at solidboot.com>
Juha Yrjola <juha.yrjola@nokia.com>
Juha Yrjola <juha.yrjola@solidboot.com>
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
Iskren Chernev <me@iskren.info> <iskren.chernev@gmail.com>
Kalle Valo <kvalo@kernel.org> <kvalo@codeaurora.org>
Kalyan Thota <quic_kalyant@quicinc.com> <kalyan_t@codeaurora.org>
Kay Sievers <kay.sievers@vrfy.org>
@@ -286,6 +288,7 @@ Matthew Wilcox <willy@infradead.org> <willy@linux.intel.com>
Matthew Wilcox <willy@infradead.org> <willy@parisc-linux.org>
Matthias Fuchs <socketcan@esd.eu> <matthias.fuchs@esd.eu>
Matthieu CASTET <castet.matthieu@free.fr>
Matti Vaittinen <mazziesaccount@gmail.com> <matti.vaittinen@fi.rohmeurope.com>
Matt Ranostay <matt.ranostay@konsulko.com> <matt@ranostay.consulting>
Matt Ranostay <mranostay@gmail.com> Matthew Ranostay <mranostay@embeddedalley.com>
Matt Ranostay <mranostay@gmail.com> <matt.ranostay@intel.com>
@@ -371,6 +374,8 @@ Ricardo Ribalda <ribalda@kernel.org> <ricardo.ribalda@gmail.com>
Roman Gushchin <roman.gushchin@linux.dev> <guro@fb.com>
Roman Gushchin <roman.gushchin@linux.dev> <guroan@gmail.com>
Roman Gushchin <roman.gushchin@linux.dev> <klamm@yandex-team.ru>
Muchun Song <muchun.song@linux.dev> <songmuchun@bytedance.com>
Muchun Song <muchun.song@linux.dev> <smuchun@gmail.com>
Ross Zwisler <zwisler@kernel.org> <ross.zwisler@linux.intel.com>
Rudolf Marek <R.Marek@sh.cvut.cz>
Rui Saraiva <rmps@joel.ist.utl.pt>
@@ -382,6 +387,7 @@ Santosh Shilimkar <santosh.shilimkar@oracle.org>
Santosh Shilimkar <ssantosh@kernel.org>
Sarangdhar Joshi <spjoshi@codeaurora.org>
Sascha Hauer <s.hauer@pengutronix.de>
Satya Priya <quic_c_skakit@quicinc.com> <skakit@codeaurora.org>
S.Çağlar Onur <caglar@pardus.org.tr>
Sean Christopherson <seanjc@google.com> <sean.j.christopherson@intel.com>
Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
@@ -389,6 +395,7 @@ Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
Sedat Dilek <sedat.dilek@gmail.com> <sedat.dilek@credativ.de>
Seth Forshee <sforshee@kernel.org> <seth.forshee@canonical.com>
Shannon Nelson <shannon.nelson@amd.com> <snelson@pensando.io>
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
Documentation/ABI/testing/debugfs-dell-wmi-ddv (new file) | 21
@@ -0,0 +1,21 @@
What: /sys/kernel/debug/dell-wmi-ddv-<wmi_device_name>/fan_sensor_information
Date: September 2022
KernelVersion: 6.1
Contact: Armin Wolf <W_Armin@gmx.de>
Description:
This file contains the contents of the fan sensor information buffer,
which contains fan sensor entries and a terminating character (0xFF).

Each fan sensor entry consists of three bytes with an unknown meaning,
interested people may use this file for reverse-engineering.

What: /sys/kernel/debug/dell-wmi-ddv-<wmi_device_name>/thermal_sensor_information
Date: September 2022
KernelVersion: 6.1
Contact: Armin Wolf <W_Armin@gmx.de>
Description:
This file contains the contents of the thermal sensor information buffer,
which contains thermal sensor entries and a terminating character (0xFF).

Each thermal sensor entry consists of five bytes with an unknown meaning,
interested people may use this file for reverse-engineering.
@@ -1,18 +0,0 @@
What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7]
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:

The pktcdvd module (packet writing driver) creates
these files in debugfs:

/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/

==== ====== ====================================
info 0444 Lots of driver statistics and infos.
==== ====== ====================================

Example::

  cat /sys/kernel/debug/pktcdvd/pktcdvd0/info
@@ -137,3 +137,17 @@ Description:
The writeback_limit file is read-write and specifies the maximum
amount of writeback ZRAM can do. The limit could be changed
in run time.

What: /sys/block/zram<id>/recomp_algorithm
Date: November 2022
Contact: Sergey Senozhatsky <senozhatsky@chromium.org>
Description:
The recomp_algorithm file is read-write and allows to set
or show secondary compression algorithms.

What: /sys/block/zram<id>/recompress
Date: November 2022
Contact: Sergey Senozhatsky <senozhatsky@chromium.org>
Description:
The recompress file is write-only and triggers re-compression
with secondary compression algorithms.
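
A hypothetical usage sketch for these two attributes; the device name zram0 and the exact token syntax accepted by the files are assumptions here, not something specified in this hunk::

  # register zstd as a secondary (recompression) algorithm for zram0
  echo "algo=zstd" > /sys/block/zram0/recomp_algorithm
  cat /sys/block/zram0/recomp_algorithm
  # trigger re-compression of idle pages with the secondary algorithm
  echo "type=idle" > /sys/block/zram0/recompress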

@@ -41,3 +41,17 @@ KernelVersion: 5.18
Contact: Kajol Jain <kjain@linux.ibm.com>
Description: (RO) This sysfs file exposes the cpumask which is designated to
retrieve nvdimm pmu event counter data.

What: /sys/bus/nd/devices/nmemX/cxl/id
Date: November 2022
KernelVersion: 6.2
Contact: Dave Jiang <dave.jiang@intel.com>
Description: (RO) Show the id (serial) of the device. This is CXL specific.

What: /sys/bus/nd/devices/nmemX/cxl/provider
Date: November 2022
KernelVersion: 6.2
Contact: Dave Jiang <dave.jiang@intel.com>
Description: (RO) Shows the CXL bridge device that ties to a CXL memory device
to this NVDIMM device. I.e. the parent of the device returned is
a /sys/bus/cxl/devices/memX instance.
@@ -407,6 +407,16 @@ Description:
file contains a '1' if the memory has been published for
use outside the driver that owns the device.

What: /sys/bus/pci/devices/.../p2pmem/allocate
Date: August 2022
Contact: Logan Gunthorpe <logang@deltatee.com>
Description:
This file allows mapping p2pmem into userspace. For each
mmap() call on this file, the kernel will allocate a chunk
of Peer-to-Peer memory for use in Peer-to-Peer transactions.
This memory can be used in O_DIRECT calls to NVMe backed
files for Peer-to-Peer copies.

What: /sys/bus/pci/devices/.../link/clkpm
/sys/bus/pci/devices/.../link/l0s_aspm
/sys/bus/pci/devices/.../link/l1_aspm
@@ -5,6 +5,9 @@ Contact: linux-mtd@lists.infradead.org
Description: (RO) The JEDEC ID of the SPI NOR flash as reported by the
flash device.

The attribute is not present if the flash doesn't support
the "Read JEDEC ID" command (9Fh). This is the case for
non-JEDEC compliant flashes.

What: /sys/bus/spi/devices/.../spi-nor/manufacturer
Date: April 2021
@@ -12,6 +15,9 @@ KernelVersion: 5.14
Contact: linux-mtd@lists.infradead.org
Description: (RO) Manufacturer of the SPI NOR flash.

The attribute is not present if the flash device isn't
known to the kernel and is only probed by its SFDP
tables.

What: /sys/bus/spi/devices/.../spi-nor/partname
Date: April 2021
@@ -44,6 +44,21 @@ Description:

(read-write)

What: /sys/class/bdi/<bdi>/min_ratio_fine
Date: November 2022
Contact: Stefan Roesch <shr@devkernel.io>
Description:
Under normal circumstances each device is given a part of the
total write-back cache that relates to its current average
writeout speed in relation to the other devices.

The 'min_ratio_fine' parameter allows assigning a minimum reserve
of the write-back cache to a particular device. The value is
expressed as part of 1 million. For example, this is useful for
providing a minimum QoS.

(read-write)

What: /sys/class/bdi/<bdi>/max_ratio
Date: January 2008
Contact: Peter Zijlstra <a.p.zijlstra@chello.nl>
@@ -55,6 +70,59 @@ Description:
mount that is prone to get stuck, or a FUSE mount which cannot
be trusted to play fair.

(read-write)

What: /sys/class/bdi/<bdi>/max_ratio_fine
Date: November 2022
Contact: Stefan Roesch <shr@devkernel.io>
Description:
Allows limiting a particular device to use not more than the
given value of the write-back cache. The value is given as part
of 1 million. This is useful in situations where we want to avoid
one device taking all or most of the write-back cache. For example
in case of an NFS mount that is prone to get stuck, or a FUSE mount
which cannot be trusted to play fair.

(read-write)

What: /sys/class/bdi/<bdi>/min_bytes
Date: October 2022
Contact: Stefan Roesch <shr@devkernel.io>
Description:
Under normal circumstances each device is given a part of the
total write-back cache that relates to its current average
writeout speed in relation to the other devices.

The 'min_bytes' parameter allows assigning a minimum
percentage of the write-back cache to a particular device
expressed in bytes.
For example, this is useful for providing a minimum QoS.

(read-write)

What: /sys/class/bdi/<bdi>/max_bytes
Date: October 2022
Contact: Stefan Roesch <shr@devkernel.io>
Description:
Allows limiting a particular device to use not more than the
given 'max_bytes' of the write-back cache. This is useful in
situations where we want to avoid one device taking all or
most of the write-back cache. For example in case of an NFS
mount that is prone to get stuck, a FUSE mount which cannot be
trusted to play fair, or a nbd device.

(read-write)

What: /sys/class/bdi/<bdi>/strict_limit
Date: October 2022
Contact: Stefan Roesch <shr@devkernel.io>
Description:
Forces per-BDI checks for the share of given device in the write-back
cache even before the global background dirty limit is reached. This
is useful in situations where the global limit is much higher than
affordable for given relatively slow (or untrusted) device. Turning
strictlimit on has no visible effect if max_ratio is equal to 100%.

(read-write)
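
A hypothetical tuning sketch for these new bdi knobs; the device 254:0 and the values are illustrative only::

  # reserve roughly 0.1% of the write-back cache for this device (parts per million)
  echo 1000 > /sys/class/bdi/254:0/min_ratio_fine
  # cap the device at 64 MiB of write-back cache
  echo $((64 * 1024 * 1024)) > /sys/class/bdi/254:0/max_bytes
  # enforce the per-BDI limit even before the global background dirty limit is hit
  echo 1 > /sys/class/bdi/254:0/strict_limit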
What: /sys/class/bdi/<bdi>/stable_pages_required
Date: January 2008
@@ -1,97 +0,0 @@
sysfs interface
---------------
The pktcdvd module (packet writing driver) creates the following files in the
sysfs: (<devid> is in the format major:minor)

What: /sys/class/pktcdvd/add
What: /sys/class/pktcdvd/remove
What: /sys/class/pktcdvd/device_map
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:

========== ==============================================
add (WO) Write a block device id (major:minor) to
create a new pktcdvd device and map it to the
block device.

remove (WO) Write the pktcdvd device id (major:minor)
to remove the pktcdvd device.

device_map (RO) Shows the device mapping in format:
pktcdvd[0-7] <pktdevid> <blkdevid>
========== ==============================================


What: /sys/class/pktcdvd/pktcdvd[0-7]/dev
What: /sys/class/pktcdvd/pktcdvd[0-7]/uevent
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:
dev: (RO) Device id

uevent: (WO) To send a uevent


What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_started
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_finished
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_written
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read_gather
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/reset
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:
packets_started: (RO) Number of started packets.

packets_finished: (RO) Number of finished packets.

kb_written: (RO) kBytes written.

kb_read: (RO) kBytes read.

kb_read_gather: (RO) kBytes read to fill write packets.

reset: (WO) Write any value to it to reset
pktcdvd device statistic values, like
bytes read/written.


What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/size
What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_off
What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_on
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:
============== ================================================
size (RO) Contains the size of the bio write queue.

congestion_off (RW) If bio write queue size is below this mark,
accept new bio requests from the block layer.

congestion_on (RW) If bio write queue size is higher as this
mark, do no longer accept bio write requests
from the block layer and wait till the pktcdvd
device has processed enough bio's so that bio
write queue size is below congestion off mark.
A value of <= 0 disables congestion control.
============== ================================================


Example:
--------
To use the pktcdvd sysfs interface directly, you can do::

  # create a new pktcdvd device mapped to /dev/hdc
  echo "22:0" >/sys/class/pktcdvd/add
  cat /sys/class/pktcdvd/device_map
  # assuming device pktcdvd0 was created, look at stat's
  cat /sys/class/pktcdvd/pktcdvd0/stat/kb_written
  # print the device id of the mapped block device
  fgrep pktcdvd0 /sys/class/pktcdvd/device_map
  # remove device, using pktcdvd0 device id 253:0
  echo "253:0" >/sys/class/pktcdvd/remove
@@ -1,6 +1,6 @@
What: /sys/devices/uncore_iio_x/dieX
Date: February 2020
Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com>
Contact: Alexander Antonov <alexander.antonov@linux.intel.com>
Description:
Each IIO stack (PCIe root port) has its own IIO PMON block, so
each dieX file (where X is die number) holds "Segment:Root Bus"
@@ -32,3 +32,31 @@ Description:
IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000

What: /sys/devices/uncore_upi_x/dieX
Date: March 2022
Contact: Alexander Antonov <alexander.antonov@linux.intel.com>
Description:
Each /sys/devices/uncore_upi_X/dieY file holds "upi_Z,die_W"
value that means UPI link number X on die Y is connected to UPI
link Z on die W and this link between sockets can be monitored
by UPI PMON block.
For example, 4-die Sapphire Rapids platform has the following
UPI 0 topology::

  # tail /sys/devices/uncore_upi_0/die*
  ==> /sys/devices/uncore_upi_0/die0 <==
  upi_1,die_1
  ==> /sys/devices/uncore_upi_0/die1 <==
  upi_0,die_3
  ==> /sys/devices/uncore_upi_0/die2 <==
  upi_1,die_3
  ==> /sys/devices/uncore_upi_0/die3 <==
  upi_0,die_1

Which means::

  UPI link 0 on die 0 is connected to UPI link 1 on die 1
  UPI link 0 on die 1 is connected to UPI link 0 on die 3
  UPI link 0 on die 2 is connected to UPI link 1 on die 3
  UPI link 0 on die 3 is connected to UPI link 0 on die 1
Documentation/ABI/testing/sysfs-driver-intel-i915-hwmon (new file) | 75
@@ -0,0 +1,75 @@
What: /sys/devices/.../hwmon/hwmon<i>/in0_input
Date: February 2023
KernelVersion: 6.2
Contact: intel-gfx@lists.freedesktop.org
Description: RO. Current Voltage in millivolt.

Only supported for particular Intel i915 graphics platforms.

What: /sys/devices/.../hwmon/hwmon<i>/power1_max
Date: February 2023
KernelVersion: 6.2
Contact: intel-gfx@lists.freedesktop.org
Description: RW. Card reactive sustained (PL1/Tau) power limit in microwatts.

The power controller will throttle the operating frequency
if the power averaged over a window (typically seconds)
exceeds this limit.

Only supported for particular Intel i915 graphics platforms.

What: /sys/devices/.../hwmon/hwmon<i>/power1_rated_max
Date: February 2023
KernelVersion: 6.2
Contact: intel-gfx@lists.freedesktop.org
Description: RO. Card default power limit (default TDP setting).

Only supported for particular Intel i915 graphics platforms.

What: /sys/devices/.../hwmon/hwmon<i>/power1_max_interval
Date: February 2023
KernelVersion: 6.2
Contact: intel-gfx@lists.freedesktop.org
Description: RW. Sustained power limit interval (Tau in PL1/Tau) in
milliseconds over which sustained power is averaged.

Only supported for particular Intel i915 graphics platforms.

What: /sys/devices/.../hwmon/hwmon<i>/power1_crit
Date: February 2023
KernelVersion: 6.2
Contact: intel-gfx@lists.freedesktop.org
Description: RW. Card reactive critical (I1) power limit in microwatts.

Card reactive critical (I1) power limit in microwatts is exposed
for client products. The power controller will throttle the
operating frequency if the power averaged over a window exceeds
this limit.

Only supported for particular Intel i915 graphics platforms.

What: /sys/devices/.../hwmon/hwmon<i>/curr1_crit
Date: February 2023
KernelVersion: 6.2
Contact: intel-gfx@lists.freedesktop.org
Description: RW. Card reactive critical (I1) power limit in milliamperes.

Card reactive critical (I1) power limit in milliamperes is
exposed for server products. The power controller will throttle
the operating frequency if the power averaged over a window
exceeds this limit.

Only supported for particular Intel i915 graphics platforms.

What: /sys/devices/.../hwmon/hwmon<i>/energy1_input
Date: February 2023
KernelVersion: 6.2
Contact: intel-gfx@lists.freedesktop.org
Description: RO. Energy input of device or gt in microjoules.

For i915 device level hwmon devices (name "i915") this
reflects energy input for the entire device. For gt level
hwmon devices (name "i915_gtN") this reflects energy input
for the gt.

Only supported for particular Intel i915 graphics platforms.
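
A hypothetical read/tune sketch for this hwmon interface; the hwmon index and the 50 W value are illustrative only::

  H=/sys/class/hwmon/hwmon2        # assumed hwmon device whose name is "i915"
  cat "$H/in0_input"               # GPU voltage in millivolts
  cat "$H/energy1_input"           # accumulated energy in microjoules
  echo 50000000 > "$H/power1_max"  # set the PL1 sustained limit to 50 W (microwatts)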

@@ -4,21 +4,21 @@ KernelVersion: 5.18
Contact: "David E. Box" <david.e.box@linux.intel.com>
Description:
This directory contains interface files for accessing Intel
Software Defined Silicon (SDSi) features on a CPU. X
represents the socket instance (though not the socket ID).
The socket ID is determined by reading the registers file
and decoding it per the specification.
On Demand (formerly Software Defined Silicon or SDSi) features
on a CPU. X represents the socket instance (though not the
socket ID). The socket ID is determined by reading the
registers file and decoding it per the specification.

Some files communicate with SDSi hardware through a mailbox.
Should the operation fail, one of the following error codes
may be returned:
Some files communicate with On Demand hardware through a
mailbox. Should the operation fail, one of the following error
codes may be returned:

========== =====
Error Code Cause
========== =====
EIO General mailbox failure. Log may indicate cause.
EBUSY Mailbox is owned by another agent.
EPERM SDSI capability is not enabled in hardware.
EPERM On Demand capability is not enabled in hardware.
EPROTO Failure in mailbox protocol detected by driver.
See log for details.
EOVERFLOW For provision commands, the size of the data
@@ -54,8 +54,8 @@ KernelVersion: 5.18
Contact: "David E. Box" <david.e.box@linux.intel.com>
Description:
(WO) Used to write an Authentication Key Certificate (AKC) to
the SDSi NVRAM for the CPU. The AKC is used to authenticate a
Capability Activation Payload. Mailbox command.
the On Demand NVRAM for the CPU. The AKC is used to authenticate
a Capability Activation Payload. Mailbox command.

What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/provision_cap
Date: Feb 2022
@@ -63,17 +63,28 @@ KernelVersion: 5.18
Contact: "David E. Box" <david.e.box@linux.intel.com>
Description:
(WO) Used to write a Capability Activation Payload (CAP) to the
SDSi NVRAM for the CPU. CAPs are used to activate a given CPU
feature. A CAP is validated by SDSi hardware using a previously
provisioned AKC file. Upon successful authentication, the CPU
configuration is updated. A cold reboot is required to fully
activate the feature. Mailbox command.
On Demand NVRAM for the CPU. CAPs are used to activate a given
CPU feature. A CAP is validated by On Demand hardware using a
previously provisioned AKC file. Upon successful authentication,
the CPU configuration is updated. A cold reboot is required to
fully activate the feature. Mailbox command.

What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/meter_certificate
Date: Nov 2022
KernelVersion: 6.2
Contact: "David E. Box" <david.e.box@linux.intel.com>
Description:
(RO) Used to read back the current meter certificate for the CPU
from Intel On Demand hardware. The meter certificate contains
utilization metrics of On Demand enabled features. Mailbox
command.

What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/state_certificate
Date: Feb 2022
KernelVersion: 5.18
Contact: "David E. Box" <david.e.box@linux.intel.com>
Description:
(RO) Used to read back the current State Certificate for the CPU
from SDSi hardware. The State Certificate contains information
about the current licenses on the CPU. Mailbox command.
(RO) Used to read back the current state certificate for the CPU
from On Demand hardware. The state certificate contains
information about the current licenses on the CPU. Mailbox
command.
@@ -99,6 +99,12 @@ Description: Controls the issue rate of discard commands that consist of small
checkpoint is triggered, and issued during the checkpoint.
By default, it is disabled with 0.

What: /sys/fs/f2fs/<disk>/max_ordered_discard
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: Controls the maximum ordered discard, the unit size is one block(4KB).
Set it to 16 by default.

What: /sys/fs/f2fs/<disk>/max_discard_request
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
@@ -132,7 +138,8 @@ Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Controls discard granularity of inner discard thread. Inner thread
will not issue discards with size that is smaller than granularity.
The unit size is one block(4KB), now only support configuring
in range of [1, 512]. Default value is 4(=16KB).
in range of [1, 512]. Default value is 16.
For small devices, default value is 1.

What: /sys/fs/f2fs/<disk>/umount_discard_timeout
Date: January 2019
@@ -235,7 +242,7 @@ Description: Shows total written kbytes issued to disk.
What: /sys/fs/f2fs/<disk>/features
Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/
Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/>
Shows all enabled features in current device.
Supported features:
encryption, blkzoned, extra_attr, projquota, inode_checksum,
@@ -592,10 +599,10 @@ Description: With "mode=fragment:block" mount options, we can scatter block allo
in the length of 1..<max_fragment_hole> by turns. This value can be set
between 1..512 and the default value is 4.

What: /sys/fs/f2fs/<disk>/gc_urgent_high_remaining
Date: December 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can set the trial count limit for GC urgent high mode with this value.
What: /sys/fs/f2fs/<disk>/gc_remaining_trials
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: You can set the trial count limit for GC urgent and idle mode with this value.
If GC thread gets to the limit, the mode will turn back to GC normal mode.
By default, the value is zero, which means there is no limit like before.

@@ -634,3 +641,31 @@ Date: July 2022
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the accumulated total revoked atomic write block count after boot.
If you write "0" here, you can initialize to "0".

What: /sys/fs/f2fs/<disk>/gc_mode
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: Show the current gc_mode as a string.
This is a read-only entry.

What: /sys/fs/f2fs/<disk>/discard_urgent_util
Date: November 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: When space utilization exceeds this, do background DISCARD aggressively.
Does DISCARD forcibly in a period of given min_discard_issue_time when the number
of discards is not 0 and set discard granularity to 1.
Default: 80

What: /sys/fs/f2fs/<disk>/hot_data_age_threshold
Date: November 2022
Contact: "Ping Xiong" <xiongping1@xiaomi.com>
Description: When DATA SEPARATION is on, it controls the age threshold to indicate
the data blocks as hot. By default it was initialized as 262144 blocks
(equals to 1GB).

What: /sys/fs/f2fs/<disk>/warm_data_age_threshold
Date: November 2022
Contact: "Ping Xiong" <xiongping1@xiaomi.com>
Description: When DATA SEPARATION is on, it controls the age threshold to indicate
the data blocks as warm. By default it was initialized as 2621440 blocks
(equals to 10GB).
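
A hypothetical tuning sketch for the new f2fs knobs; the device name sda1 and the values are illustrative only::

  F=/sys/fs/f2fs/sda1
  cat "$F/gc_mode"                           # read-only: current GC mode as a string
  echo 60 > "$F/discard_urgent_util"         # discard aggressively above 60% space utilization
  echo 131072 > "$F/hot_data_age_threshold"  # 131072 blocks of 4KB (~512MB) age threshold for hot data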

@@ -27,6 +27,10 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or
makes the kdamond reads the user inputs in the sysfs files
except 'state' again. Writing 'update_schemes_stats' to the
file updates contents of schemes stats files of the kdamond.
Writing 'update_schemes_tried_regions' to the file updates
contents of 'tried_regions' directory of every scheme directory
of this kdamond. Writing 'clear_schemes_tried_regions' to the
file removes contents of the 'tried_regions' directory.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/pid
Date: Mar 2022
@@ -283,3 +287,31 @@ Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the number of the exceed events of
the scheme's quotas.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/start
Date: Oct 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the start address of a memory region
that corresponding DAMON-based Operation Scheme's action has
tried to be applied.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/end
Date: Oct 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the end address of a memory region
that corresponding DAMON-based Operation Scheme's action has
tried to be applied.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/nr_accesses
Date: Oct 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the 'nr_accesses' of a memory region
that corresponding DAMON-based Operation Scheme's action has
tried to be applied.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/age
Date: Oct 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the 'age' of a memory region that
corresponding DAMON-based Operation Scheme's action has tried
to be applied.
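
A hypothetical inspection sketch for the new tried_regions files; the kdamond, context, scheme, and region indices are illustrative::

  D=/sys/kernel/mm/damon/admin/kdamonds/0
  echo update_schemes_tried_regions > "$D/state"   # snapshot the regions the schemes acted on
  R="$D/contexts/0/schemes/0/tried_regions/0"
  cat "$R/start" "$R/end" "$R/nr_accesses" "$R/age"
  echo clear_schemes_tried_regions > "$D/state"    # drop the snapshot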

Documentation/ABI/testing/sysfs-kernel-oops_count (new file) | 6
@@ -0,0 +1,6 @@
What: /sys/kernel/oops_count
Date: November 2022
KernelVersion: 6.2.0
Contact: Linux Kernel Hardening List <linux-hardening@vger.kernel.org>
Description:
Shows how many times the system has Oopsed since last boot.
Documentation/ABI/testing/sysfs-kernel-warn_count (new file) | 6
@@ -0,0 +1,6 @@
What: /sys/kernel/warn_count
Date: November 2022
KernelVersion: 6.2.0
Contact: Linux Kernel Hardening List <linux-hardening@vger.kernel.org>
Description:
Shows how many times the system has Warned since last boot.
Documentation/ABI/testing/sysfs-platform-dell-wmi-ddv (new file) | 7
@@ -0,0 +1,7 @@
What: /sys/class/power_supply/<battery_name>/eppid
Date: September 2022
KernelVersion: 6.1
Contact: Armin Wolf <W_Armin@gmx.de>
Description:
Reports the Dell ePPID (electronic Dell Piece Part Identification)
of the ACPI battery.
@@ -1,39 +1,41 @@
What: /sys/devices/virtual/misc/intel_ifs_<N>/run_test
Date: April 21 2022
KernelVersion: 5.19
Date: Nov 16 2022
KernelVersion: 6.2
Contact: "Jithu Joseph" <jithu.joseph@intel.com>
Description: Write <cpu#> to trigger IFS test for one online core.
Note that the test is per core. The cpu# can be
for any thread on the core. Running on one thread
completes the test for the core containing that thread.
Example: to test the core containing cpu5: echo 5 >
/sys/devices/platform/intel_ifs.<N>/run_test
/sys/devices/virtual/misc/intel_ifs_<N>/run_test

What: /sys/devices/virtual/misc/intel_ifs_<N>/status
Date: April 21 2022
KernelVersion: 5.19
Date: Nov 16 2022
KernelVersion: 6.2
Contact: "Jithu Joseph" <jithu.joseph@intel.com>
Description: The status of the last test. It can be one of "pass", "fail"
or "untested".

What: /sys/devices/virtual/misc/intel_ifs_<N>/details
Date: April 21 2022
KernelVersion: 5.19
Date: Nov 16 2022
KernelVersion: 6.2
Contact: "Jithu Joseph" <jithu.joseph@intel.com>
Description: Additional information regarding the last test. The details file reports
the hex value of the SCAN_STATUS MSR. Note that the error_code field
may contain driver defined software code not defined in the Intel SDM.

What: /sys/devices/virtual/misc/intel_ifs_<N>/image_version
Date: April 21 2022
KernelVersion: 5.19
Date: Nov 16 2022
KernelVersion: 6.2
Contact: "Jithu Joseph" <jithu.joseph@intel.com>
Description: Version (hexadecimal) of loaded IFS binary image. If no scan image
is loaded reports "none".

What: /sys/devices/virtual/misc/intel_ifs_<N>/reload
Date: April 21 2022
KernelVersion: 5.19
What: /sys/devices/virtual/misc/intel_ifs_<N>/current_batch
Date: Nov 16 2022
KernelVersion: 6.2
Contact: "Jithu Joseph" <jithu.joseph@intel.com>
Description: Write "1" (or "y" or "Y") to reload the IFS image from
/lib/firmware/intel/ifs/ff-mm-ss.scan.
Description: Write a number less than or equal to 0xff to load an IFS test image.
The number written is treated as the 2 digit suffix in the following file name:
/lib/firmware/intel/ifs_<N>/ff-mm-ss-02x.scan
Reading the file will provide the suffix of the currently loaded IFS test image.
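
A hypothetical test-run sketch against the updated interface; the instance number 0, batch 0x01, and cpu 5 are illustrative::

  I=/sys/devices/virtual/misc/intel_ifs_0
  echo 0x01 > "$I/current_batch"   # loads the image whose 2 digit suffix is 01, per the description above
  cat "$I/image_version"
  echo 5 > "$I/run_test"           # test the core containing cpu5
  cat "$I/status" "$I/details"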

@@ -95,6 +95,15 @@ htmldocs:
@$(srctree)/scripts/sphinx-pre-install --version-check
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))

texinfodocs:
@$(srctree)/scripts/sphinx-pre-install --version-check
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,texinfo,$(var),texinfo,$(var)))

# Note: the 'info' Make target is generated by sphinx itself when
# running the texinfodocs target defined above.
infodocs: texinfodocs
$(MAKE) -C $(BUILDDIR)/texinfo info
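
A hypothetical build invocation for the two new targets; the SPHINXDIRS value is illustrative::

  make SPHINXDIRS="core-api" texinfodocs
  make SPHINXDIRS="core-api" infodocs   # runs sphinx, then 'make info' in the texinfo build dir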

linkcheckdocs:
@$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))

@@ -143,6 +152,8 @@ cleandocs:
dochelp:
@echo ' Linux kernel internal documentation in different formats from ReST:'
@echo ' htmldocs - HTML'
@echo ' texinfodocs - Texinfo'
@echo ' infodocs - Info'
@echo ' latexdocs - LaTeX'
@echo ' pdfdocs - PDF'
@echo ' epubdocs - EPUB'
@@ -285,3 +285,13 @@ to bridges between the PCI root and the device, MSIs are disabled.
It is also worth checking the device driver to see whether it supports MSIs.
For example, it may contain calls to pci_alloc_irq_vectors() with the
PCI_IRQ_MSI or PCI_IRQ_MSIX flags.


List of device drivers MSI(-X) APIs
===================================

The PCI/MSI subsystem has a dedicated C file for its exported device driver
APIs — `drivers/pci/msi/api.c`. The following functions are exported:

.. kernel-doc:: drivers/pci/msi/api.c
:export:
@@ -83,6 +83,7 @@ This structure has the form::
int (*mmio_enabled)(struct pci_dev *dev);
int (*slot_reset)(struct pci_dev *dev);
void (*resume)(struct pci_dev *dev);
void (*cor_error_detected)(struct pci_dev *dev);
};

The possible channel states are::
@@ -422,5 +423,11 @@ That is, the recovery API only requires that:
- drivers/net/cxgb3
- drivers/net/s2io.c

The cor_error_detected() callback is invoked in handle_error_source() when
the error severity is "correctable". The callback is optional and allows
additional logging to be done if desired. See example:

- drivers/cxl/pci.c

The End
-------
@@ -1858,7 +1858,7 @@ unloaded. After a given module has been unloaded, any attempt to call
one of its functions results in a segmentation fault. The module-unload
functions must therefore cancel any delayed calls to loadable-module
functions, for example, any outstanding mod_timer() must be dealt
with via del_timer_sync() or similar.
with via timer_shutdown_sync() or similar.

Unfortunately, there is no way to cancel an RCU callback; once you
invoke call_rcu(), the callback function is eventually going to be
@@ -1,165 +0,0 @@
.. _array_rcu_doc:

Using RCU to Protect Read-Mostly Arrays
=======================================

Although RCU is more commonly used to protect linked lists, it can
also be used to protect arrays. Three situations are as follows:

1. :ref:`Hash Tables <hash_tables>`

2. :ref:`Static Arrays <static_arrays>`

3. :ref:`Resizable Arrays <resizable_arrays>`

Each of these three situations involves an RCU-protected pointer to an
array that is separately indexed. It might be tempting to consider use
of RCU to instead protect the index into an array, however, this use
case is **not** supported. The problem with RCU-protected indexes into
arrays is that compilers can play way too many optimization games with
integers, which means that the rules governing handling of these indexes
are far more trouble than they are worth. If RCU-protected indexes into
arrays prove to be particularly valuable (which they have not thus far),
explicit cooperation from the compiler will be required to permit them
to be safely used.

That aside, each of the three RCU-protected pointer situations are
described in the following sections.

.. _hash_tables:

Situation 1: Hash Tables
------------------------

Hash tables are often implemented as an array, where each array entry
has a linked-list hash chain. Each hash chain can be protected by RCU
as described in listRCU.rst. This approach also applies to other
array-of-list situations, such as radix trees.

.. _static_arrays:

Situation 2: Static Arrays
--------------------------

Static arrays, where the data (rather than a pointer to the data) is
located in each array element, and where the array is never resized,
have not been used with RCU. Rik van Riel recommends using seqlock in
this situation, which would also have minimal read-side overhead as long
as updates are rare.

Quick Quiz:
Why is it so important that updates be rare when using seqlock?

:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>`

.. _resizable_arrays:

Situation 3: Resizable Arrays
------------------------------

Use of RCU for resizable arrays is demonstrated by the grow_ary()
function formerly used by the System V IPC code. The array is used
to map from semaphore, message-queue, and shared-memory IDs to the data
structure that represents the corresponding IPC construct. The grow_ary()
function does not acquire any locks; instead its caller must hold the
ids->sem semaphore.

The grow_ary() function, shown below, does some limit checks, allocates a
new ipc_id_ary, copies the old to the new portion of the new, initializes
the remainder of the new, updates the ids->entries pointer to point to
the new array, and invokes ipc_rcu_putref() to free up the old array.
Note that rcu_assign_pointer() is used to update the ids->entries pointer,
which includes any memory barriers required on whatever architecture
you are running on::

  static int grow_ary(struct ipc_ids* ids, int newsize)
  {
          struct ipc_id_ary* new;
          struct ipc_id_ary* old;
          int i;
          int size = ids->entries->size;

          if(newsize > IPCMNI)
                  newsize = IPCMNI;
          if(newsize <= size)
                  return newsize;

          new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize +
                              sizeof(struct ipc_id_ary));
          if(new == NULL)
                  return size;
          new->size = newsize;
          memcpy(new->p, ids->entries->p,
                 sizeof(struct kern_ipc_perm *)*size +
                 sizeof(struct ipc_id_ary));
          for(i=size;i<newsize;i++) {
                  new->p[i] = NULL;
          }
          old = ids->entries;

          /*
           * Use rcu_assign_pointer() to make sure the memcpyed
           * contents of the new array are visible before the new
           * array becomes visible.
           */
          rcu_assign_pointer(ids->entries, new);

          ipc_rcu_putref(old);
          return newsize;
  }

The ipc_rcu_putref() function decrements the array's reference count
and then, if the reference count has dropped to zero, uses call_rcu()
to free the array after a grace period has elapsed.

The array is traversed by the ipc_lock() function. This function
indexes into the array under the protection of rcu_read_lock(),
using rcu_dereference() to pick up the pointer to the array so
that it may later safely be dereferenced -- memory barriers are
required on the Alpha CPU. Since the size of the array is stored
with the array itself, there can be no array-size mismatches, so
a simple check suffices. The pointer to the structure corresponding
to the desired IPC object is placed in "out", with NULL indicating
a non-existent entry. After acquiring "out->lock", the "out->deleted"
flag indicates whether the IPC object is in the process of being
deleted, and, if not, the pointer is returned::

  struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
  {
          struct kern_ipc_perm* out;
          int lid = id % SEQ_MULTIPLIER;
          struct ipc_id_ary* entries;

          rcu_read_lock();
          entries = rcu_dereference(ids->entries);
          if(lid >= entries->size) {
                  rcu_read_unlock();
                  return NULL;
          }
          out = entries->p[lid];
          if(out == NULL) {
                  rcu_read_unlock();
                  return NULL;
          }
          spin_lock(&out->lock);

          /* ipc_rmid() may have already freed the ID while ipc_lock
           * was spinning: here verify that the structure is still valid
           */
          if (out->deleted) {
                  spin_unlock(&out->lock);
                  rcu_read_unlock();
                  return NULL;
          }
          return out;
  }

.. _answer_quick_quiz_seqlock:

Answer to Quick Quiz:
Why is it so important that updates be rare when using seqlock?

The reason that it is important that updates be rare when
using seqlock is that frequent updates can livelock readers.
One way to avoid this problem is to assign a seqlock for
each array entry rather than to the entire array.
@@ -32,8 +32,8 @@ over a rather long period of time, but improvements are always welcome!
for lockless updates. This does result in the mildly
counter-intuitive situation where rcu_read_lock() and
rcu_read_unlock() are used to protect updates, however, this
approach provides the same potential simplifications that garbage
collectors do.
approach can provide the same simplifications to certain types
of lockless algorithms that garbage collectors do.

1. Does the update code have proper mutual exclusion?

@@ -49,12 +49,12 @@ over a rather long period of time, but improvements are always welcome!
them -- even x86 allows later loads to be reordered to precede
earlier stores), and be prepared to explain why this added
complexity is worthwhile. If you choose #c, be prepared to
explain how this single task does not become a major bottleneck on
big multiprocessor machines (for example, if the task is updating
information relating to itself that other tasks can read, there
by definition can be no bottleneck). Note that the definition
of "large" has changed significantly: Eight CPUs was "large"
in the year 2000, but a hundred CPUs was unremarkable in 2017.
explain how this single task does not become a major bottleneck
on large systems (for example, if the task is updating information
relating to itself that other tasks can read, there by definition
can be no bottleneck). Note that the definition of "large" has
changed significantly: Eight CPUs was "large" in the year 2000,
but a hundred CPUs was unremarkable in 2017.

2. Do the RCU read-side critical sections make proper use of
rcu_read_lock() and friends? These primitives are needed
@@ -97,33 +97,38 @@ over a rather long period of time, but improvements are always welcome!

b. Proceed as in (a) above, but also maintain per-element
locks (that are acquired by both readers and writers)
that guard per-element state. Of course, fields that
the readers refrain from accessing can be guarded by
some other lock acquired only by updaters, if desired.
that guard per-element state. Fields that the readers
refrain from accessing can be guarded by some other lock
acquired only by updaters, if desired.

This works quite well, also.
This also works quite well.

c. Make updates appear atomic to readers. For example,
pointer updates to properly aligned fields will
appear atomic, as will individual atomic primitives.
Sequences of operations performed under a lock will *not*
appear to be atomic to RCU readers, nor will sequences
of multiple atomic primitives.
of multiple atomic primitives. One alternative is to
move multiple individual fields to a separate structure,
thus solving the multiple-field problem by imposing an
additional level of indirection.

This can work, but is starting to get a bit tricky.

d. Carefully order the updates and the reads so that
readers see valid data at all phases of the update.
This is often more difficult than it sounds, especially
given modern CPUs' tendency to reorder memory references.
One must usually liberally sprinkle memory barriers
(smp_wmb(), smp_rmb(), smp_mb()) through the code,
making it difficult to understand and to test.
d. Carefully order the updates and the reads so that readers
see valid data at all phases of the update. This is often
more difficult than it sounds, especially given modern
CPUs' tendency to reorder memory references. One must
usually liberally sprinkle memory-ordering operations
through the code, making it difficult to understand and
to test. Where it works, it is better to use things
like smp_store_release() and smp_load_acquire(), but in
some cases the smp_mb() full memory barrier is required.

It is usually better to group the changing data into
a separate structure, so that the change may be made
to appear atomic by updating a pointer to reference
a new structure containing updated values.
As noted earlier, it is usually better to group the
changing data into a separate structure, so that the
change may be made to appear atomic by updating a pointer
to reference a new structure containing updated values.

4. Weakly ordered CPUs pose special challenges. Almost all CPUs
are weakly ordered -- even x86 CPUs allow later loads to be
@@ -188,26 +193,29 @@ over a rather long period of time, but improvements are always welcome!
when publicizing a pointer to a structure that can
be traversed by an RCU read-side critical section.

5. If call_rcu() or call_srcu() is used, the callback function will
be called from softirq context. In particular, it cannot block.
If you need the callback to block, run that code in a workqueue
handler scheduled from the callback. The queue_rcu_work()
function does this for you in the case of call_rcu().
5. If any of call_rcu(), call_srcu(), call_rcu_tasks(),
call_rcu_tasks_rude(), or call_rcu_tasks_trace() is used,
the callback function may be invoked from softirq context,
and in any case with bottom halves disabled. In particular,
this callback function cannot block. If you need the callback
to block, run that code in a workqueue handler scheduled from
the callback. The queue_rcu_work() function does this for you
in the case of call_rcu().

6. Since synchronize_rcu() can block, it cannot be called
from any sort of irq context. The same rule applies
for synchronize_srcu(), synchronize_rcu_expedited(), and
synchronize_srcu_expedited().
for synchronize_srcu(), synchronize_rcu_expedited(),
synchronize_srcu_expedited(), synchronize_rcu_tasks(),
synchronize_rcu_tasks_rude(), and synchronize_rcu_tasks_trace().

The expedited forms of these primitives have the same semantics
as the non-expedited forms, but expediting is both expensive and
(with the exception of synchronize_srcu_expedited()) unfriendly
to real-time workloads. Use of the expedited primitives should
be restricted to rare configuration-change operations that would
not normally be undertaken while a real-time workload is running.
However, real-time workloads can use rcupdate.rcu_normal kernel
boot parameter to completely disable expedited grace periods,
though this might have performance implications.
as the non-expedited forms, but expediting is more CPU intensive.
Use of the expedited primitives should be restricted to rare
configuration-change operations that would not normally be
undertaken while a real-time workload is running. Note that
IPI-sensitive real-time workloads can use the rcupdate.rcu_normal
kernel boot parameter to completely disable expedited grace
periods, though this might have performance implications.

In particular, if you find yourself invoking one of the expedited
primitives repeatedly in a loop, please do everyone a favor:
@@ -215,8 +223,9 @@ over a rather long period of time, but improvements are always welcome!
a single non-expedited primitive to cover the entire batch.
This will very likely be faster than the loop containing the
expedited primitive, and will be much much easier on the rest
of the system, especially to real-time workloads running on
the rest of the system.
of the system, especially to real-time workloads running on the
rest of the system. Alternatively, instead use asynchronous
primitives such as call_rcu().

7. As of v4.20, a given kernel implements only one RCU flavor, which
is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y.
@@ -239,7 +248,8 @@ over a rather long period of time, but improvements are always welcome!
the corresponding readers must use rcu_read_lock_trace() and
rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
or synchronize_rcu_tasks_rude(), then the corresponding readers
must use anything that disables interrupts.
must use anything that disables preemption, for example,
preempt_disable() and preempt_enable().

Mixing things up will result in confusion and broken kernels, and
has even resulted in an exploitable security issue. Therefore,
@@ -253,15 +263,16 @@ over a rather long period of time, but improvements are always welcome!
that this usage is safe is that readers can use anything that
disables BH when updaters use call_rcu() or synchronize_rcu().

8. Although synchronize_rcu() is slower than is call_rcu(), it
usually results in simpler code. So, unless update performance is
critically important, the updaters cannot block, or the latency of
synchronize_rcu() is visible from userspace, synchronize_rcu()
should be used in preference to call_rcu(). Furthermore,
kfree_rcu() usually results in even simpler code than does
synchronize_rcu() without synchronize_rcu()'s multi-millisecond
latency. So please take advantage of kfree_rcu()'s "fire and
forget" memory-freeing capabilities where it applies.
8. Although synchronize_rcu() is slower than is call_rcu(),
it usually results in simpler code. So, unless update
performance is critically important, the updaters cannot block,
or the latency of synchronize_rcu() is visible from userspace,
synchronize_rcu() should be used in preference to call_rcu().
Furthermore, kfree_rcu() and kvfree_rcu() usually result
in even simpler code than does synchronize_rcu() without
synchronize_rcu()'s multi-millisecond latency. So please take
advantage of kfree_rcu()'s and kvfree_rcu()'s "fire and forget"
memory-freeing capabilities where it applies.

An especially important property of the synchronize_rcu()
primitive is that it automatically self-limits: if grace periods
@@ -271,8 +282,8 @@ over a rather long period of time, but improvements are always welcome!
cases where grace periods are delayed, as failing to do so can
result in excessive realtime latencies or even OOM conditions.

Ways of gaining this self-limiting property when using call_rcu()
include:
Ways of gaining this self-limiting property when using call_rcu(),
kfree_rcu(), or kvfree_rcu() include:

a. Keeping a count of the number of data-structure elements
used by the RCU-protected data structure, including
@@ -304,18 +315,21 @@ over a rather long period of time, but improvements are always welcome!
here is that superuser already has lots of ways to crash
the machine.

d. Periodically invoke synchronize_rcu(), permitting a limited
number of updates per grace period. Better yet, periodically
invoke rcu_barrier() to wait for all outstanding callbacks.
d. Periodically invoke rcu_barrier(), permitting a limited
number of updates per grace period.

The same cautions apply to call_srcu() and kfree_rcu().
|
||||
The same cautions apply to call_srcu(), call_rcu_tasks(),
|
||||
call_rcu_tasks_rude(), and call_rcu_tasks_trace(). This is
|
||||
why there is an srcu_barrier(), rcu_barrier_tasks(),
|
||||
rcu_barrier_tasks_rude(), and rcu_barrier_tasks_trace(),
|
||||
respectively.
|
||||
|
||||
Note that although these primitives do take action to avoid memory
|
||||
exhaustion when any given CPU has too many callbacks, a determined
|
||||
user could still exhaust memory. This is especially the case
|
||||
if a system with a large number of CPUs has been configured to
|
||||
offload all of its RCU callbacks onto a single CPU, or if the
|
||||
system has relatively little free memory.
|
||||
Note that although these primitives do take action to avoid
|
||||
memory exhaustion when any given CPU has too many callbacks,
|
||||
a determined user or administrator can still exhaust memory.
|
||||
This is especially the case if a system with a large number of
|
||||
CPUs has been configured to offload all of its RCU callbacks onto
|
||||
a single CPU, or if the system has relatively little free memory.
|
||||
|
||||
9. All RCU list-traversal primitives, which include
|
||||
rcu_dereference(), list_for_each_entry_rcu(), and
|
||||
@ -344,14 +358,14 @@ over a rather long period of time, but improvements are always welcome!
|
||||
and you don't hold the appropriate update-side lock, you *must*
|
||||
use the "_rcu()" variants of the list macros. Failing to do so
|
||||
will break Alpha, cause aggressive compilers to generate bad code,
|
||||
and confuse people trying to read your code.
|
||||
and confuse people trying to understand your code.
|
||||
|
||||
11. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||
with softirq disabled, e.g., via spin_lock_irqsave(),
|
||||
spin_lock_bh(), etc. Failing to disable softirq on a given
|
||||
acquisition of that lock will result in deadlock as soon as
|
||||
the RCU softirq handler happens to run your RCU callback while
|
||||
interrupting that acquisition's critical section.
|
||||
with softirq disabled, e.g., via spin_lock_bh(). Failing to
|
||||
disable softirq on a given acquisition of that lock will result
|
||||
in deadlock as soon as the RCU softirq handler happens to run
|
||||
your RCU callback while interrupting that acquisition's critical
|
||||
section.
|
||||
|
||||
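A minimal sketch of this rule, assuming a hypothetical ``foo_lock`` that is
shared between process context and an RCU callback::

    static DEFINE_SPINLOCK(foo_lock);

    static void foo_rcu_cb(struct rcu_head *rhp)
    {
        spin_lock(&foo_lock);   /* Invoked from the RCU softirq handler. */
        /* ... */
        spin_unlock(&foo_lock);
    }

    static void foo_update(void)
    {
        /* Disabling softirq prevents foo_rcu_cb() from interrupting
         * this critical section on the same CPU and deadlocking. */
        spin_lock_bh(&foo_lock);
        /* ... */
        spin_unlock_bh(&foo_lock);
    }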
12. RCU callbacks can be and are executed in parallel. In many cases,
|
||||
the callback code simply wrappers around kfree(), so that this
|
||||
@ -372,7 +386,17 @@ over a rather long period of time, but improvements are always welcome!
|
||||
for some real-time workloads, this is the whole point of using
|
||||
the rcu_nocbs= kernel boot parameter.
|
||||
|
||||
13. Unlike other forms of RCU, it *is* permissible to block in an
|
||||
In addition, do not assume that callbacks queued in a given order
|
||||
will be invoked in that order, even if they all are queued on the
|
||||
same CPU. Furthermore, do not assume that same-CPU callbacks will
|
||||
be invoked serially. For example, in recent kernels, CPUs can be
|
||||
switched between offloaded and de-offloaded callback invocation,
|
||||
and while a given CPU is undergoing such a switch, its callbacks
|
||||
might be concurrently invoked by that CPU's softirq handler and
|
||||
that CPU's rcuo kthread. At such times, that CPU's callbacks
|
||||
might be executed both concurrently and out of order.
|
||||
|
||||
13. Unlike most flavors of RCU, it *is* permissible to block in an
|
||||
SRCU read-side critical section (demarked by srcu_read_lock()
|
||||
and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
|
||||
Please note that if you don't need to sleep in read-side critical
|
||||
@ -412,6 +436,12 @@ over a rather long period of time, but improvements are always welcome!
|
||||
never sends IPIs to other CPUs, so it is easier on
|
||||
real-time workloads than is synchronize_rcu_expedited().
|
||||
|
||||
It is also permissible to sleep in RCU Tasks Trace read-side
|
||||
critical sections, which are delimited by rcu_read_lock_trace() and
|
||||
rcu_read_unlock_trace(). However, this is a specialized flavor
|
||||
of RCU, and you should not use it without first checking with
|
||||
its current users. In most cases, you should instead use SRCU.
|
||||
|
||||
Note that rcu_assign_pointer() relates to SRCU just as it does to
|
||||
other forms of RCU, but instead of rcu_dereference() you should
|
||||
use srcu_dereference() in order to avoid lockdep splats.
|
||||
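A minimal SRCU reader sketch, assuming a hypothetical ``my_srcu`` domain and
an SRCU-protected pointer ``my_ptr``; note the index cookie returned by
srcu_read_lock() and the use of srcu_dereference()::

    DEFINE_SRCU(my_srcu);
    static struct foo __rcu *my_ptr;

    static void foo_srcu_reader(void)
    {
        int idx;
        struct foo *p;

        idx = srcu_read_lock(&my_srcu);
        p = srcu_dereference(my_ptr, &my_srcu);
        if (p) {
            /* ... may block while using *p ... */
        }
        srcu_read_unlock(&my_srcu, idx);
    }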
@ -442,50 +472,62 @@ over a rather long period of time, but improvements are always welcome!
|
||||
find problems as follows:
|
||||
|
||||
CONFIG_PROVE_LOCKING:
|
||||
check that accesses to RCU-protected data
|
||||
structures are carried out under the proper RCU
|
||||
read-side critical section, while holding the right
|
||||
combination of locks, or whatever other conditions
|
||||
are appropriate.
|
||||
check that accesses to RCU-protected data structures
|
||||
are carried out under the proper RCU read-side critical
|
||||
section, while holding the right combination of locks,
|
||||
or whatever other conditions are appropriate.
|
||||
|
||||
CONFIG_DEBUG_OBJECTS_RCU_HEAD:
|
||||
check that you don't pass the
|
||||
same object to call_rcu() (or friends) before an RCU
|
||||
grace period has elapsed since the last time that you
|
||||
passed that same object to call_rcu() (or friends).
|
||||
check that you don't pass the same object to call_rcu()
|
||||
(or friends) before an RCU grace period has elapsed
|
||||
since the last time that you passed that same object to
|
||||
call_rcu() (or friends).
|
||||
|
||||
__rcu sparse checks:
|
||||
tag the pointer to the RCU-protected data
|
||||
structure with __rcu, and sparse will warn you if you
|
||||
access that pointer without the services of one of the
|
||||
variants of rcu_dereference().
|
||||
tag the pointer to the RCU-protected data structure
|
||||
with __rcu, and sparse will warn you if you access that
|
||||
pointer without the services of one of the variants
|
||||
of rcu_dereference().
|
||||
|
||||
These debugging aids can help you find problems that are
|
||||
otherwise extremely difficult to spot.
|
||||
|
||||
17. If you register a callback using call_rcu() or call_srcu(), and
|
||||
pass in a function defined within a loadable module, then it in
|
||||
necessary to wait for all pending callbacks to be invoked after
|
||||
the last invocation and before unloading that module. Note that
|
||||
it is absolutely *not* sufficient to wait for a grace period!
|
||||
The current (say) synchronize_rcu() implementation is *not*
|
||||
guaranteed to wait for callbacks registered on other CPUs.
|
||||
Or even on the current CPU if that CPU recently went offline
|
||||
and came back online.
|
||||
17. If you pass a callback function defined within a module to one of
|
||||
call_rcu(), call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(),
|
||||
or call_rcu_tasks_trace(), then it is necessary to wait for all
|
||||
pending callbacks to be invoked before unloading that module.
|
||||
Note that it is absolutely *not* sufficient to wait for a grace
|
||||
period! For example, the synchronize_rcu() implementation is *not*
|
||||
guaranteed to wait for callbacks registered on other CPUs via
|
||||
call_rcu(). Or even on the current CPU if that CPU recently
|
||||
went offline and came back online.
|
||||
|
||||
You instead need to use one of the barrier functions:
|
||||
|
||||
- call_rcu() -> rcu_barrier()
|
||||
- call_srcu() -> srcu_barrier()
|
||||
- call_rcu_tasks() -> rcu_barrier_tasks()
|
||||
- call_rcu_tasks_rude() -> rcu_barrier_tasks_rude()
|
||||
- call_rcu_tasks_trace() -> rcu_barrier_tasks_trace()
|
||||
|
||||
However, these barrier functions are absolutely *not* guaranteed
|
||||
to wait for a grace period. In fact, if there are no call_rcu()
|
||||
callbacks waiting anywhere in the system, rcu_barrier() is within
|
||||
its rights to return immediately.
|
||||
to wait for a grace period. For example, if there are no
|
||||
call_rcu() callbacks queued anywhere in the system, rcu_barrier()
|
||||
can and will return immediately.
|
||||
|
||||
So if you need to wait for both an RCU grace period and for
|
||||
all pre-existing call_rcu() callbacks, you will need to execute
|
||||
both rcu_barrier() and synchronize_rcu(), if necessary, using
|
||||
something like workqueues to execute them concurrently.
|
||||
So if you need to wait for both a grace period and for all
|
||||
pre-existing callbacks, you will need to invoke both functions,
|
||||
with the pair depending on the flavor of RCU:
|
||||
|
||||
- Either synchronize_rcu() or synchronize_rcu_expedited(),
|
||||
together with rcu_barrier()
|
||||
- Either synchronize_srcu() or synchronize_srcu_expedited(),
|
||||
together with srcu_barrier()
|
||||
- synchronize_rcu_tasks() and rcu_barrier_tasks()
|
||||
- synchronize_rcu_tasks_rude() and rcu_barrier_tasks_rude()
|
||||
- synchronize_rcu_tasks_trace() and rcu_barrier_tasks_trace()
|
||||
|
||||
If necessary, you can use something like workqueues to execute
|
||||
the requisite pair of functions concurrently.
|
||||
|
||||
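For example, a module that uses call_rcu() might do something like the
following in its exit handler (a sketch; ``foo_unregister_hooks()`` is a
hypothetical stand-in for whatever stops new callbacks from being queued)::

    static void __exit foo_exit(void)
    {
        foo_unregister_hooks(); /* stop queueing new call_rcu() callbacks */
        synchronize_rcu();      /* wait for a grace period ...            */
        rcu_barrier();          /* ... and for all already-queued         */
                                /* callbacks before the module text goes  */
                                /* away.                                  */
    }
    module_exit(foo_exit);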
See rcubarrier.rst for more information.
|
||||
|
@ -9,7 +9,6 @@ RCU concepts
|
||||
.. toctree::
|
||||
:maxdepth: 3
|
||||
|
||||
arrayRCU
|
||||
checklist
|
||||
lockdep
|
||||
lockdep-splat
|
||||
|
@ -3,11 +3,10 @@
|
||||
Using RCU to Protect Read-Mostly Linked Lists
|
||||
=============================================
|
||||
|
||||
One of the best applications of RCU is to protect read-mostly linked lists
|
||||
(``struct list_head`` in list.h). One big advantage of this approach
|
||||
is that all of the required memory barriers are included for you in
|
||||
the list macros. This document describes several applications of RCU,
|
||||
with the best fits first.
|
||||
One of the most common uses of RCU is protecting read-mostly linked lists
|
||||
(``struct list_head`` in list.h). One big advantage of this approach is
|
||||
that all of the required memory ordering is provided by the list macros.
|
||||
This document describes several list-based RCU use cases.
|
||||
|
||||
|
||||
Example 1: Read-mostly list: Deferred Destruction
|
||||
@ -35,7 +34,8 @@ The code traversing the list of all processes typically looks like::
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
The simplified code for removing a process from a task list is::
|
||||
The simplified and heavily inlined code for removing a process from a
|
||||
task list is::
|
||||
|
||||
void release_task(struct task_struct *p)
|
||||
{
|
||||
@ -45,39 +45,48 @@ The simplified code for removing a process from a task list is::
|
||||
call_rcu(&p->rcu, delayed_put_task_struct);
|
||||
}
|
||||
|
||||
When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)`` under
|
||||
``tasklist_lock`` writer lock protection, to remove the task from the list of
|
||||
all tasks. The ``tasklist_lock`` prevents concurrent list additions/removals
|
||||
from corrupting the list. Readers using ``for_each_process()`` are not protected
|
||||
with the ``tasklist_lock``. To prevent readers from noticing changes in the list
|
||||
pointers, the ``task_struct`` object is freed only after one or more grace
|
||||
periods elapse (with the help of call_rcu()). This deferring of destruction
|
||||
ensures that any readers traversing the list will see valid ``p->tasks.next``
|
||||
pointers and deletion/freeing can happen in parallel with traversal of the list.
|
||||
This pattern is also called an **existence lock**, since RCU pins the object in
|
||||
memory until all existing readers finish.
|
||||
When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)``
|
||||
via __exit_signal() and __unhash_process() under ``tasklist_lock``
|
||||
writer lock protection. The list_del_rcu() invocation removes
|
||||
the task from the list of all tasks. The ``tasklist_lock``
|
||||
prevents concurrent list additions/removals from corrupting the
|
||||
list. Readers using ``for_each_process()`` are not protected with the
|
||||
``tasklist_lock``. To prevent readers from noticing changes in the list
|
||||
pointers, the ``task_struct`` object is freed only after one or more
|
||||
grace periods elapse, with the help of call_rcu(), which is invoked via
|
||||
put_task_struct_rcu_user(). This deferring of destruction ensures that
|
||||
any readers traversing the list will see valid ``p->tasks.next`` pointers
|
||||
and deletion/freeing can happen in parallel with traversal of the list.
|
||||
This pattern is also called an **existence lock**, since RCU refrains
|
||||
from invoking the delayed_put_task_struct() callback function until
|
||||
all existing readers finish, which guarantees that the ``task_struct``
|
||||
object in question will remain in existence until after the completion
|
||||
of all RCU readers that might possibly have a reference to that object.
|
||||
|
||||
|
||||
Example 2: Read-Side Action Taken Outside of Lock: No In-Place Updates
|
||||
----------------------------------------------------------------------
|
||||
|
||||
The best applications are cases where, if reader-writer locking were
|
||||
used, the read-side lock would be dropped before taking any action
|
||||
based on the results of the search. The most celebrated example is
|
||||
the routing table. Because the routing table is tracking the state of
|
||||
equipment outside of the computer, it will at times contain stale data.
|
||||
Therefore, once the route has been computed, there is no need to hold
|
||||
the routing table static during transmission of the packet. After all,
|
||||
you can hold the routing table static all you want, but that won't keep
|
||||
the external Internet from changing, and it is the state of the external
|
||||
Internet that really matters. In addition, routing entries are typically
|
||||
added or deleted, rather than being modified in place.
|
||||
Some reader-writer locking use cases compute a value while holding
|
||||
the read-side lock, but continue to use that value after that lock is
|
||||
released. These use cases are often good candidates for conversion
|
||||
to RCU. One prominent example involves network packet routing.
|
||||
Because the packet-routing data tracks the state of equipment outside
|
||||
of the computer, it will at times contain stale data. Therefore, once
|
||||
the route has been computed, there is no need to hold the routing table
|
||||
static during transmission of the packet. After all, you can hold the
|
||||
routing table static all you want, but that won't keep the external
|
||||
Internet from changing, and it is the state of the external Internet
|
||||
that really matters. In addition, routing entries are typically added
|
||||
or deleted, rather than being modified in place. This is a rare example
|
||||
of the finite speed of light and the non-zero size of atoms actually
|
||||
helping make synchronization lighter weight.
|
||||
|
||||
A straightforward example of this use of RCU may be found in the
|
||||
system-call auditing support. For example, a reader-writer locked
|
||||
A straightforward example of this type of RCU use case may be found in
|
||||
the system-call auditing support. For example, a reader-writer locked
|
||||
implementation of ``audit_filter_task()`` might be as follows::
|
||||
|
||||
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
||||
static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
|
||||
{
|
||||
struct audit_entry *e;
|
||||
enum audit_state state;
|
||||
@ -86,6 +95,8 @@ implementation of ``audit_filter_task()`` might be as follows::
|
||||
/* Note: audit_filter_mutex held by caller. */
|
||||
list_for_each_entry(e, &audit_tsklist, list) {
|
||||
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
|
||||
if (state == AUDIT_STATE_RECORD)
|
||||
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
|
||||
read_unlock(&auditsc_lock);
|
||||
return state;
|
||||
}
|
||||
@ -101,7 +112,7 @@ you are turning auditing off, it is OK to audit a few extra system calls.
|
||||
|
||||
This means that RCU can be easily applied to the read side, as follows::
|
||||
|
||||
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
||||
static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
|
||||
{
|
||||
struct audit_entry *e;
|
||||
enum audit_state state;
|
||||
@ -110,6 +121,8 @@ This means that RCU can be easily applied to the read side, as follows::
|
||||
/* Note: audit_filter_mutex held by caller. */
|
||||
list_for_each_entry_rcu(e, &audit_tsklist, list) {
|
||||
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
|
||||
if (state == AUDIT_STATE_RECORD)
|
||||
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
|
||||
rcu_read_unlock();
|
||||
return state;
|
||||
}
|
||||
@ -118,13 +131,15 @@ This means that RCU can be easily applied to the read side, as follows::
|
||||
return AUDIT_BUILD_CONTEXT;
|
||||
}
|
||||
|
||||
The ``read_lock()`` and ``read_unlock()`` calls have become rcu_read_lock()
|
||||
and rcu_read_unlock(), respectively, and the list_for_each_entry() has
|
||||
become list_for_each_entry_rcu(). The **_rcu()** list-traversal primitives
|
||||
insert the read-side memory barriers that are required on DEC Alpha CPUs.
|
||||
The read_lock() and read_unlock() calls have become rcu_read_lock()
|
||||
and rcu_read_unlock(), respectively, and the list_for_each_entry()
|
||||
has become list_for_each_entry_rcu(). The **_rcu()** list-traversal
|
||||
primitives add READ_ONCE() and diagnostic checks for incorrect use
|
||||
outside of an RCU read-side critical section.
|
||||
|
||||
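As a side note, when the same list is legitimately traversed while holding
the update-side ``audit_filter_mutex`` rather than rcu_read_lock(), the
optional lockdep expression argument of list_for_each_entry_rcu() keeps
those diagnostics quiet (a sketch based on the example above)::

    /* Updater-side traversal: protected by audit_filter_mutex rather
     * than by rcu_read_lock(), so say so to the RCU list primitives. */
    list_for_each_entry_rcu(e, &audit_tsklist, list,
                            lockdep_is_held(&audit_filter_mutex)) {
        /* ... */
    }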
The changes to the update side are also straightforward. A reader-writer lock
|
||||
might be used as follows for deletion and insertion::
|
||||
might be used as follows for deletion and insertion in these simplified
|
||||
versions of audit_del_rule() and audit_add_rule()::
|
||||
|
||||
static inline int audit_del_rule(struct audit_rule *rule,
|
||||
struct list_head *list)
|
||||
@ -188,16 +203,16 @@ Following are the RCU equivalents for these two functions::
|
||||
return 0;
|
||||
}
|
||||
|
||||
Normally, the ``write_lock()`` and ``write_unlock()`` would be replaced by a
|
||||
Normally, the write_lock() and write_unlock() would be replaced by a
|
||||
spin_lock() and a spin_unlock(). But in this case, all callers hold
|
||||
``audit_filter_mutex``, so no additional locking is required. The
|
||||
``auditsc_lock`` can therefore be eliminated, since use of RCU eliminates the
|
||||
auditsc_lock can therefore be eliminated, since use of RCU eliminates the
|
||||
need for writers to exclude readers.
|
||||
|
||||
The list_del(), list_add(), and list_add_tail() primitives have been
|
||||
replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
|
||||
The **_rcu()** list-manipulation primitives add memory barriers that are needed on
|
||||
weakly ordered CPUs (most of them!). The list_del_rcu() primitive omits the
|
||||
The **_rcu()** list-manipulation primitives add memory barriers that are
|
||||
needed on weakly ordered CPUs. The list_del_rcu() primitive omits the
|
||||
pointer poisoning debug-assist code that would otherwise cause concurrent
|
||||
readers to fail spectacularly.
|
||||
|
||||
@ -238,7 +253,9 @@ need to be filled in)::
|
||||
The RCU version creates a copy, updates the copy, then replaces the old
|
||||
entry with the newly updated entry. This sequence of actions, allowing
|
||||
concurrent reads while making a copy to perform an update, is what gives
|
||||
RCU (*read-copy update*) its name. The RCU code is as follows::
|
||||
RCU (*read-copy update*) its name.
|
||||
|
||||
The RCU version of audit_upd_rule() is as follows::
|
||||
|
||||
static inline int audit_upd_rule(struct audit_rule *rule,
|
||||
struct list_head *list,
|
||||
@ -267,6 +284,9 @@ RCU (*read-copy update*) its name. The RCU code is as follows::
|
||||
Again, this assumes that the caller holds ``audit_filter_mutex``. Normally, the
|
||||
writer lock would become a spinlock in this sort of code.
|
||||
|
||||
The update_lsm_rule() does something very similar, for those who would
|
||||
prefer to look at real Linux-kernel code.
|
||||
|
||||
Another use of this pattern can be found in the openvswitch driver's *connection
|
||||
tracking table* code in ``ct_limit_set()``. The table holds connection tracking
|
||||
entries and has a limit on the maximum entries. There is one such table
|
||||
@ -281,9 +301,10 @@ Example 4: Eliminating Stale Data
|
||||
---------------------------------
|
||||
|
||||
The auditing example above tolerates stale data, as do most algorithms
|
||||
that are tracking external state. Because there is a delay from the
|
||||
time the external state changes before Linux becomes aware of the change,
|
||||
additional RCU-induced staleness is generally not a problem.
|
||||
that are tracking external state. After all, given that there is a
|
||||
delay from the time the external state changes until Linux becomes
|
||||
aware of the change, a small quantity of additional RCU-induced
|
||||
staleness, as noted earlier, is generally not a problem.
|
||||
|
||||
However, there are many examples where stale data cannot be tolerated.
|
||||
One example in the Linux kernel is the System V IPC (see the shm_lock()
|
||||
@ -302,7 +323,7 @@ Quick Quiz:
|
||||
|
||||
If the system-call audit module were to ever need to reject stale data, one way
|
||||
to accomplish this would be to add a ``deleted`` flag and a ``lock`` spinlock to the
|
||||
audit_entry structure, and modify ``audit_filter_task()`` as follows::
|
||||
``audit_entry`` structure, and modify audit_filter_task() as follows::
|
||||
|
||||
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
||||
{
|
||||
@ -319,6 +340,8 @@ audit_entry structure, and modify ``audit_filter_task()`` as follows::
|
||||
return AUDIT_BUILD_CONTEXT;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (state == AUDIT_STATE_RECORD)
|
||||
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
|
||||
return state;
|
||||
}
|
||||
}
|
||||
@ -326,12 +349,6 @@ audit_entry structure, and modify ``audit_filter_task()`` as follows::
|
||||
return AUDIT_BUILD_CONTEXT;
|
||||
}
|
||||
|
||||
Note that this example assumes that entries are only added and deleted.
|
||||
Additional mechanism is required to deal correctly with the update-in-place
|
||||
performed by ``audit_upd_rule()``. For one thing, ``audit_upd_rule()`` would
|
||||
need additional memory barriers to ensure that the list_add_rcu() was really
|
||||
executed before the list_del_rcu().
|
||||
|
||||
The ``audit_del_rule()`` function would need to set the ``deleted`` flag under the
|
||||
spinlock as follows::
|
||||
|
||||
@ -357,24 +374,32 @@ spinlock as follows::
|
||||
|
||||
This too assumes that the caller holds ``audit_filter_mutex``.
|
||||
|
||||
Note that this example assumes that entries are only added and deleted.
|
||||
Additional mechanism is required to deal correctly with the update-in-place
|
||||
performed by audit_upd_rule(). For one thing, audit_upd_rule() would
|
||||
need to hold the locks of both the old ``audit_entry`` and its replacement
|
||||
while executing the list_replace_rcu().
|
||||
|
||||
|
||||
Example 5: Skipping Stale Objects
|
||||
---------------------------------
|
||||
|
||||
For some usecases, reader performance can be improved by skipping stale objects
|
||||
during read-side list traversal if the object in concern is pending destruction
|
||||
after one or more grace periods. One such example can be found in the timerfd
|
||||
subsystem. When a ``CLOCK_REALTIME`` clock is reprogrammed - for example due to
|
||||
setting of the system time, then all programmed timerfds that depend on this
|
||||
clock get triggered and processes waiting on them to expire are woken up in
|
||||
advance of their scheduled expiry. To facilitate this, all such timers are added
|
||||
to an RCU-managed ``cancel_list`` when they are setup in
|
||||
For some use cases, reader performance can be improved by skipping
|
||||
stale objects during read-side list traversal, where stale objects
|
||||
are those that will be removed and destroyed after one or more grace
|
||||
periods. One such example can be found in the timerfd subsystem. When a
|
||||
``CLOCK_REALTIME`` clock is reprogrammed (for example due to setting
|
||||
of the system time) then all programmed ``timerfds`` that depend on
|
||||
this clock get triggered and processes waiting on them are awakened in
|
||||
advance of their scheduled expiry. To facilitate this, all such timers
|
||||
are added to an RCU-managed ``cancel_list`` when they are setup in
|
||||
``timerfd_setup_cancel()``::
|
||||
|
||||
static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
|
||||
{
|
||||
spin_lock(&ctx->cancel_lock);
|
||||
if ((ctx->clockid == CLOCK_REALTIME &&
|
||||
if ((ctx->clockid == CLOCK_REALTIME ||
|
||||
ctx->clockid == CLOCK_REALTIME_ALARM) &&
|
||||
(flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
|
||||
if (!ctx->might_cancel) {
|
||||
ctx->might_cancel = true;
|
||||
@ -382,13 +407,16 @@ to an RCU-managed ``cancel_list`` when they are setup in
|
||||
list_add_rcu(&ctx->clist, &cancel_list);
|
||||
spin_unlock(&cancel_lock);
|
||||
}
|
||||
} else {
|
||||
__timerfd_remove_cancel(ctx);
|
||||
}
|
||||
spin_unlock(&ctx->cancel_lock);
|
||||
}
|
||||
|
||||
When a timerfd is freed (fd is closed), then the ``might_cancel`` flag of the
|
||||
timerfd object is cleared, the object removed from the ``cancel_list`` and
|
||||
destroyed::
|
||||
When a timerfd is freed (fd is closed), then the ``might_cancel``
|
||||
flag of the timerfd object is cleared, the object removed from the
|
||||
``cancel_list`` and destroyed, as shown in this simplified and inlined
|
||||
version of timerfd_release()::
|
||||
|
||||
int timerfd_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
@ -403,7 +431,10 @@ destroyed::
|
||||
}
|
||||
spin_unlock(&ctx->cancel_lock);
|
||||
|
||||
hrtimer_cancel(&ctx->t.tmr);
|
||||
if (isalarm(ctx))
|
||||
alarm_cancel(&ctx->t.alarm);
|
||||
else
|
||||
hrtimer_cancel(&ctx->t.tmr);
|
||||
kfree_rcu(ctx, rcu);
|
||||
return 0;
|
||||
}
|
||||
@ -416,6 +447,7 @@ objects::
|
||||
|
||||
void timerfd_clock_was_set(void)
|
||||
{
|
||||
ktime_t moffs = ktime_mono_to_real(0);
|
||||
struct timerfd_ctx *ctx;
|
||||
unsigned long flags;
|
||||
|
||||
@ -424,7 +456,7 @@ objects::
|
||||
if (!ctx->might_cancel)
|
||||
continue;
|
||||
spin_lock_irqsave(&ctx->wqh.lock, flags);
|
||||
if (ctx->moffs != ktime_mono_to_real(0)) {
|
||||
if (ctx->moffs != moffs) {
|
||||
ctx->moffs = KTIME_MAX;
|
||||
ctx->ticks++;
|
||||
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
|
||||
@ -434,10 +466,10 @@ objects::
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
The key point here is, because RCU-traversal of the ``cancel_list`` happens
|
||||
while objects are being added and removed to the list, sometimes the traversal
|
||||
can step on an object that has been removed from the list. In this example, it
|
||||
is seen that it is better to skip such objects using a flag.
|
||||
The key point is that because RCU-protected traversal of the
|
||||
``cancel_list`` happens concurrently with object addition and removal,
|
||||
sometimes the traversal can access an object that has been removed from
|
||||
the list. In this example, a flag is used to skip such objects.
|
||||
|
||||
|
||||
Summary
|
||||
|
@ -17,7 +17,9 @@ state::
|
||||
rcu_read_lock_held() for normal RCU.
|
||||
rcu_read_lock_bh_held() for RCU-bh.
|
||||
rcu_read_lock_sched_held() for RCU-sched.
|
||||
rcu_read_lock_any_held() for any of normal RCU, RCU-bh, and RCU-sched.
|
||||
srcu_read_lock_held() for SRCU.
|
||||
rcu_read_lock_trace_held() for RCU Tasks Trace.
|
||||
|
||||
These functions are conservative, and will therefore return 1 if they
|
||||
aren't certain (for example, if CONFIG_DEBUG_LOCK_ALLOC is not set).
|
||||
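These helpers are typically supplied to rcu_dereference_check(); a minimal
sketch, assuming a hypothetical RCU-protected pointer ``gp`` and update-side
lock ``my_lock``::

    struct foo *p;

    /* Warns unless called within rcu_read_lock() or with my_lock held. */
    p = rcu_dereference_check(gp, lockdep_is_held(&my_lock));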
@ -53,6 +55,8 @@ checking of rcu_dereference() primitives:
|
||||
is invoked by both SRCU readers and updaters.
|
||||
rcu_dereference_raw(p):
|
||||
Don't check. (Use sparingly, if at all.)
|
||||
rcu_dereference_raw_check(p):
|
||||
Don't do lockdep at all. (Use sparingly, if at all.)
|
||||
rcu_dereference_protected(p, c):
|
||||
Use explicit check expression "c", and omit all barriers
|
||||
and compiler constraints. This is useful when the data
|
||||
|
17
Documentation/accel/index.rst
Normal file
17
Documentation/accel/index.rst
Normal file
@ -0,0 +1,17 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================
|
||||
Compute Accelerators
|
||||
====================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
introduction
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
Indices
|
||||
=======
|
||||
|
||||
* :ref:`genindex`
|
110
Documentation/accel/introduction.rst
Normal file
110
Documentation/accel/introduction.rst
Normal file
@ -0,0 +1,110 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
============
|
||||
Introduction
|
||||
============
|
||||
|
||||
The Linux compute accelerators subsystem is designed to expose compute
|
||||
accelerators in a common way to user-space and provide a common set of
|
||||
functionality.
|
||||
|
||||
These devices can be either stand-alone ASICs or IP blocks inside an SoC/GPU.
|
||||
Although these devices are typically designed to accelerate
|
||||
Machine-Learning (ML) and/or Deep-Learning (DL) computations, the accel layer
|
||||
is not limited to handling these types of accelerators.
|
||||
|
||||
Typically, a compute accelerator will belong to one of the following
|
||||
categories:
|
||||
|
||||
- Edge AI - doing inference at an edge device. It can be an embedded ASIC/FPGA,
|
||||
or an IP inside a SoC (e.g. laptop web camera). These devices
|
||||
are typically configured using registers and can work with or without DMA.
|
||||
|
||||
- Inference data-center - single/multi user devices in a large server. This
|
||||
type of device can be stand-alone or an IP inside a SoC or a GPU. It will
|
||||
have on-board DRAM (to hold the DL topology), DMA engines and
|
||||
command submission queues (either kernel or user-space queues).
|
||||
It might also have an MMU to manage multiple users and might also enable
|
||||
virtualization (SR-IOV) to support multiple VMs on the same device. In
|
||||
addition, these devices will usually have some tools, such as profiler and
|
||||
debugger.
|
||||
|
||||
- Training data-center - Similar to Inference data-center cards, but typically
|
||||
have more computational power and memory b/w (e.g. HBM) and will likely have
|
||||
a method of scaling-up/out, i.e. connecting to other training cards inside
|
||||
the server or in other servers, respectively.
|
||||
|
||||
All these devices typically have different runtime user-space software stacks
|
||||
that are tailor-made to their h/w. In addition, they will also probably
|
||||
include a compiler to generate programs for their custom-made computational
|
||||
engines. Typically, the common layer in user-space will be the DL frameworks,
|
||||
such as PyTorch and TensorFlow.
|
||||
|
||||
Sharing code with DRM
|
||||
=====================
|
||||
|
||||
Because these devices can be IP blocks inside GPUs or have similar
|
||||
characteristics to those of GPUs, the accel subsystem will use the
|
||||
DRM subsystem's code and functionality, i.e. the accel core code will
|
||||
be part of the DRM subsystem and an accel device will be a new type of DRM
|
||||
device.
|
||||
|
||||
This will allow us to leverage the extensive DRM code-base and
|
||||
collaborate with DRM developers that have experience with this type of
|
||||
devices. In addition, new features that will be added for the accelerator
|
||||
drivers can be of use to GPU drivers as well.
|
||||
|
||||
Differentiation from GPUs
|
||||
=========================
|
||||
|
||||
Because we want to prevent the extensive user-space graphic software stack
|
||||
from trying to use an accelerator as a GPU, the compute accelerators will be
|
||||
differentiated from GPUs by using a new major number and new device char files.
|
||||
|
||||
Furthermore, the drivers will be located in a separate place in the kernel
|
||||
tree - drivers/accel/.
|
||||
|
||||
The accelerator devices will be exposed to the user space with the dedicated
|
||||
261 major number and will have the following convention:
|
||||
|
||||
- device char files - /dev/accel/accel*
|
||||
- sysfs - /sys/class/accel/accel*/
|
||||
- debugfs - /sys/kernel/debug/accel/accel*/
|
||||
|
||||
Getting Started
|
||||
===============
|
||||
|
||||
First, read the DRM documentation at Documentation/gpu/index.rst.
|
||||
Not only will it explain how to write a new DRM driver, but it will also
|
||||
tell you how to contribute, describe the Code Of Conduct, and cover the
|
||||
expected coding style and documentation. All of that is the same for the
|
||||
accel subsystem.
|
||||
|
||||
Second, make sure the kernel is configured with CONFIG_DRM_ACCEL.
|
||||
|
||||
To expose your device as an accelerator, two changes are needed to
|
||||
be done in your driver (as opposed to a standard DRM driver):
|
||||
|
||||
- Add the DRIVER_COMPUTE_ACCEL feature flag in your drm_driver's
|
||||
driver_features field. It is important to note that this driver feature is
|
||||
mutually exclusive with DRIVER_RENDER and DRIVER_MODESET. Devices that want
|
||||
to expose both graphics and compute device char files should be handled by
|
||||
two drivers that are connected using the auxiliary bus framework.
|
||||
|
||||
- Change the open callback in your driver fops structure to accel_open().
|
||||
Alternatively, your driver can use the DEFINE_DRM_ACCEL_FOPS macro to easily
|
||||
set the correct file operations structure, as in the sketch below.
|
||||
|
||||
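A minimal sketch of those two changes (a hypothetical ``foo`` driver; all
other drm_driver fields and the usual drm_dev_alloc()/drm_dev_register()
boilerplate are unchanged)::

    DEFINE_DRM_ACCEL_FOPS(foo_accel_fops);

    static const struct drm_driver foo_drm_driver = {
        .driver_features = DRIVER_COMPUTE_ACCEL,
        .fops            = &foo_accel_fops,
        .name            = "foo_accel",
    };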
External References
|
||||
===================
|
||||
|
||||
email threads
|
||||
-------------
|
||||
|
||||
* `Initial discussion on the New subsystem for acceleration devices <https://lkml.org/lkml/2022/7/31/83>`_ - Oded Gabbay (2022)
|
||||
* `patch-set to add the new subsystem <https://lkml.org/lkml/2022/10/22/544>`_ - Oded Gabbay (2022)
|
||||
|
||||
Conference talks
|
||||
----------------
|
||||
|
||||
* `LPC 2022 Accelerators BOF outcomes summary <https://airlied.blogspot.com/2022/09/accelerators-bof-outcomes-summary.html>`_ - Dave Airlie (2022)
|
@ -348,8 +348,13 @@ this can be accomplished with::
|
||||
|
||||
echo huge_idle > /sys/block/zramX/writeback
|
||||
|
||||
If a user chooses to writeback only incompressible pages (pages that none of
|
||||
the algorithms can compress) this can be accomplished with::
|
||||
|
||||
echo incompressible > /sys/block/zramX/writeback
|
||||
|
||||
If an admin wants to write a specific page in zram device to the backing device,
|
||||
they could write a page index into the interface.
|
||||
they could write a page index into the interface::
|
||||
|
||||
echo "page_index=1251" > /sys/block/zramX/writeback
|
||||
|
||||
@ -401,6 +406,87 @@ budget in next setting is user's job.
|
||||
If admin wants to measure writeback count in a certain period, they could
|
||||
know it via /sys/block/zram0/bd_stat's 3rd column.
|
||||
|
||||
recompression
|
||||
-------------
|
||||
|
||||
With CONFIG_ZRAM_MULTI_COMP, zram can recompress pages using alternative
|
||||
(secondary) compression algorithms. The basic idea is that alternative
|
||||
compression algorithm can provide better compression ratio at a price of
|
||||
(potentially) slower compression/decompression speeds. Alternative compression
|
||||
algorithm can, for example, be more successful compressing huge pages (those
|
||||
that default algorithm failed to compress). Another application is idle pages
|
||||
recompression - pages that are cold and sit in the memory can be recompressed
|
||||
using more effective algorithm and, hence, reduce zsmalloc memory usage.
|
||||
|
||||
With CONFIG_ZRAM_MULTI_COMP, zram supports up to 4 compression algorithms:
|
||||
one primary and up to 3 secondary ones. Primary zram compressor is explained
|
||||
in "3) Select compression algorithm", secondary algorithms are configured
|
||||
using recomp_algorithm device attribute.
|
||||
|
||||
Example:::
|
||||
|
||||
#show supported recompression algorithms
|
||||
cat /sys/block/zramX/recomp_algorithm
|
||||
#1: lzo lzo-rle lz4 lz4hc [zstd]
|
||||
#2: lzo lzo-rle lz4 [lz4hc] zstd
|
||||
|
||||
Alternative compression algorithms are sorted by priority. In the example
|
||||
above, zstd is used as the first alternative algorithm, which has priority
|
||||
of 1, while lz4hc is configured as a compression algorithm with priority 2.
|
||||
Alternative compression algorithm's priority is provided during algorithms
|
||||
configuration:::
|
||||
|
||||
#select zstd recompression algorithm, priority 1
|
||||
echo "algo=zstd priority=1" > /sys/block/zramX/recomp_algorithm
|
||||
|
||||
#select deflate recompression algorithm, priority 2
|
||||
echo "algo=deflate priority=2" > /sys/block/zramX/recomp_algorithm
|
||||
|
||||
Another device attribute that CONFIG_ZRAM_MULTI_COMP enables is recompress,
|
||||
which controls recompression.
|
||||
|
||||
Examples:::
|
||||
|
||||
#IDLE pages recompression is activated by `idle` mode
|
||||
echo "type=idle" > /sys/block/zramX/recompress
|
||||
|
||||
#HUGE pages recompression is activated by `huge` mode
|
||||
echo "type=huge" > /sys/block/zram0/recompress
|
||||
|
||||
#HUGE_IDLE pages recompression is activated by `huge_idle` mode
|
||||
echo "type=huge_idle" > /sys/block/zramX/recompress
|
||||
|
||||
The number of idle pages can be significant, so user-space can pass a size
|
||||
threshold (in bytes) to the recompress knob: zram will recompress only pages
|
||||
of equal or greater size:::
|
||||
|
||||
#recompress all pages larger than 3000 bytes
|
||||
echo "threshold=3000" > /sys/block/zramX/recompress
|
||||
|
||||
#recompress idle pages larger than 2000 bytes
|
||||
echo "type=idle threshold=2000" > /sys/block/zramX/recompress
|
||||
|
||||
Recompression of idle pages requires memory tracking.
|
||||
|
||||
During re-compression, for every page that matches the re-compression criteria,
|
||||
ZRAM iterates the list of registered alternative compression algorithms in
|
||||
order of their priorities. ZRAM stops either when re-compression was
|
||||
successful (re-compressed object is smaller in size than the original one)
|
||||
and matches re-compression criteria (e.g. size threshold) or when there are
|
||||
no secondary algorithms left to try. If none of the secondary algorithms can
|
||||
successfully re-compress the page, such a page is marked as incompressible,
|
||||
so ZRAM will not attempt to re-compress it in the future.
|
||||
|
||||
This re-compression behaviour, when it iterates through the list of
|
||||
registered compression algorithms, increases our chances of finding the
|
||||
algorithm that successfully compresses a particular page. Sometimes, however,
|
||||
it is convenient (and sometimes even necessary) to limit recompression to
|
||||
only one particular algorithm so that it will not try any other algorithms.
|
||||
This can be achieved by providing an algo=NAME parameter:::
|
||||
|
||||
#use zstd algorithm only (if registered)
|
||||
echo "type=huge algo=zstd" > /sys/block/zramX/recompress
|
||||
|
||||
memory tracking
|
||||
===============
|
||||
|
||||
@ -411,9 +497,11 @@ pages of the process with*pagemap.
|
||||
If you enable the feature, you could see block state via
|
||||
/sys/kernel/debug/zram/zram0/block_state". The output is as follows::
|
||||
|
||||
300 75.033841 .wh.
|
||||
301 63.806904 s...
|
||||
302 63.806919 ..hi
|
||||
300 75.033841 .wh...
|
||||
301 63.806904 s.....
|
||||
302 63.806919 ..hi..
|
||||
303 62.801919 ....r.
|
||||
304 146.781902 ..hi.n
|
||||
|
||||
First column
|
||||
zram's block index.
|
||||
@ -430,6 +518,10 @@ Third column
|
||||
huge page
|
||||
i:
|
||||
idle page
|
||||
r:
|
||||
recompressed page (secondary compression algorithm)
|
||||
n:
|
||||
none (including secondary) of algorithms could compress it
|
||||
|
||||
First line of above example says 300th block is accessed at 75.033841sec
|
||||
and the block's state is huge so it is written back to the backing
|
||||
|
@ -229,7 +229,7 @@ In addition to the kernel command line, the boot config can be used for
|
||||
passing the kernel parameters. All the key-value pairs under ``kernel``
|
||||
key will be passed to kernel cmdline directly. Moreover, the key-value
|
||||
pairs under ``init`` will be passed to init process via the cmdline.
|
||||
The parameters are concatinated with user-given kernel cmdline string
|
||||
The parameters are concatenated with user-given kernel cmdline string
|
||||
as the following order, so that the command line parameter can override
|
||||
bootconfig parameters (this depends on how the subsystem handles parameters
|
||||
but in general, earlier parameter will be overwritten by later one.)::
|
||||
|
@ -543,7 +543,8 @@ inactive_anon # of bytes of anonymous and swap cache memory on inactive
|
||||
LRU list.
|
||||
active_anon # of bytes of anonymous and swap cache memory on active
|
||||
LRU list.
|
||||
inactive_file # of bytes of file-backed memory on inactive LRU list.
|
||||
inactive_file # of bytes of file-backed memory and MADV_FREE anonymous memory
|
||||
(LazyFree pages) on inactive LRU list.
|
||||
active_file # of bytes of file-backed memory on active LRU list.
|
||||
unevictable # of bytes of memory that cannot be reclaimed (mlocked etc).
|
||||
=============== ===============================================================
|
||||
|
@ -1245,17 +1245,13 @@ PAGE_SIZE multiple when read back.
|
||||
This is a simple interface to trigger memory reclaim in the
|
||||
target cgroup.
|
||||
|
||||
This file accepts a single key, the number of bytes to reclaim.
|
||||
No nested keys are currently supported.
|
||||
This file accepts a string which contains the number of bytes to
|
||||
reclaim.
|
||||
|
||||
Example::
|
||||
|
||||
echo "1G" > memory.reclaim
|
||||
|
||||
The interface can be later extended with nested keys to
|
||||
configure the reclaim behavior. For example, specify the
|
||||
type of memory to reclaim from (anon, file, ..).
|
||||
|
||||
Please note that the kernel can over or under reclaim from
|
||||
the target cgroup. If less bytes are reclaimed than the
|
||||
specified amount, -EAGAIN is returned.
|
||||
@ -1267,6 +1263,13 @@ PAGE_SIZE multiple when read back.
|
||||
This means that the networking layer will not adapt based on
|
||||
reclaim induced by memory.reclaim.
|
||||
|
||||
This file also allows the user to specify the nodes to reclaim from,
|
||||
via the 'nodes=' key, for example::
|
||||
|
||||
echo "1G nodes=0,1" > memory.reclaim
|
||||
|
||||
The above instructs the kernel to reclaim memory from nodes 0,1.
|
||||
|
||||
memory.peak
|
||||
A read-only single value file which exists on non-root
|
||||
cgroups.
|
||||
@ -1488,12 +1491,18 @@ PAGE_SIZE multiple when read back.
|
||||
pgscan_direct (npn)
|
||||
Amount of scanned pages directly (in an inactive LRU list)
|
||||
|
||||
pgscan_khugepaged (npn)
|
||||
Amount of scanned pages by khugepaged (in an inactive LRU list)
|
||||
|
||||
pgsteal_kswapd (npn)
|
||||
Amount of reclaimed pages by kswapd
|
||||
|
||||
pgsteal_direct (npn)
|
||||
Amount of reclaimed pages directly
|
||||
|
||||
pgsteal_khugepaged (npn)
|
||||
Amount of reclaimed pages by khugepaged
|
||||
|
||||
pgfault (npn)
|
||||
Total number of page faults incurred
|
||||
|
||||
|
@ -858,7 +858,7 @@ CIFS kernel module parameters
|
||||
These module parameters can be specified or modified either during the time of
|
||||
module loading or during the runtime by using the interface::
|
||||
|
||||
/proc/module/cifs/parameters/<param>
|
||||
/sys/module/cifs/parameters/<param>
|
||||
|
||||
i.e.::
|
||||
|
||||
|
@ -123,3 +123,11 @@ Other examples (per target):
|
||||
0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256
|
||||
fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd
|
||||
51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584
|
||||
|
||||
For setups using device-mapper on top of asynchronously probed block
|
||||
devices (MMC, USB, ..), it may be necessary to tell dm-init to
|
||||
explicitly wait for them to become available before setting up the
|
||||
device-mapper tables. This can be done with the "dm-mod.waitfor="
|
||||
module parameter, which takes a list of devices to wait for::
|
||||
|
||||
dm-mod.waitfor=<device1>[,..,<deviceN>]
|
||||
|
@ -3080,6 +3080,11 @@
|
||||
...
|
||||
255 = /dev/osd255 256th OSD Device
|
||||
|
||||
261 char Compute Acceleration Devices
|
||||
0 = /dev/accel/accel0 First acceleration device
|
||||
1 = /dev/accel/accel1 Second acceleration device
|
||||
...
|
||||
|
||||
384-511 char RESERVED FOR DYNAMIC ASSIGNMENT
|
||||
Character devices that request a dynamic allocation of major
|
||||
number will take numbers starting from 511 and downward,
|
||||
|
@ -1,6 +1,6 @@
|
||||
==========================================================
|
||||
Linux support for random number generator in i8xx chipsets
|
||||
==========================================================
|
||||
=================================
|
||||
Hardware random number generators
|
||||
=================================
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
@ -595,3 +595,32 @@ X2TLB
|
||||
-----
|
||||
|
||||
Indicates whether the crashed kernel enabled SH extended mode.
|
||||
|
||||
RISCV64
|
||||
=======
|
||||
|
||||
VA_BITS
|
||||
-------
|
||||
|
||||
The maximum number of bits for virtual addresses. Used to compute the
|
||||
virtual memory ranges.
|
||||
|
||||
PAGE_OFFSET
|
||||
-----------
|
||||
|
||||
Indicates the virtual kernel start address of the direct-mapped RAM region.
|
||||
|
||||
phys_ram_base
|
||||
-------------
|
||||
|
||||
Indicates the start physical RAM address.
|
||||
|
||||
MODULES_VADDR|MODULES_END|VMALLOC_START|VMALLOC_END|VMEMMAP_START|VMEMMAP_END|KERNEL_LINK_ADDR
|
||||
----------------------------------------------------------------------------------------------
|
||||
|
||||
Used to get the correct ranges:
|
||||
|
||||
* MODULES_VADDR ~ MODULES_END : Kernel module space.
|
||||
* VMALLOC_START ~ VMALLOC_END : vmalloc() / ioremap() space.
|
||||
* VMEMMAP_START ~ VMEMMAP_END : vmemmap space, used for struct page array.
|
||||
* KERNEL_LINK_ADDR : start address of Kernel link and BPF
|
||||
|
@ -703,6 +703,17 @@
|
||||
condev= [HW,S390] console device
|
||||
conmode=
|
||||
|
||||
con3215_drop= [S390] 3215 console drop mode.
|
||||
Format: y|n|Y|N|1|0
|
||||
When set to true, drop data on the 3215 console when
|
||||
the console buffer is full. In this case the
|
||||
operator using a 3270 terminal emulator (for example
|
||||
x3270) does not have to enter the clear key for the
|
||||
console output to advance and the kernel to continue.
|
||||
This leads to a much faster boot time when a 3270
|
||||
terminal emulator is active. If no 3270 terminal
|
||||
emulator is used, this parameter has no effect.
|
||||
|
||||
console= [KNL] Output console device and options.
|
||||
|
||||
tty<n> Use the virtual console device <n>.
|
||||
@ -831,7 +842,7 @@
|
||||
memory region [offset, offset + size] for that kernel
|
||||
image. If '@offset' is omitted, then a suitable offset
|
||||
is selected automatically.
|
||||
[KNL, X86-64] Select a region under 4G first, and
|
||||
[KNL, X86-64, ARM64] Select a region under 4G first, and
|
||||
fall back to reserve region above 4G when '@offset'
|
||||
hasn't been specified.
|
||||
See Documentation/admin-guide/kdump/kdump.rst for further details.
|
||||
@ -851,26 +862,23 @@
|
||||
available.
|
||||
It will be ignored if crashkernel=X is specified.
|
||||
crashkernel=size[KMG],low
|
||||
[KNL, X86-64] range under 4G. When crashkernel=X,high
|
||||
[KNL, X86-64, ARM64] range under 4G. When crashkernel=X,high
|
||||
is passed, the kernel could allocate the physical memory region
|
||||
above 4G, which causes the second kernel to crash on systems
|
||||
that require some amount of low memory, e.g. swiotlb
|
||||
requires at least 64M+32K of low memory, and enough extra
|
||||
low memory is needed to make sure DMA buffers for 32-bit
|
||||
devices won't run out. The kernel would try to allocate
|
||||
at least 256M below 4G automatically.
|
||||
a default size of memory below 4G automatically. The default
|
||||
size is platform dependent.
|
||||
--> x86: max(swiotlb_size_or_default() + 8MiB, 256MiB)
|
||||
--> arm64: 128MiB
|
||||
This one lets the user specify own low range under 4G
|
||||
for second kernel instead.
|
||||
0: to disable low allocation.
|
||||
It will be ignored when crashkernel=X,high is not used
|
||||
or memory reserved is below 4G.
|
||||
|
||||
[KNL, ARM64] range in low memory.
|
||||
This one lets the user specify a low range in the
|
||||
DMA zone for the crash dump kernel.
|
||||
It will be ignored when crashkernel=X,high is not used
|
||||
or memory reserved is located in the DMA zones.
|
||||
|
||||
cryptomgr.notests
|
||||
[KNL] Disable crypto self-tests
|
||||
|
||||
@ -3777,12 +3785,15 @@
|
||||
shutdown the other cpus. Instead use the REBOOT_VECTOR
|
||||
irq.
|
||||
|
||||
nomodeset Disable kernel modesetting. DRM drivers will not perform
|
||||
display-mode changes or accelerated rendering. Only the
|
||||
system framebuffer will be available for use if this was
|
||||
set-up by the firmware or boot loader.
|
||||
nomodeset Disable kernel modesetting. Most systems' firmware
|
||||
sets up a display mode and provides framebuffer memory
|
||||
for output. With nomodeset, DRM and fbdev drivers will
|
||||
not load if they could possibly displace the pre-
|
||||
initialized output. Only the system framebuffer will
|
||||
be available for use. The respective drivers will not
|
||||
perform display-mode changes or accelerated rendering.
|
||||
|
||||
Useful as fallback, or for testing and debugging.
|
||||
Useful as error fallback, or for testing and debugging.
|
||||
|
||||
nomodule Disable module load
|
||||
|
||||
@ -4566,17 +4577,15 @@
|
||||
|
||||
ramdisk_start= [RAM] RAM disk image start address
|
||||
|
||||
random.trust_cpu={on,off}
|
||||
[KNL] Enable or disable trusting the use of the
|
||||
CPU's random number generator (if available) to
|
||||
fully seed the kernel's CRNG. Default is controlled
|
||||
by CONFIG_RANDOM_TRUST_CPU.
|
||||
random.trust_cpu=off
|
||||
[KNL] Disable trusting the use of the CPU's
|
||||
random number generator (if available) to
|
||||
initialize the kernel's RNG.
|
||||
|
||||
random.trust_bootloader={on,off}
|
||||
[KNL] Enable or disable trusting the use of a
|
||||
seed passed by the bootloader (if available) to
|
||||
fully seed the kernel's CRNG. Default is controlled
|
||||
by CONFIG_RANDOM_TRUST_BOOTLOADER.
|
||||
random.trust_bootloader=off
|
||||
[KNL] Disable trusting the use of the a seed
|
||||
passed by the bootloader (if available) to
|
||||
initialize the kernel's RNG.
|
||||
|
||||
randomize_kstack_offset=
|
||||
[KNL] Enable or disable kernel stack offset
|
||||
@ -6257,6 +6266,25 @@
|
||||
See also Documentation/trace/ftrace.rst "trace options"
|
||||
section.
|
||||
|
||||
trace_trigger=[trigger-list]
|
||||
[FTRACE] Add an event trigger on specific events.
|
||||
Set a trigger on top of a specific event, with an optional
|
||||
filter.
|
||||
|
||||
The format is "trace_trigger=<event>.<trigger>[ if <filter>],..."
|
||||
More than one trigger may be specified, separated by commas.
|
||||
|
||||
For example:
|
||||
|
||||
trace_trigger="sched_switch.stacktrace if prev_state == 2"
|
||||
|
||||
The above will enable the "stacktrace" trigger on the "sched_switch"
|
||||
event but only trigger it if the "prev_state" of the "sched_switch"
|
||||
event is "2" (TASK_UNINTERUPTIBLE).
|
||||
|
||||
See also "Event triggers" in Documentation/trace/events.rst
|
||||
|
||||
|
||||
traceoff_on_warning
|
||||
[FTRACE] enable this option to disable tracing when a
|
||||
warning is hit. This turns off "tracing_on". Tracing can
|
||||
@ -6959,3 +6987,14 @@
|
||||
memory, and other data can't be written using
|
||||
xmon commands.
|
||||
off xmon is disabled.
|
||||
|
||||
amd_pstate= [X86]
|
||||
disable
|
||||
Do not enable amd_pstate as the default
|
||||
scaling driver for the supported processors
|
||||
passive
|
||||
Use amd_pstate as a scaling driver, driver requests a
|
||||
desired performance on this abstract scale and the power
|
||||
management firmware translates the requests into actual
|
||||
hardware states (core frequency, data fabric and memory
|
||||
clocks etc.)
|
||||
|
@ -1,10 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=================================
|
||||
CEC driver-specific documentation
|
||||
=================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
pulse8-cec
|
369
Documentation/admin-guide/media/cec.rst
Normal file
369
Documentation/admin-guide/media/cec.rst
Normal file
@ -0,0 +1,369 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
========
|
||||
HDMI CEC
|
||||
========
|
||||
|
||||
Supported hardware in mainline
|
||||
==============================
|
||||
|
||||
HDMI Transmitters:
|
||||
|
||||
- Exynos4
|
||||
- Exynos5
|
||||
- STIH4xx HDMI CEC
|
||||
- V4L2 adv7511 (same HW, but a different driver from the drm adv7511)
|
||||
- stm32
|
||||
- Allwinner A10 (sun4i)
|
||||
- Raspberry Pi
|
||||
- dw-hdmi (Synopsys IP)
|
||||
- amlogic (meson ao-cec and ao-cec-g12a)
|
||||
- drm adv7511/adv7533
|
||||
- omap4
|
||||
- tegra
|
||||
- rk3288, rk3399
|
||||
- tda998x
|
||||
- DisplayPort CEC-Tunneling-over-AUX on i915, nouveau and amdgpu
|
||||
- ChromeOS EC CEC
|
||||
- CEC for SECO boards (UDOO x86).
|
||||
- Chrontel CH7322
|
||||
|
||||
|
||||
HDMI Receivers:
|
||||
|
||||
- adv7604/11/12
|
||||
- adv7842
|
||||
- tc358743
|
||||
|
||||
USB Dongles (see below for additional information on how to use these
|
||||
dongles):
|
||||
|
||||
- Pulse-Eight: the pulse8-cec driver implements the following module option:
|
||||
``persistent_config``: by default this is off, but when set to 1 the driver
|
||||
will store the current settings to the device's internal eeprom and restore
|
||||
it the next time the device is connected to the USB port.
|
||||
- RainShadow Tech. Note: this driver does not support the persistent_config
|
||||
module option of the Pulse-Eight driver. The hardware supports it, but I
|
||||
have no plans to add this feature. But I accept patches :-)
|
||||
|
||||
Miscellaneous:
|
||||
|
||||
- vivid: emulates a CEC receiver and CEC transmitter.
|
||||
Can be used to test CEC applications without actual CEC hardware.
|
||||
|
||||
- cec-gpio. If the CEC pin is hooked up to a GPIO pin then
|
||||
you can control the CEC line through this driver. This supports error
|
||||
injection as well.
|
||||
|
||||
|
||||
Utilities
|
||||
=========
|
||||
|
||||
Utilities are available here: https://git.linuxtv.org/v4l-utils.git
|
||||
|
||||
``utils/cec-ctl``: control a CEC device
|
||||
|
||||
``utils/cec-compliance``: test compliance of a remote CEC device
|
||||
|
||||
``utils/cec-follower``: emulate a CEC follower device
|
||||
|
||||
Note that ``cec-ctl`` has support for the CEC Hospitality Profile as is
|
||||
used in some hotel displays. See http://www.htng.org.
|
||||
|
||||
Note that the libcec library (https://github.com/Pulse-Eight/libcec) supports
|
||||
the Linux CEC framework.
|
||||
|
||||
If you want to get the CEC specification, then look at the References of
|
||||
the HDMI wikipedia page: https://en.wikipedia.org/wiki/HDMI. CEC is part
|
||||
of the HDMI specification. HDMI 1.3 is freely available (very similar to
|
||||
HDMI 1.4 w.r.t. CEC) and should be good enough for most things.
|
||||
|
||||
|
||||
DisplayPort to HDMI Adapters with working CEC
|
||||
=============================================
|
||||
|
||||
Background: most adapters do not support the CEC Tunneling feature,
|
||||
and of those that do many did not actually connect the CEC pin.
|
||||
Unfortunately, this means that while a CEC device is created, it
|
||||
is actually all alone in the world and will never be able to see other
|
||||
CEC devices.
|
||||
|
||||
This is a list of known working adapters that have CEC Tunneling AND
|
||||
that properly connected the CEC pin. If you find adapters that work
|
||||
but are not in this list, then drop me a note.
|
||||
|
||||
To test: hook up your DP-to-HDMI adapter to a CEC capable device
|
||||
(typically a TV), then run::
|
||||
|
||||
cec-ctl --playback # Configure the PC as a CEC Playback device
|
||||
cec-ctl -S # Show the CEC topology
|
||||
|
||||
The ``cec-ctl -S`` command should show at least two CEC devices,
|
||||
ourselves and the CEC device you are connected to (i.e. typically the TV).
|
||||
|
||||
General note: I have only seen this work with the Parade PS175, PS176 and
|
||||
PS186 chipsets and the MegaChips 2900. While MegaChips 28x0 claims CEC support,
|
||||
I have never seen it work.
|
||||
|
||||
USB-C to HDMI
|
||||
-------------
|
||||
|
||||
Samsung Multiport Adapter EE-PW700: https://www.samsung.com/ie/support/model/EE-PW700BBEGWW/
|
||||
|
||||
Kramer ADC-U31C/HF: https://www.kramerav.com/product/ADC-U31C/HF
|
||||
|
||||
Club3D CAC-2504: https://www.club-3d.com/en/detail/2449/usb_3.1_type_c_to_hdmi_2.0_uhd_4k_60hz_active_adapter/
|
||||
|
||||
DisplayPort to HDMI
|
||||
-------------------
|
||||
|
||||
Club3D CAC-1080: https://www.club-3d.com/en/detail/2442/displayport_1.4_to_hdmi_2.0b_hdr/
|
||||
|
||||
CableCreation (SKU: CD0712): https://www.cablecreation.com/products/active-displayport-to-hdmi-adapter-4k-hdr
|
||||
|
||||
HP DisplayPort to HDMI True 4k Adapter (P/N 2JA63AA): https://www.hp.com/us-en/shop/pdp/hp-displayport-to-hdmi-true-4k-adapter
|
||||
|
||||
Mini-DisplayPort to HDMI
|
||||
------------------------
|
||||
|
||||
Club3D CAC-1180: https://www.club-3d.com/en/detail/2443/mini_displayport_1.4_to_hdmi_2.0b_hdr/
|
||||
|
||||
Note that passive adapters will never work, you need an active adapter.
|
||||
|
||||
The Club3D adapters in this list are all MegaChips 2900 based. Other Club3D adapters
|
||||
are PS176 based and do NOT have the CEC pin hooked up, so only the three Club3D
|
||||
adapters above are known to work.
|
||||
|
||||
I suspect that MegaChips 2900 based designs in general are likely to work
|
||||
whereas with the PS176 it is more hit-and-miss (mostly miss). The PS186 is
|
||||
likely to have the CEC pin hooked up; it looks like they changed the reference
|
||||
design for that chipset.
|
||||
|
||||
|
||||
USB CEC Dongles
|
||||
===============
|
||||
|
||||
These dongles appear as ``/dev/ttyACMX`` devices and need the ``inputattach``
|
||||
utility to create the ``/dev/cecX`` devices. Support for the Pulse-Eight
|
||||
has been added to ``inputattach`` 1.6.0. Support for the Rainshadow Tech has
|
||||
been added to ``inputattach`` 1.6.1.
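To try a dongle by hand before setting up the udev rules and systemd services
below, ``inputattach`` can be run directly (a sketch; ``/dev/ttyACM0`` is just
an example device node and may differ on your system)::

    sudo inputattach --pulse8-cec /dev/ttyACM0        # Pulse-Eight dongle
    sudo inputattach --rainshadow-cec /dev/ttyACM0    # RainShadow Tech dongle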
|
||||
|
||||
You also need udev rules to automatically start systemd services::
|
||||
|
||||
SUBSYSTEM=="tty", KERNEL=="ttyACM[0-9]*", ATTRS{idVendor}=="2548", ATTRS{idProduct}=="1002", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}+="pulse8-cec-inputattach@%k.service"
|
||||
SUBSYSTEM=="tty", KERNEL=="ttyACM[0-9]*", ATTRS{idVendor}=="2548", ATTRS{idProduct}=="1001", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}+="pulse8-cec-inputattach@%k.service"
|
||||
SUBSYSTEM=="tty", KERNEL=="ttyACM[0-9]*", ATTRS{idVendor}=="04d8", ATTRS{idProduct}=="ff59", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}+="rainshadow-cec-inputattach@%k.service"
|
||||
|
||||
and these systemd services:
|
||||
|
||||
For Pulse-Eight make /lib/systemd/system/pulse8-cec-inputattach@.service::
|
||||
|
||||
[Unit]
|
||||
Description=inputattach for pulse8-cec device on %I
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/bin/inputattach --pulse8-cec /dev/%I
|
||||
|
||||
For the RainShadow Tech make /lib/systemd/system/rainshadow-cec-inputattach@.service::
|
||||
|
||||
[Unit]
|
||||
Description=inputattach for rainshadow-cec device on %I
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/bin/inputattach --rainshadow-cec /dev/%I
|
||||
|
||||
|
||||
For proper suspend/resume support create: /lib/systemd/system/restart-cec-inputattach.service::
|
||||
|
||||
[Unit]
|
||||
Description=restart inputattach for cec devices
|
||||
After=suspend.target
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
ExecStart=/bin/bash -c 'for d in /dev/serial/by-id/usb-Pulse-Eight*; do /usr/bin/inputattach --daemon --pulse8-cec $d; done; for d in /dev/serial/by-id/usb-RainShadow_Tech*; do /usr/bin/inputattach --daemon --rainshadow-cec $d; done'
|
||||
|
||||
[Install]
|
||||
WantedBy=suspend.target
|
||||
|
||||
And run ``systemctl enable restart-cec-inputattach``.
|
||||
|
||||
To automatically set the physical address of the CEC device whenever the
|
||||
EDID changes, you can use ``cec-ctl`` with the ``-E`` option::
|
||||
|
||||
cec-ctl -E /sys/class/drm/card0-DP-1/edid
|
||||
|
||||
This assumes the dongle is connected to the card0-DP-1 output (``xrandr`` will tell
|
||||
you which output is used) and it will poll for changes to the EDID and update
|
||||
the Physical Address whenever they occur.
|
||||
|
||||
To automatically run this command you can use cron. Edit crontab with
|
||||
``crontab -e`` and add this line::
|
||||
|
||||
@reboot /usr/local/bin/cec-ctl -E /sys/class/drm/card0-DP-1/edid
|
||||
|
||||
This only works for display drivers that expose the EDID in ``/sys/class/drm``,
|
||||
such as the i915 driver.
|
||||
|
||||
|
||||
CEC Without HPD
|
||||
===============
|
||||
|
||||
Some displays when in standby mode have no HDMI Hotplug Detect signal, but
|
||||
CEC is still enabled so connected devices can send an <Image View On> CEC
|
||||
message in order to wake up such displays. Unfortunately, not all CEC
|
||||
adapters can support this. An example is the Odroid-U3 SBC that has a
|
||||
level-shifter that is powered off when the HPD signal is low, thus
|
||||
blocking the CEC pin. Even though the SoC can use CEC without a HPD,
|
||||
the level-shifter will prevent this from functioning.
|
||||
|
||||
There is a CEC capability flag to signal this: ``CEC_CAP_NEEDS_HPD``.
|
||||
If set, then the hardware cannot wake up displays with this behavior.
|
||||
|
||||
Note for CEC application implementers: the <Image View On> message must
|
||||
be the first message you send; don't send any other messages before it.
|
||||
Certain very bad but unfortunately not uncommon CEC implementations
|
||||
get very confused if they receive anything else but this message and
|
||||
they won't wake up.
|
||||
|
||||
When writing a driver it can be tricky to test this. There are two
|
||||
ways to do this:
|
||||
|
||||
1) Get a Pulse-Eight USB CEC dongle, connect an HDMI cable from your
|
||||
device to the Pulse-Eight, but do not connect the Pulse-Eight to
|
||||
the display.
|
||||
|
||||
Now configure the Pulse-Eight dongle::
|
||||
|
||||
cec-ctl -p0.0.0.0 --tv
|
||||
|
||||
and start monitoring::
|
||||
|
||||
sudo cec-ctl -M
|
||||
|
||||
On the device you are testing run::
|
||||
|
||||
cec-ctl --playback
|
||||
|
||||
It should report a physical address of f.f.f.f. Now run this
|
||||
command::
|
||||
|
||||
cec-ctl -t0 --image-view-on
|
||||
|
||||
The Pulse-Eight should see the <Image View On> message. If not,
|
||||
then something (hardware and/or software) is preventing the CEC
|
||||
message from going out.
|
||||
|
||||
To make sure you have the wiring correct just connect the
|
||||
Pulse-Eight to a CEC-enabled display and run the same command
|
||||
on your device: now there is a HPD, so you should see the command
|
||||
arriving at the Pulse-Eight.
|
||||
|
||||
2) If you have another linux device supporting CEC without HPD, then
|
||||
you can just connect your device to that device. Yes, you can connect
|
||||
two HDMI outputs together. You won't have a HPD (which is what we
|
||||
want for this test), but the second device can monitor the CEC pin.
|
||||
|
||||
Otherwise use the same commands as in 1.
|
||||
|
||||
If CEC messages do not come through when there is no HPD, then you
|
||||
need to figure out why. Typically it is either a hardware restriction
|
||||
or the software powers off the CEC core when the HPD goes low. The
|
||||
first cannot be corrected of course; the second will likely require
|
||||
driver changes.
|
||||
|
||||
|
||||
Microcontrollers & CEC
|
||||
======================
|
||||
|
||||
We have seen some CEC implementations in displays that use a microcontroller
|
||||
to sample the bus. This does not have to be a problem, but some implementations
|
||||
have timing issues. This is hard to discover unless you can hook up a low-level
|
||||
CEC debugger (see the next section).
|
||||
|
||||
You will see cases where the CEC transmitter holds the CEC line high or low for
|
||||
a longer time than is allowed. For directed messages this is not a problem since
|
||||
if that happens the message will not be Acked and it will be retransmitted.
|
||||
For broadcast messages no such mechanism exists.
|
||||
|
||||
It's not clear what to do about this. It is probably wise to transmit some
|
||||
broadcast messages twice to reduce the chance of them being lost. Specifically
|
||||
<Standby> and <Active Source> are candidates for that.
|
||||
|
||||
|
||||
Making a CEC debugger
|
||||
=====================
|
||||
|
||||
By using a Raspberry Pi 2B/3/4 and some cheap components you can make
|
||||
your own low-level CEC debugger.
|
||||
|
||||
Here is a picture of my setup:
|
||||
|
||||
https://hverkuil.home.xs4all.nl/rpi3-cec.jpg
|
||||
|
||||
It's a Raspberry Pi 3 together with a breadboard and some breadboard wires:
|
||||
|
||||
http://www.dx.com/p/diy-40p-male-to-female-male-to-male-female-to-female-dupont-line-wire-3pcs-356089#.WYLOOXWGN7I
|
||||
|
||||
Finally, one of these HDMI female-female passthrough connectors (full soldering type 1):
|
||||
|
||||
https://elabbay.myshopify.com/collections/camera/products/hdmi-af-af-v1a-hdmi-type-a-female-to-hdmi-type-a-female-pass-through-adapter-breakout-board?variant=45533926147
|
||||
|
||||
We've tested this and it works up to 4kp30 (297 MHz). The quality is not high
|
||||
enough to pass-through 4kp60 (594 MHz).
|
||||
|
||||
I also added an RTC and a breakout shield:
|
||||
|
||||
https://www.amazon.com/Makerfire%C2%AE-Raspberry-Module-DS1307-Battery/dp/B00ZOXWHK4
|
||||
|
||||
https://www.dx.com/p/raspberry-pi-gpio-expansion-board-breadboard-easy-multiplexing-board-one-to-three-with-screw-for-raspberry-pi-2-3-b-b-2729992.html#.YGRCG0MzZ7I
|
||||
|
||||
These two are not needed but they make life a bit easier.
|
||||
|
||||
If you want to monitor the HPD line as well, then you need one of these
|
||||
level shifters:
|
||||
|
||||
https://www.adafruit.com/product/757
|
||||
|
||||
(This is just where I got these components, there are many other places you
|
||||
can get similar things).
|
||||
|
||||
The CEC pin of the HDMI connector needs to be connected to these pins:
|
||||
CE0/IO8 and CE1/IO7 (pull-up GPIOs). The (optional) HPD pin of the HDMI
|
||||
connector should be connected (via a level shifter to convert the 5V
|
||||
to 3.3V) to these pins: IO17 and IO27. The (optional) 5V pin of the HDMI
|
||||
connector should be connected (via a level shifter) to these pins: IO22
|
||||
and IO24. Monitoring the HPD and 5V lines is not necessary, but it is helpful.
|
||||
|
||||
This kernel patch will hook up the cec-gpio driver correctly to
|
||||
e.g. ``arch/arm/boot/dts/bcm2837-rpi-3-b-plus.dts``::
|
||||
|
||||
cec-gpio@7 {
|
||||
compatible = "cec-gpio";
|
||||
cec-gpios = <&gpio 7 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
|
||||
hpd-gpios = <&gpio 17 GPIO_ACTIVE_HIGH>;
|
||||
v5-gpios = <&gpio 22 GPIO_ACTIVE_HIGH>;
|
||||
};
|
||||
|
||||
cec-gpio@8 {
|
||||
compatible = "cec-gpio";
|
||||
cec-gpios = <&gpio 8 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
|
||||
hpd-gpios = <&gpio 27 GPIO_ACTIVE_HIGH>;
|
||||
v5-gpios = <&gpio 24 GPIO_ACTIVE_HIGH>;
|
||||
};
|
||||
|
||||
This dts change will enable two cec GPIO devices: I typically use one to
|
||||
send/receive CEC commands and the other to monitor. If you monitor using
|
||||
an unconfigured CEC adapter then it will use GPIO interrupts which makes
|
||||
monitoring very accurate.
|
||||
|
||||
The documentation on how to use the error injection is here: :ref:`cec_pin_error_inj`.
|
||||
|
||||
``cec-ctl --monitor-pin`` will do low-level CEC bus sniffing and analysis.
|
||||
You can also store the CEC traffic to file using ``--store-pin`` and analyze
|
||||
it later using ``--analyze-pin``.
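For example, a capture-then-analyze session could look like this (a sketch: the
file name is arbitrary, and the exact option syntax should be checked against
``cec-ctl --help``)::

    cec-ctl --monitor-pin --store-pin traffic.bin   # sniff the bus and store the pin events
    cec-ctl --analyze-pin traffic.bin               # analyze the stored traffic later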
|
||||
|
||||
You can also use this as a full-fledged CEC device by configuring it
|
||||
using ``cec-ctl --tv -p0.0.0.0`` or ``cec-ctl --playback -p1.0.0.0``.
|
@ -38,13 +38,14 @@ The media subsystem
|
||||
|
||||
remote-controller
|
||||
|
||||
cec
|
||||
|
||||
dvb
|
||||
|
||||
cardlist
|
||||
|
||||
v4l-drivers
|
||||
dvb-drivers
|
||||
cec-drivers
|
||||
|
||||
**Copyright** |copy| 1999-2020 : LinuxTV Developers
|
||||
|
||||
|
@ -1,13 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Pulse-Eight CEC Adapter driver
|
||||
==============================
|
||||
|
||||
The pulse8-cec driver implements the following module option:
|
||||
|
||||
``persistent_config``
|
||||
---------------------
|
||||
|
||||
By default this is off, but when set to 1 the driver will store the current
|
||||
settings to the device's internal eeprom and restore it the next time the
|
||||
device is connected to the USB port.
|
@ -31,4 +31,5 @@ Video4Linux (V4L) driver-specific documentation
|
||||
si4713
|
||||
si476x
|
||||
vimc
|
||||
visl
|
||||
vivid
|
||||
|
@ -35,11 +35,11 @@ of commands fits for the default topology:
|
||||
|
||||
media-ctl -d platform:vimc -V '"Sensor A":0[fmt:SBGGR8_1X8/640x480]'
|
||||
media-ctl -d platform:vimc -V '"Debayer A":0[fmt:SBGGR8_1X8/640x480]'
|
||||
media-ctl -d platform:vimc -V '"Sensor B":0[fmt:SBGGR8_1X8/640x480]'
|
||||
media-ctl -d platform:vimc -V '"Debayer B":0[fmt:SBGGR8_1X8/640x480]'
|
||||
v4l2-ctl -z platform:vimc -d "RGB/YUV Capture" -v width=1920,height=1440
|
||||
media-ctl -d platform:vimc -V '"Scaler":0[fmt:RGB888_1X24/640x480]'
|
||||
media-ctl -d platform:vimc -V '"Scaler":0[crop:(100,50)/400x150]'
|
||||
media-ctl -d platform:vimc -V '"Scaler":1[fmt:RGB888_1X24/300x700]'
|
||||
v4l2-ctl -z platform:vimc -d "RGB/YUV Capture" -v width=300,height=700
|
||||
v4l2-ctl -z platform:vimc -d "Raw Capture 0" -v pixelformat=BA81
|
||||
v4l2-ctl -z platform:vimc -d "Raw Capture 1" -v pixelformat=BA81
|
||||
|
||||
Subdevices
|
||||
----------
|
||||
|
175
Documentation/admin-guide/media/visl.rst
Normal file
@ -0,0 +1,175 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
The Virtual Stateless Decoder Driver (visl)
|
||||
===========================================
|
||||
|
||||
A virtual stateless decoder device for stateless uAPI development
|
||||
purposes.
|
||||
|
||||
This tool's objective is to help the development and testing of
|
||||
userspace applications that use the V4L2 stateless API to decode media.
|
||||
|
||||
A userspace implementation can use visl to run a decoding loop even when
|
||||
no hardware is available or when the kernel uAPI for the codec has not
|
||||
been upstreamed yet. This can reveal bugs at an early stage.
|
||||
|
||||
This driver can also trace the contents of the V4L2 controls submitted
|
||||
to it. It can also dump the contents of the vb2 buffers through a
|
||||
debugfs interface. This is in many ways similar to the tracing
|
||||
infrastructure available for other popular encode/decode APIs out there
|
||||
and can help develop a userspace application by using another (working)
|
||||
one as a reference.
|
||||
|
||||
.. note::
|
||||
|
||||
No actual decoding of video frames is performed by visl. The
|
||||
V4L2 test pattern generator is used to write various debug information
|
||||
to the capture buffers instead.
|
||||
|
||||
Module parameters
|
||||
-----------------
|
||||
|
||||
- visl_debug: Activates debug info, printing various debug messages through
|
||||
dprintk. Also controls whether per-frame debug info is shown. Defaults to off.
|
||||
Note that enabling this feature can result in slow performance through serial.
|
||||
|
||||
- visl_transtime_ms: Simulated process time in milliseconds. Slowing down the
|
||||
decoding speed can be useful for debugging.
|
||||
|
||||
- visl_dprintk_frame_start, visl_dprintk_frame_nframes: Dictates a range of
|
||||
frames where dprintk is activated. This only controls the dprintk tracing on a
|
||||
per-frame basis. Note that printing a lot of data can be slow through serial.
|
||||
|
||||
- keep_bitstream_buffers: Controls whether bitstream (i.e. OUTPUT) buffers are
|
||||
kept after a decoding session. Defaults to false so as to reduce the amount of
|
||||
clutter. keep_bitstream_buffers == false works well when live debugging the
|
||||
client program with GDB.
|
||||
|
||||
- bitstream_trace_frame_start, bitstream_trace_nframes: Similar to
|
||||
visl_dprintk_frame_start, visl_dprintk_frame_nframes, but controls the dumping of
|
||||
buffer data through debugfs instead.
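As an illustration of the parameters above, the driver might be loaded like this
(a sketch: it assumes visl is built as a module, and the values are arbitrary):

.. code-block:: bash

   # enable per-frame debug output and simulate a 33 ms decode time per frame
   modprobe visl visl_debug=1 visl_transtime_ms=33 keep_bitstream_buffers=1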
|
||||
|
||||
What is the default use case for this driver?
|
||||
---------------------------------------------
|
||||
|
||||
This driver can be used as a way to compare different userspace implementations.
|
||||
This assumes that a working client is run against visl and that the ftrace and
|
||||
OUTPUT buffer data is subsequently used to debug a work-in-progress
|
||||
implementation.
|
||||
|
||||
Information on reference frames, their timestamps, the status of the OUTPUT and
|
||||
CAPTURE queues and more can be read directly from the CAPTURE buffers.
|
||||
|
||||
Supported codecs
|
||||
----------------
|
||||
|
||||
The following codecs are supported:
|
||||
|
||||
- FWHT
|
||||
- MPEG2
|
||||
- VP8
|
||||
- VP9
|
||||
- H.264
|
||||
- HEVC
|
||||
|
||||
visl trace events
|
||||
-----------------
|
||||
The trace events are defined on a per-codec basis, e.g.:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ls /sys/kernel/debug/tracing/events/ | grep visl
|
||||
visl_fwht_controls
|
||||
visl_h264_controls
|
||||
visl_hevc_controls
|
||||
visl_mpeg2_controls
|
||||
visl_vp8_controls
|
||||
visl_vp9_controls
|
||||
|
||||
For example, in order to dump HEVC SPS data:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ echo 1 > /sys/kernel/debug/tracing/events/visl_hevc_controls/v4l2_ctrl_hevc_sps/enable
|
||||
|
||||
The SPS data will be dumped to the trace buffer, i.e.:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ cat /sys/kernel/debug/tracing/trace
|
||||
video_parameter_set_id 0
|
||||
seq_parameter_set_id 0
|
||||
pic_width_in_luma_samples 1920
|
||||
pic_height_in_luma_samples 1080
|
||||
bit_depth_luma_minus8 0
|
||||
bit_depth_chroma_minus8 0
|
||||
log2_max_pic_order_cnt_lsb_minus4 4
|
||||
sps_max_dec_pic_buffering_minus1 6
|
||||
sps_max_num_reorder_pics 2
|
||||
sps_max_latency_increase_plus1 0
|
||||
log2_min_luma_coding_block_size_minus3 0
|
||||
log2_diff_max_min_luma_coding_block_size 3
|
||||
log2_min_luma_transform_block_size_minus2 0
|
||||
log2_diff_max_min_luma_transform_block_size 3
|
||||
max_transform_hierarchy_depth_inter 2
|
||||
max_transform_hierarchy_depth_intra 2
|
||||
pcm_sample_bit_depth_luma_minus1 0
|
||||
pcm_sample_bit_depth_chroma_minus1 0
|
||||
log2_min_pcm_luma_coding_block_size_minus3 0
|
||||
log2_diff_max_min_pcm_luma_coding_block_size 0
|
||||
num_short_term_ref_pic_sets 0
|
||||
num_long_term_ref_pics_sps 0
|
||||
chroma_format_idc 1
|
||||
sps_max_sub_layers_minus1 0
|
||||
flags AMP_ENABLED|SAMPLE_ADAPTIVE_OFFSET|TEMPORAL_MVP_ENABLED|STRONG_INTRA_SMOOTHING_ENABLED
|
||||
|
||||
|
||||
Dumping OUTPUT buffer data through debugfs
|
||||
------------------------------------------
|
||||
|
||||
If the **VISL_DEBUGFS** Kconfig is enabled, visl will populate
|
||||
**/sys/kernel/debug/visl/bitstream** with OUTPUT buffer data according to the
|
||||
values of bitstream_trace_frame_start and bitstream_trace_nframes. This can
|
||||
highlight errors as broken clients may fail to fill the buffers properly.
|
||||
|
||||
A single file is created for each processed OUTPUT buffer. Its name contains an
|
||||
integer that denotes the buffer sequence, i.e.:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
snprintf(name, 32, "bitstream%d", run->src->sequence);
|
||||
|
||||
Dumping the values is simply a matter of reading from the file, i.e.:
|
||||
|
||||
For the buffer with sequence == 0:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ xxd /sys/kernel/debug/visl/bitstream/bitstream0
|
||||
00000000: 2601 af04 d088 bc25 a173 0e41 a4f2 3274 &......%.s.A..2t
|
||||
00000010: c668 cb28 e775 b4ac f53a ba60 f8fd 3aa1 .h.(.u...:.`..:.
|
||||
00000020: 46b4 bcfc 506c e227 2372 e5f5 d7ea 579f F...Pl.'#r....W.
|
||||
00000030: 6371 5eb5 0eb8 23b5 ca6a 5de5 983a 19e4 cq^...#..j]..:..
|
||||
00000040: e8c3 4320 b4ba a226 cbc1 4138 3a12 32d6 ..C ...&..A8:.2.
|
||||
00000050: fef3 247b 3523 4e90 9682 ac8e eb0c a389 ..${5#N.........
|
||||
00000060: ddd0 6cfc 0187 0e20 7aae b15b 1812 3d33 ..l.... z..[..=3
|
||||
00000070: e1c5 f425 a83a 00b7 4f18 8127 3c4c aefb ...%.:..O..'<L..
|
||||
|
||||
For the buffer with sequence == 1:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ xxd /sys/kernel/debug/visl/bitstream/bitstream1
|
||||
00000000: 0201 d021 49e1 0c40 aa11 1449 14a6 01dc ...!I..@...I....
|
||||
00000010: 7023 889a c8cd 2cd0 13b4 dab0 e8ca 21fe p#....,.......!.
|
||||
00000020: c4c8 ab4c 486e 4e2f b0df 96cc c74e 8dde ...LHnN/.....N..
|
||||
00000030: 8ce7 ee36 d880 4095 4d64 30a0 ff4f 0c5e ...6..@.Md0..O.^
|
||||
00000040: f16b a6a1 d806 ca2a 0ece a673 7bea 1f37 .k.....*...s{..7
|
||||
00000050: 370f 5bb9 1dc4 ba21 6434 bc53 0173 cba0 7.[....!d4.S.s..
|
||||
00000060: dfe6 bc99 01ea b6e0 346b 92b5 c8de 9f5d ........4k.....]
|
||||
00000070: e7cc 3484 1769 fef2 a693 a945 2c8b 31da ..4..i.....E,.1.
|
||||
|
||||
And so on.
|
||||
|
||||
By default, the files are removed during STREAMOFF. This is to reduce the amount
|
||||
of clutter.
|
@ -392,7 +392,7 @@ Which one is returned depends on the chosen channel, each next valid channel
|
||||
will cycle through the possible audio subchannel combinations. This allows
|
||||
you to test the various combinations by just switching channels.
|
||||
|
||||
Finally, for these inputs the v4l2_timecode struct is filled in in the
|
||||
Finally, for these inputs the v4l2_timecode struct is filled in the
|
||||
dequeued v4l2_buffer struct.
|
||||
|
||||
|
||||
|
@ -88,6 +88,9 @@ comma (","). ::
|
||||
│ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
|
||||
│ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low
|
||||
│ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
|
||||
│ │ │ │ │ │ │ tried_regions/
|
||||
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
|
||||
│ │ │ │ │ │ │ │ ...
|
||||
│ │ │ │ │ │ ...
|
||||
│ │ │ │ ...
|
||||
│ │ ...
|
||||
@ -125,7 +128,14 @@ in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the
|
||||
user inputs in the sysfs files except ``state`` file again. Writing
|
||||
``update_schemes_stats`` to ``state`` file updates the contents of stats files
|
||||
for each DAMON-based operation scheme of the kdamond. For details of the
|
||||
stats, please refer to :ref:`stats section <sysfs_schemes_stats>`.
|
||||
stats, please refer to :ref:`stats section <sysfs_schemes_stats>`. Writing
|
||||
``update_schemes_tried_regions`` to ``state`` file updates the DAMON-based
|
||||
operation scheme action tried regions directory for each DAMON-based operation
|
||||
scheme of the kdamond. Writing ``clear_schemes_tried_regions`` to ``state``
|
||||
file clears the DAMON-based operation scheme action tried regions directory for
|
||||
each DAMON-based operation scheme of the kdamond. For details of the
|
||||
DAMON-based operation scheme action tried regions directory, please refer to
|
||||
:ref:`tried_regions section <sysfs_schemes_tried_regions>`.
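For example, the tried regions directories of every scheme of the first kdamond
could be populated and later cleared like below (a sketch assuming the usual
``/sys/kernel/mm/damon/admin`` location and kdamond index ``0``)::

    # echo update_schemes_tried_regions > /sys/kernel/mm/damon/admin/kdamonds/0/state
    # ... read the tried_regions/ files of interest ...
    # echo clear_schemes_tried_regions > /sys/kernel/mm/damon/admin/kdamonds/0/state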
|
||||
|
||||
If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread.
|
||||
|
||||
@ -166,6 +176,8 @@ You can set and get what type of monitoring operations DAMON will use for the
|
||||
context by writing one of the keywords listed in ``avail_operations`` file and
|
||||
reading from the ``operations`` file.
|
||||
|
||||
.. _sysfs_monitoring_attrs:
|
||||
|
||||
contexts/<N>/monitoring_attrs/
|
||||
------------------------------
|
||||
|
||||
@ -235,6 +247,9 @@ In each region directory, you will find two files (``start`` and ``end``). You
|
||||
can set and get the start and end addresses of the initial monitoring target
|
||||
region by writing to and reading from the files, respectively.
|
||||
|
||||
Each region should not overlap with others. ``end`` of directory ``N`` should
|
||||
be equal to or smaller than ``start`` of directory ``N+1``.
|
||||
|
||||
contexts/<N>/schemes/
|
||||
---------------------
|
||||
|
||||
@ -252,8 +267,9 @@ to ``N-1``. Each directory represents each DAMON-based operation scheme.
|
||||
schemes/<N>/
|
||||
------------
|
||||
|
||||
In each scheme directory, four directories (``access_pattern``, ``quotas``,
|
||||
``watermarks``, and ``stats``) and one file (``action``) exist.
|
||||
In each scheme directory, five directories (``access_pattern``, ``quotas``,
|
||||
``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``)
|
||||
exist.
|
||||
|
||||
The ``action`` file is for setting and getting what action you want to apply to
|
||||
memory regions having specific access pattern of the interest. The keywords
|
||||
@ -348,6 +364,32 @@ should ask DAMON sysfs interface to updte the content of the files for the
|
||||
stats by writing a special keyword, ``update_schemes_stats`` to the relevant
|
||||
``kdamonds/<N>/state`` file.
|
||||
|
||||
.. _sysfs_schemes_tried_regions:
|
||||
|
||||
schemes/<N>/tried_regions/
|
||||
--------------------------
|
||||
|
||||
When a special keyword, ``update_schemes_tried_regions``, is written to the
|
||||
relevant ``kdamonds/<N>/state`` file, DAMON creates directories named integer
|
||||
starting from ``0`` under this directory. Each directory contains files
|
||||
exposing detailed information about each of the memory regions that the
|
||||
corresponding scheme's ``action`` has tried to be applied to, under this directory,
|
||||
during the next :ref:`aggregation interval <sysfs_monitoring_attrs>`. The
|
||||
information includes address range, ``nr_accesses``, and ``age`` of the
|
||||
region.
|
||||
|
||||
The directories will be removed when another special keyword,
|
||||
``clear_schemes_tried_regions``, is written to the relevant
|
||||
``kdamonds/<N>/state`` file.
|
||||
|
||||
tried_regions/<N>/
|
||||
------------------
|
||||
|
||||
In each region directory, you will find four files (``start``, ``end``,
|
||||
``nr_accesses``, and ``age``). Reading the files will show the start and end
|
||||
addresses, ``nr_accesses``, and ``age`` of the region that the corresponding
|
||||
DAMON-based operation scheme ``action`` has tried to be applied to.
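For example (a sketch assuming the usual ``/sys/kernel/mm/damon/admin`` location
and that at least one tried region has been recorded for the first scheme of the
first kdamond)::

    # cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/tried_regions/0
    # cat start end nr_accesses age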
|
||||
|
||||
Example
|
||||
~~~~~~~
|
||||
|
||||
@ -465,8 +507,9 @@ regions in case of physical memory monitoring. Therefore, users should set the
|
||||
monitoring target regions by themselves.
|
||||
|
||||
In such cases, users can explicitly set the initial monitoring target regions
|
||||
as they want, by writing proper values to the ``init_regions`` file. Each line
|
||||
of the input should represent one region in below form.::
|
||||
as they want, by writing proper values to the ``init_regions`` file. The input
|
||||
should be a sequence of three integers separated by white spaces that represent
|
||||
one region in the below form::
|
||||
|
||||
<target idx> <start address> <end address>
|
||||
|
||||
@ -481,9 +524,9 @@ ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one
|
||||
# cd <debugfs>/damon
|
||||
# cat target_ids
|
||||
42 4242
|
||||
# echo "0 1 100
|
||||
0 100 200
|
||||
1 20 40
|
||||
# echo "0 1 100 \
|
||||
0 100 200 \
|
||||
1 20 40 \
|
||||
1 50 100" > init_regions
|
||||
|
||||
Note that this sets the initial monitoring target regions only. In case of
|
||||
|
@ -14,13 +14,7 @@ for potentially reduced swap I/O. This trade-off can also result in a
|
||||
significant performance improvement if reads from the compressed cache are
|
||||
faster than reads from a swap device.
|
||||
|
||||
.. note::
|
||||
Zswap is a new feature as of v3.11 and interacts heavily with memory
|
||||
reclaim. This interaction has not been fully explored on the large set of
|
||||
potential configurations and workloads that exist. For this reason, zswap
|
||||
is a work in progress and should be considered experimental.
|
||||
|
||||
Some potential benefits:
|
||||
Some potential benefits:
|
||||
|
||||
* Desktop/laptop users with limited RAM capacities can mitigate the
|
||||
performance impact of swapping.
|
||||
|
@ -15,10 +15,10 @@ HiSilicon PCIe PMU driver
|
||||
The PCIe PMU driver registers a perf PMU with the name of its sicl-id and PCIe
|
||||
Core id.::
|
||||
|
||||
/sys/bus/event_source/hisi_pcie<sicl>_<core>
|
||||
/sys/bus/event_source/hisi_pcie<sicl>_core<core>
|
||||
|
||||
PMU driver provides description of available events and filter options in sysfs,
|
||||
see /sys/bus/event_source/devices/hisi_pcie<sicl>_<core>.
|
||||
see /sys/bus/event_source/devices/hisi_pcie<sicl>_core<core>.
|
||||
|
||||
The "format" directory describes all formats of the config (events) and config1
|
||||
(filter options) fields of the perf_event_attr structure. The "events" directory
|
||||
@ -33,13 +33,13 @@ monitored by PMU.
|
||||
Example usage of perf::
|
||||
|
||||
$# perf list
|
||||
hisi_pcie0_0/rx_mwr_latency/ [kernel PMU event]
|
||||
hisi_pcie0_0/rx_mwr_cnt/ [kernel PMU event]
|
||||
hisi_pcie0_core0/rx_mwr_latency/ [kernel PMU event]
|
||||
hisi_pcie0_core0/rx_mwr_cnt/ [kernel PMU event]
|
||||
------------------------------------------
|
||||
|
||||
$# perf stat -e hisi_pcie0_0/rx_mwr_latency/
|
||||
$# perf stat -e hisi_pcie0_0/rx_mwr_cnt/
|
||||
$# perf stat -g -e hisi_pcie0_0/rx_mwr_latency/ -e hisi_pcie0_0/rx_mwr_cnt/
|
||||
$# perf stat -e hisi_pcie0_core0/rx_mwr_latency/
|
||||
$# perf stat -e hisi_pcie0_core0/rx_mwr_cnt/
|
||||
$# perf stat -g -e hisi_pcie0_core0/rx_mwr_latency/ -e hisi_pcie0_core0/rx_mwr_cnt/
|
||||
|
||||
The current driver does not support sampling. So "perf record" is unsupported.
|
||||
Also, attaching to a task is unsupported for the PCIe PMU.
|
||||
@ -48,59 +48,83 @@ Filter options
|
||||
--------------
|
||||
|
||||
1. Target filter
|
||||
PMU could only monitor the performance of traffic downstream target Root Ports
|
||||
or downstream target Endpoint. PCIe PMU driver support "port" and "bdf"
|
||||
interfaces for users, and these two interfaces aren't supported at the same
|
||||
time.
|
||||
|
||||
-port
|
||||
"port" filter can be used in all PCIe PMU events, target Root Port can be
|
||||
selected by configuring the 16-bits-bitmap "port". Multi ports can be selected
|
||||
for AP-layer-events, and only one port can be selected for TL/DL-layer-events.
|
||||
The PMU can only monitor the performance of traffic downstream of target Root
|
||||
Ports or a downstream target Endpoint. The PCIe PMU driver supports "port" and
|
||||
"bdf" interfaces for users, and these two interfaces aren't supported at the
|
||||
same time.
|
||||
|
||||
For example, if target Root Port is 0000:00:00.0 (x8 lanes), bit0 of bitmap
|
||||
should be set, port=0x1; if target Root Port is 0000:00:04.0 (x4 lanes),
|
||||
bit8 is set, port=0x100; if these two Root Ports are both monitored, port=0x101.
|
||||
- port
|
||||
|
||||
Example usage of perf::
|
||||
"port" filter can be used in all PCIe PMU events, target Root Port can be
|
||||
selected by configuring the 16-bit bitmap "port". Multiple ports can be
|
||||
selected for AP-layer-events, and only one port can be selected for
|
||||
TL/DL-layer-events.
|
||||
|
||||
$# perf stat -e hisi_pcie0_0/rx_mwr_latency,port=0x1/ sleep 5
|
||||
For example, if target Root Port is 0000:00:00.0 (x8 lanes), bit0 of
|
||||
bitmap should be set, port=0x1; if target Root Port is 0000:00:04.0 (x4
|
||||
lanes), bit8 is set, port=0x100; if these two Root Ports are both
|
||||
monitored, port=0x101.
|
||||
|
||||
-bdf
|
||||
Example usage of perf::
|
||||
|
||||
"bdf" filter can only be used in bandwidth events, target Endpoint is selected
|
||||
by configuring BDF to "bdf". Counter only counts the bandwidth of message
|
||||
requested by target Endpoint.
|
||||
$# perf stat -e hisi_pcie0_core0/rx_mwr_latency,port=0x1/ sleep 5
|
||||
|
||||
For example, "bdf=0x3900" means BDF of target Endpoint is 0000:39:00.0.
|
||||
- bdf
|
||||
|
||||
Example usage of perf::
|
||||
"bdf" filter can only be used in bandwidth events, target Endpoint is
|
||||
selected by configuring BDF to "bdf". Counter only counts the bandwidth of
|
||||
message requested by target Endpoint.
|
||||
|
||||
$# perf stat -e hisi_pcie0_0/rx_mrd_flux,bdf=0x3900/ sleep 5
|
||||
For example, "bdf=0x3900" means BDF of target Endpoint is 0000:39:00.0.
|
||||
|
||||
Example usage of perf::
|
||||
|
||||
$# perf stat -e hisi_pcie0_core0/rx_mrd_flux,bdf=0x3900/ sleep 5
|
||||
|
||||
2. Trigger filter
|
||||
Event statistics start when the first time TLP length is greater/smaller
|
||||
than trigger condition. You can set the trigger condition by writing "trig_len",
|
||||
and set the trigger mode by writing "trig_mode". This filter can only be used
|
||||
in bandwidth events.
|
||||
|
||||
For example, "trig_len=4" means trigger condition is 2^4 DW, "trig_mode=0"
|
||||
means statistics start when TLP length > trigger condition, "trig_mode=1"
|
||||
means start when TLP length < condition.
|
||||
Event statistics start the first time the TLP length is greater/smaller
|
||||
than the trigger condition. You can set the trigger condition by writing
|
||||
"trig_len", and set the trigger mode by writing "trig_mode". This filter can
|
||||
only be used in bandwidth events.
|
||||
|
||||
Example usage of perf::
|
||||
For example, "trig_len=4" means trigger condition is 2^4 DW, "trig_mode=0"
|
||||
means statistics start when TLP length > trigger condition, "trig_mode=1"
|
||||
means start when TLP length < condition.
|
||||
|
||||
$# perf stat -e hisi_pcie0_0/rx_mrd_flux,trig_len=0x4,trig_mode=1/ sleep 5
|
||||
Example usage of perf::
|
||||
|
||||
$# perf stat -e hisi_pcie0_core0/rx_mrd_flux,trig_len=0x4,trig_mode=1/ sleep 5
|
||||
|
||||
3. Threshold filter
|
||||
Counter counts when TLP length within the specified range. You can set the
|
||||
threshold by writing "thr_len", and set the threshold mode by writing
|
||||
"thr_mode". This filter can only be used in bandwidth events.
|
||||
|
||||
For example, "thr_len=4" means threshold is 2^4 DW, "thr_mode=0" means
|
||||
counter counts when TLP length >= threshold, and "thr_mode=1" means counts
|
||||
when TLP length < threshold.
|
||||
The counter counts when the TLP length is within the specified range. You can set the
|
||||
threshold by writing "thr_len", and set the threshold mode by writing
|
||||
"thr_mode". This filter can only be used in bandwidth events.
|
||||
|
||||
Example usage of perf::
|
||||
For example, "thr_len=4" means threshold is 2^4 DW, "thr_mode=0" means
|
||||
counter counts when TLP length >= threshold, and "thr_mode=1" means counts
|
||||
when TLP length < threshold.
|
||||
|
||||
$# perf stat -e hisi_pcie0_0/rx_mrd_flux,thr_len=0x4,thr_mode=1/ sleep 5
|
||||
Example usage of perf::
|
||||
|
||||
$# perf stat -e hisi_pcie0_core0/rx_mrd_flux,thr_len=0x4,thr_mode=1/ sleep 5
|
||||
|
||||
4. TLP Length filter
|
||||
|
||||
When counting bandwidth, the data can be composed of certain parts of TLP
|
||||
packets. You can specify it through "len_mode":
|
||||
|
||||
- 2'b00: Reserved (Do not use this since the behaviour is undefined)
|
||||
- 2'b01: Bandwidth of TLP payloads
|
||||
- 2'b10: Bandwidth of TLP headers
|
||||
- 2'b11: Bandwidth of both TLP payloads and headers
|
||||
|
||||
For example, "len_mode=2" means only counting the bandwidth of TLP headers
|
||||
and "len_mode=3" means the final bandwidth data is composed of both TLP
|
||||
headers and payloads. Default value if not specified is 2'b11.
|
||||
|
||||
Example usage of perf::
|
||||
|
||||
$# perf stat -e hisi_pcie0_core0/rx_mrd_flux,len_mode=0x1/ sleep 5
|
||||
|
@ -19,3 +19,5 @@ Performance monitor support
|
||||
arm_dsu_pmu
|
||||
thunderx2-pmu
|
||||
alibaba_pmu
|
||||
nvidia-pmu
|
||||
meson-ddr-pmu
|
||||
|
70
Documentation/admin-guide/perf/meson-ddr-pmu.rst
Normal file
@ -0,0 +1,70 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========================================================
|
||||
Amlogic SoC DDR Bandwidth Performance Monitoring Unit (PMU)
|
||||
===========================================================
|
||||
|
||||
The Amlogic Meson G12 SoC contains a bandwidth monitor inside DRAM controller.
|
||||
The monitor includes 4 channels. Each channel can count the requests accessing
|
||||
DRAM. Each channel can count up to 3 AXI ports simultaneously. It can be helpful
|
||||
to show if the performance bottleneck is on DDR bandwidth.
|
||||
|
||||
Currently, this driver supports the following 5 perf events:
|
||||
|
||||
+ meson_ddr_bw/total_rw_bytes/
|
||||
+ meson_ddr_bw/chan_1_rw_bytes/
|
||||
+ meson_ddr_bw/chan_2_rw_bytes/
|
||||
+ meson_ddr_bw/chan_3_rw_bytes/
|
||||
+ meson_ddr_bw/chan_4_rw_bytes/
|
||||
|
||||
meson_ddr_bw/chan_{1,2,3,4}_rw_bytes/ events are channel-specific events.
|
||||
Each channel supports filtering, which lets the channel monitor an
|
||||
individual IP module in the SoC.
|
||||
|
||||
Below are DDR access request event filter keywords:
|
||||
|
||||
+ arm - from CPU
|
||||
+ vpu_read1 - from OSD + VPP read
|
||||
+ gpu - from 3D GPU
|
||||
+ pcie - from PCIe controller
|
||||
+ hdcp - from HDCP controller
|
||||
+ hevc_front - from HEVC codec front end
|
||||
+ usb3_0 - from USB3.0 controller
|
||||
+ hevc_back - from HEVC codec back end
|
||||
+ h265enc - from HEVC encoder
|
||||
+ vpu_read2 - from DI read
|
||||
+ vpu_write1 - from VDIN write
|
||||
+ vpu_write2 - from di write
|
||||
+ vdec - from legacy codec video decoder
|
||||
+ hcodec - from H264 encoder
|
||||
+ ge2d - from ge2d
|
||||
+ spicc1 - from SPI controller 1
|
||||
+ usb0 - from USB2.0 controller 0
|
||||
+ dma - from system DMA controller 1
|
||||
+ arb0 - from arb0
|
||||
+ sd_emmc_b - from SD eMMC b controller
|
||||
+ usb1 - from USB2.0 controller 1
|
||||
+ audio - from Audio module
|
||||
+ sd_emmc_c - from SD eMMC c controller
|
||||
+ spicc2 - from SPI controller 2
|
||||
+ ethernet - from Ethernet controller
|
||||
|
||||
|
||||
Examples:
|
||||
|
||||
+ Show the total DDR bandwidth per second:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
perf stat -a -e meson_ddr_bw/total_rw_bytes/ -I 1000 sleep 10
|
||||
|
||||
|
||||
+ Show individual DDR bandwidth from CPU and GPU respectively, as well as
|
||||
sum of them:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
perf stat -a -e meson_ddr_bw/chan_1_rw_bytes,arm=1/ -I 1000 sleep 10
|
||||
perf stat -a -e meson_ddr_bw/chan_2_rw_bytes,gpu=1/ -I 1000 sleep 10
|
||||
perf stat -a -e meson_ddr_bw/chan_3_rw_bytes,arm=1,gpu=1/ -I 1000 sleep 10
|
||||
|
299
Documentation/admin-guide/perf/nvidia-pmu.rst
Normal file
@ -0,0 +1,299 @@
|
||||
=========================================================
|
||||
NVIDIA Tegra SoC Uncore Performance Monitoring Unit (PMU)
|
||||
=========================================================
|
||||
|
||||
The NVIDIA Tegra SoC includes various system PMUs to measure key performance
|
||||
metrics like memory bandwidth, latency, and utilization:
|
||||
|
||||
* Scalable Coherency Fabric (SCF)
|
||||
* NVLink-C2C0
|
||||
* NVLink-C2C1
|
||||
* CNVLink
|
||||
* PCIE
|
||||
|
||||
PMU Driver
|
||||
----------
|
||||
|
||||
The PMUs in this document are based on ARM CoreSight PMU Architecture as
|
||||
described in document: ARM IHI 0091. Since this is a standard architecture, the
|
||||
PMUs are managed by a common driver "arm-cs-arch-pmu". This driver describes
|
||||
the available events and configuration of each PMU in sysfs. Please see the
|
||||
sections below to get the sysfs path of each PMU. Like other uncore PMU drivers,
|
||||
the driver provides a "cpumask" sysfs attribute to show the CPU id used to handle
|
||||
the PMU event. There is also an "associated_cpus" sysfs attribute, which contains a
|
||||
list of CPUs associated with the PMU instance.
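For instance, the handling CPU and the associated CPUs of the socket 0 SCF PMU
could be read back as follows (a sketch; the sysfs path follows the pattern
given in the SCF PMU section below)::

    cat /sys/bus/event_sources/devices/nvidia_scf_pmu_0/cpumask
    cat /sys/bus/event_sources/devices/nvidia_scf_pmu_0/associated_cpus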
|
||||
|
||||
.. _SCF_PMU_Section:
|
||||
|
||||
SCF PMU
|
||||
-------
|
||||
|
||||
The SCF PMU monitors system level cache events, CPU traffic, and
|
||||
strongly-ordered (SO) PCIE write traffic to local/remote memory. Please see
|
||||
:ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section` for more info about the PMU
|
||||
traffic coverage.
|
||||
|
||||
The events and configuration options of this PMU device are described in sysfs,
|
||||
see /sys/bus/event_sources/devices/nvidia_scf_pmu_<socket-id>.
|
||||
|
||||
Example usage:
|
||||
|
||||
* Count event id 0x0 in socket 0::
|
||||
|
||||
perf stat -a -e nvidia_scf_pmu_0/event=0x0/
|
||||
|
||||
* Count event id 0x0 in socket 1::
|
||||
|
||||
perf stat -a -e nvidia_scf_pmu_1/event=0x0/
|
||||
|
||||
NVLink-C2C0 PMU
|
||||
--------------------
|
||||
|
||||
The NVLink-C2C0 PMU monitors incoming traffic from a GPU/CPU connected with
|
||||
NVLink-C2C (Chip-2-Chip) interconnect. The type of traffic captured by this PMU
|
||||
varies depending on the chip configuration:
|
||||
|
||||
* NVIDIA Grace Hopper Superchip: Hopper GPU is connected with Grace SoC.
|
||||
|
||||
In this config, the PMU captures GPU ATS translated or EGM traffic from the GPU.
|
||||
|
||||
* NVIDIA Grace CPU Superchip: two Grace CPU SoCs are connected.
|
||||
|
||||
In this config, the PMU captures read and relaxed ordered (RO) writes from
|
||||
PCIE device of the remote SoC.
|
||||
|
||||
Please see :ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section` for more info about
|
||||
the PMU traffic coverage.
|
||||
|
||||
The events and configuration options of this PMU device are described in sysfs,
|
||||
see /sys/bus/event_sources/devices/nvidia_nvlink_c2c0_pmu_<socket-id>.
|
||||
|
||||
Example usage:
|
||||
|
||||
* Count event id 0x0 from the GPU/CPU connected with socket 0::
|
||||
|
||||
perf stat -a -e nvidia_nvlink_c2c0_pmu_0/event=0x0/
|
||||
|
||||
* Count event id 0x0 from the GPU/CPU connected with socket 1::
|
||||
|
||||
perf stat -a -e nvidia_nvlink_c2c0_pmu_1/event=0x0/
|
||||
|
||||
* Count event id 0x0 from the GPU/CPU connected with socket 2::
|
||||
|
||||
perf stat -a -e nvidia_nvlink_c2c0_pmu_2/event=0x0/
|
||||
|
||||
* Count event id 0x0 from the GPU/CPU connected with socket 3::
|
||||
|
||||
perf stat -a -e nvidia_nvlink_c2c0_pmu_3/event=0x0/
|
||||
|
||||
NVLink-C2C1 PMU
|
||||
-------------------
|
||||
|
||||
The NVLink-C2C1 PMU monitors incoming traffic from a GPU connected with
|
||||
NVLink-C2C (Chip-2-Chip) interconnect. This PMU captures untranslated GPU
|
||||
traffic, in contrast with the NVLink-C2C0 PMU that captures ATS translated traffic.
|
||||
Please see :ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section` for more info about
|
||||
the PMU traffic coverage.
|
||||
|
||||
The events and configuration options of this PMU device are described in sysfs,
|
||||
see /sys/bus/event_sources/devices/nvidia_nvlink_c2c1_pmu_<socket-id>.
|
||||
|
||||
Example usage:
|
||||
|
||||
* Count event id 0x0 from the GPU connected with socket 0::
|
||||
|
||||
perf stat -a -e nvidia_nvlink_c2c1_pmu_0/event=0x0/
|
||||
|
||||
* Count event id 0x0 from the GPU connected with socket 1::
|
||||
|
||||
perf stat -a -e nvidia_nvlink_c2c1_pmu_1/event=0x0/
|
||||
|
||||
* Count event id 0x0 from the GPU connected with socket 2::
|
||||
|
||||
perf stat -a -e nvidia_nvlink_c2c1_pmu_2/event=0x0/
|
||||
|
||||
* Count event id 0x0 from the GPU connected with socket 3::
|
||||
|
||||
perf stat -a -e nvidia_nvlink_c2c1_pmu_3/event=0x0/
|
||||
|
||||
CNVLink PMU
|
||||
---------------
|
||||
|
||||
The CNVLink PMU monitors traffic from GPU and PCIE device on remote sockets
|
||||
to local memory. For PCIE traffic, this PMU captures read and relaxed ordered
|
||||
(RO) write traffic. Please see :ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section`
|
||||
for more info about the PMU traffic coverage.
|
||||
|
||||
The events and configuration options of this PMU device are described in sysfs,
|
||||
see /sys/bus/event_sources/devices/nvidia_cnvlink_pmu_<socket-id>.
|
||||
|
||||
Each SoC socket can be connected to one or more sockets via CNVLink. The user can
|
||||
use "rem_socket" bitmap parameter to select the remote socket(s) to monitor.
|
||||
Each bit represents the socket number, e.g. "rem_socket=0xE" corresponds to
|
||||
socket 1 to 3.
|
||||
/sys/bus/event_sources/devices/nvidia_cnvlink_pmu_<socket-id>/format/rem_socket
|
||||
shows the valid bits that can be set in the "rem_socket" parameter.
|
||||
|
||||
The PMU cannot distinguish the remote traffic initiator, therefore it does not
|
||||
provide a filter to select the traffic source to monitor. It reports combined
|
||||
traffic from remote GPU and PCIE devices.
|
||||
|
||||
Example usage:
|
||||
|
||||
* Count event id 0x0 for the traffic from remote socket 1, 2, and 3 to socket 0::
|
||||
|
||||
perf stat -a -e nvidia_cnvlink_pmu_0/event=0x0,rem_socket=0xE/
|
||||
|
||||
* Count event id 0x0 for the traffic from remote socket 0, 2, and 3 to socket 1::
|
||||
|
||||
perf stat -a -e nvidia_cnvlink_pmu_1/event=0x0,rem_socket=0xD/
|
||||
|
||||
* Count event id 0x0 for the traffic from remote socket 0, 1, and 3 to socket 2::
|
||||
|
||||
perf stat -a -e nvidia_cnvlink_pmu_2/event=0x0,rem_socket=0xB/
|
||||
|
||||
* Count event id 0x0 for the traffic from remote socket 0, 1, and 2 to socket 3::
|
||||
|
||||
perf stat -a -e nvidia_cnvlink_pmu_3/event=0x0,rem_socket=0x7/
|
||||
|
||||
|
||||
PCIE PMU
|
||||
------------
|
||||
|
||||
The PCIE PMU monitors all read/write traffic from PCIE root ports to
|
||||
local/remote memory. Please see :ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section`
|
||||
for more info about the PMU traffic coverage.
|
||||
|
||||
The events and configuration options of this PMU device are described in sysfs,
|
||||
see /sys/bus/event_sources/devices/nvidia_pcie_pmu_<socket-id>.
|
||||
|
||||
Each SoC socket can support multiple root ports. The user can use
|
||||
"root_port" bitmap parameter to select the port(s) to monitor, i.e.
|
||||
"root_port=0xF" corresponds to root port 0 to 3.
|
||||
/sys/bus/event_sources/devices/nvidia_pcie_pmu_<socket-id>/format/root_port
|
||||
shows the valid bits that can be set in the "root_port" parameter.
|
||||
|
||||
Example usage:
|
||||
|
||||
* Count event id 0x0 from root port 0 and 1 of socket 0::
|
||||
|
||||
perf stat -a -e nvidia_pcie_pmu_0/event=0x0,root_port=0x3/
|
||||
|
||||
* Count event id 0x0 from root port 0 and 1 of socket 1::
|
||||
|
||||
perf stat -a -e nvidia_pcie_pmu_1/event=0x0,root_port=0x3/
|
||||
|
||||
.. _NVIDIA_Uncore_PMU_Traffic_Coverage_Section:
|
||||
|
||||
Traffic Coverage
|
||||
----------------
|
||||
|
||||
The PMU traffic coverage may vary depending on the chip configuration:
|
||||
|
||||
* **NVIDIA Grace Hopper Superchip**: Hopper GPU is connected with Grace SoC.
|
||||
|
||||
Example configuration with two Grace SoCs::
|
||||
|
||||
********************************* *********************************
|
||||
* SOCKET-A * * SOCKET-B *
|
||||
* * * *
|
||||
* :::::::: * * :::::::: *
|
||||
* : PCIE : * * : PCIE : *
|
||||
* :::::::: * * :::::::: *
|
||||
* | * * | *
|
||||
* | * * | *
|
||||
* ::::::: ::::::::: * * ::::::::: ::::::: *
|
||||
* : : : : * * : : : : *
|
||||
* : GPU :<--NVLink-->: Grace :<---CNVLink--->: Grace :<--NVLink-->: GPU : *
|
||||
* : : C2C : SoC : * * : SoC : C2C : : *
|
||||
* ::::::: ::::::::: * * ::::::::: ::::::: *
|
||||
* | | * * | | *
|
||||
* | | * * | | *
|
||||
* &&&&&&&& &&&&&&&& * * &&&&&&&& &&&&&&&& *
|
||||
* & GMEM & & CMEM & * * & CMEM & & GMEM & *
|
||||
* &&&&&&&& &&&&&&&& * * &&&&&&&& &&&&&&&& *
|
||||
* * * *
|
||||
********************************* *********************************
|
||||
|
||||
GMEM = GPU Memory (e.g. HBM)
|
||||
CMEM = CPU Memory (e.g. LPDDR5X)
|
||||
|
||||
|
|
||||
| The following table contains the traffic coverage of the Grace SoC PMU in socket-A:
|
||||
|
||||
::
|
||||
|
||||
+--------------+-------+-----------+-----------+-----+----------+----------+
|
||||
| | Source |
|
||||
+ +-------+-----------+-----------+-----+----------+----------+
|
||||
| Destination | |GPU ATS |GPU Not-ATS| | Socket-B | Socket-B |
|
||||
| |PCI R/W|Translated,|Translated | CPU | CPU/PCIE1| GPU/PCIE2|
|
||||
| | |EGM | | | | |
|
||||
+==============+=======+===========+===========+=====+==========+==========+
|
||||
| Local | PCIE |NVLink-C2C0|NVLink-C2C1| SCF | SCF PMU | CNVLink |
|
||||
| SYSRAM/CMEM | PMU |PMU |PMU | PMU | | PMU |
|
||||
+--------------+-------+-----------+-----------+-----+----------+----------+
|
||||
| Local GMEM | PCIE | N/A |NVLink-C2C1| SCF | SCF PMU | CNVLink |
|
||||
| | PMU | |PMU | PMU | | PMU |
|
||||
+--------------+-------+-----------+-----------+-----+----------+----------+
|
||||
| Remote | PCIE |NVLink-C2C0|NVLink-C2C1| SCF | | |
|
||||
| SYSRAM/CMEM | PMU |PMU |PMU | PMU | N/A | N/A |
|
||||
| over CNVLink | | | | | | |
|
||||
+--------------+-------+-----------+-----------+-----+----------+----------+
|
||||
| Remote GMEM | PCIE |NVLink-C2C0|NVLink-C2C1| SCF | | |
|
||||
| over CNVLink | PMU |PMU |PMU | PMU | N/A | N/A |
|
||||
+--------------+-------+-----------+-----------+-----+----------+----------+
|
||||
|
||||
PCIE1 traffic represents strongly ordered (SO) writes.
|
||||
PCIE2 traffic represents reads and relaxed ordered (RO) writes.
|
||||
|
||||
* **NVIDIA Grace CPU Superchip**: two Grace CPU SoCs are connected.
|
||||
|
||||
Example configuration with two Grace SoCs::
|
||||
|
||||
******************* *******************
|
||||
* SOCKET-A * * SOCKET-B *
|
||||
* * * *
|
||||
* :::::::: * * :::::::: *
|
||||
* : PCIE : * * : PCIE : *
|
||||
* :::::::: * * :::::::: *
|
||||
* | * * | *
|
||||
* | * * | *
|
||||
* ::::::::: * * ::::::::: *
|
||||
* : : * * : : *
|
||||
* : Grace :<--------NVLink------->: Grace : *
|
||||
* : SoC : * C2C * : SoC : *
|
||||
* ::::::::: * * ::::::::: *
|
||||
* | * * | *
|
||||
* | * * | *
|
||||
* &&&&&&&& * * &&&&&&&& *
|
||||
* & CMEM & * * & CMEM & *
|
||||
* &&&&&&&& * * &&&&&&&& *
|
||||
* * * *
|
||||
******************* *******************
|
||||
|
||||
GMEM = GPU Memory (e.g. HBM)
|
||||
CMEM = CPU Memory (e.g. LPDDR5X)
|
||||
|
||||
|
|
||||
| The following table contains the traffic coverage of the Grace SoC PMU in socket-A:
|
||||
|
||||
::
|
||||
|
||||
+-----------------+-----------+---------+----------+-------------+
|
||||
| | Source |
|
||||
+ +-----------+---------+----------+-------------+
|
||||
| Destination | | | Socket-B | Socket-B |
|
||||
| | PCI R/W | CPU | CPU/PCIE1| PCIE2 |
|
||||
| | | | | |
|
||||
+=================+===========+=========+==========+=============+
|
||||
| Local | PCIE PMU | SCF PMU | SCF PMU | NVLink-C2C0 |
|
||||
| SYSRAM/CMEM | | | | PMU |
|
||||
+-----------------+-----------+---------+----------+-------------+
|
||||
| Remote | | | | |
|
||||
| SYSRAM/CMEM | PCIE PMU | SCF PMU | N/A | N/A |
|
||||
| over NVLink-C2C | | | | |
|
||||
+-----------------+-----------+---------+----------+-------------+
|
||||
|
||||
PCIE1 traffic represents strongly ordered (SO) writes.
|
||||
PCIE2 traffic represents reads and relaxed ordered (RO) writes.
|
@ -283,23 +283,19 @@ efficiency frequency management method on AMD processors.
|
||||
Kernel Module Options for ``amd-pstate``
|
||||
=========================================
|
||||
|
||||
.. _shared_mem:
|
||||
Passive Mode
|
||||
------------
|
||||
|
||||
``shared_mem``
|
||||
Use a module param (shared_mem) to enable related processors manually with
|
||||
**amd_pstate.shared_mem=1**.
|
||||
Due to the performance issue on the processors with `Shared Memory Support
|
||||
<perf_cap_>`_, we disable it presently and will re-enable this by default
|
||||
once we address performance issue with this solution.
|
||||
``amd_pstate=passive``
|
||||
|
||||
To check whether the current processor is using `Full MSR Support <perf_cap_>`_
|
||||
or `Shared Memory Support <perf_cap_>`_ : ::
|
||||
|
||||
ray@hr-test1:~$ lscpu | grep cppc
|
||||
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm
|
||||
|
||||
If the CPU flags have ``cppc``, then this processor supports `Full MSR Support
|
||||
<perf_cap_>`_. Otherwise, it supports `Shared Memory Support <perf_cap_>`_.
|
||||
It will be enabled if the ``amd_pstate=passive`` is passed to the kernel in the command line.
|
||||
In this mode, ``amd_pstate`` driver software specifies a desired QoS target in the CPPC
|
||||
performance scale as a relative number. This can be expressed as percentage of nominal
|
||||
performance (infrastructure max). Below the nominal sustained performance level,
|
||||
desired performance expresses the average performance level of the processor subject
|
||||
to the Performance Reduction Tolerance register. Above the nominal performance level,
|
||||
the processor must provide at least the nominal performance requested and go higher if current
|
||||
operating conditions allow.
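After booting with ``amd_pstate=passive``, the driver that is actually in
control can be verified through the standard cpufreq sysfs interface (a sketch;
the shown output assumes the driver registered successfully)::

    $ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver
    amd-pstate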
|
||||
|
||||
|
||||
``cpupower`` tool support for ``amd-pstate``
|
||||
@ -409,37 +405,55 @@ Unit Tests for amd-pstate
|
||||
|
||||
1. Test case descriptions
|
||||
|
||||
1). Basic tests
|
||||
|
||||
Test prerequisite and basic functions for the ``amd-pstate`` driver.
|
||||
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
| Index | Functions | Description |
|
||||
+=========+================================+====================================================================================+
|
||||
| 0 | amd_pstate_ut_acpi_cpc_valid || Check whether the _CPC object is present in SBIOS. |
|
||||
| 1 | amd_pstate_ut_acpi_cpc_valid || Check whether the _CPC object is present in SBIOS. |
|
||||
| | || |
|
||||
| | || The detail refer to `Processor Support <processor_support_>`_. |
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
| 1 | amd_pstate_ut_check_enabled || Check whether AMD P-State is enabled. |
|
||||
| 2 | amd_pstate_ut_check_enabled || Check whether AMD P-State is enabled. |
|
||||
| | || |
|
||||
| | || AMD P-States and ACPI hardware P-States always can be supported in one processor. |
|
||||
| | | But AMD P-States has the higher priority and if it is enabled with |
|
||||
| | | :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond to the |
|
||||
| | | request from AMD P-States. |
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
| 2 | amd_pstate_ut_check_perf || Check if the each performance values are reasonable. |
|
||||
| 3 | amd_pstate_ut_check_perf || Check if the each performance values are reasonable. |
|
||||
| | || highest_perf >= nominal_perf > lowest_nonlinear_perf > lowest_perf > 0. |
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
| 3 | amd_pstate_ut_check_freq || Check if the each frequency values and max freq when set support boost mode |
|
||||
| 4 | amd_pstate_ut_check_freq || Check if the each frequency values and max freq when set support boost mode |
|
||||
| | | are reasonable. |
|
||||
| | || max_freq >= nominal_freq > lowest_nonlinear_freq > min_freq > 0 |
|
||||
| | || If boost is not active but supported, this maximum frequency will be larger than |
|
||||
| | | the one in ``cpuinfo``. |
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
|
||||
2). Tbench test
|
||||
|
||||
Test and monitor the CPU changes when running the tbench benchmark under the specified governor.
These changes include desired performance, frequency, load, performance, energy, etc.
The specified governor is ondemand or schedutil.
Tbench can also be tested on the ``acpi-cpufreq`` kernel driver for comparison.
|
||||
|
||||
3). Gitsource test
|
||||
|
||||
Test and monitor the CPU changes when running the gitsource benchmark under the specified governor.
These changes include desired performance, frequency, load, time, energy, etc.
The specified governor is ondemand or schedutil.
Gitsource can also be tested on the ``acpi-cpufreq`` kernel driver for comparison.
|
||||
|
||||
#. How to execute the tests
|
||||
|
||||
We use a test module in the kselftest framework to implement it.
We create amd-pstate-ut module and tie it into kselftest.(for
We create the ``amd-pstate-ut`` module and tie it into kselftest (for
details, refer to Linux Kernel Selftests [4]_).
|
||||
|
||||
1. Build
|
||||
1). Build
|
||||
|
||||
+ open the :c:macro:`CONFIG_X86_AMD_PSTATE` configuration option.
|
||||
+ set the :c:macro:`CONFIG_X86_AMD_PSTATE_UT` configuration option to M.
|
||||
@ -449,23 +463,159 @@ Unit Tests for amd-pstate
|
||||
$ cd linux
|
||||
$ make -C tools/testing/selftests
|
||||
|
||||
#. Installation & Steps ::
|
||||
+ make perf ::
|
||||
|
||||
$ cd tools/perf/
|
||||
$ make
|
||||
|
||||
|
||||
2). Installation & Steps ::
|
||||
|
||||
$ make -C tools/testing/selftests install INSTALL_PATH=~/kselftest
|
||||
$ cp tools/perf/perf /usr/bin/perf
|
||||
$ sudo ./kselftest/run_kselftest.sh -c amd-pstate
|
||||
TAP version 13
|
||||
1..1
|
||||
# selftests: amd-pstate: amd-pstate-ut.sh
|
||||
# amd-pstate-ut: ok
|
||||
ok 1 selftests: amd-pstate: amd-pstate-ut.sh
|
||||
|
||||
#. Results ::
|
||||
3). Specified test case ::
|
||||
|
||||
$ dmesg | grep "amd_pstate_ut" | tee log.txt
|
||||
[12977.570663] amd_pstate_ut: 1 amd_pstate_ut_acpi_cpc_valid success!
|
||||
[12977.570673] amd_pstate_ut: 2 amd_pstate_ut_check_enabled success!
|
||||
[12977.571207] amd_pstate_ut: 3 amd_pstate_ut_check_perf success!
|
||||
[12977.571212] amd_pstate_ut: 4 amd_pstate_ut_check_freq success!
|
||||
$ cd ~/kselftest/amd-pstate
|
||||
$ sudo ./run.sh -t basic
|
||||
$ sudo ./run.sh -t tbench
|
||||
$ sudo ./run.sh -t tbench -m acpi-cpufreq
|
||||
$ sudo ./run.sh -t gitsource
|
||||
$ sudo ./run.sh -t gitsource -m acpi-cpufreq
|
||||
$ ./run.sh --help
|
||||
./run.sh: illegal option -- -
|
||||
Usage: ./run.sh [OPTION...]
|
||||
[-h <help>]
|
||||
[-o <output-file-for-dump>]
|
||||
[-c <all: All testing,
|
||||
basic: Basic testing,
|
||||
tbench: Tbench testing,
|
||||
gitsource: Gitsource testing.>]
|
||||
[-t <tbench time limit>]
|
||||
[-p <tbench process number>]
|
||||
[-l <loop times for tbench>]
|
||||
[-i <amd tracer interval>]
|
||||
[-m <comparative test: acpi-cpufreq>]
|
||||
|
||||
|
||||
4). Results
|
||||
|
||||
+ basic
|
||||
|
||||
When you finish the test, you will get the following log info::
|
||||
|
||||
$ dmesg | grep "amd_pstate_ut" | tee log.txt
|
||||
[12977.570663] amd_pstate_ut: 1 amd_pstate_ut_acpi_cpc_valid success!
|
||||
[12977.570673] amd_pstate_ut: 2 amd_pstate_ut_check_enabled success!
|
||||
[12977.571207] amd_pstate_ut: 3 amd_pstate_ut_check_perf success!
|
||||
[12977.571212] amd_pstate_ut: 4 amd_pstate_ut_check_freq success!
|
||||
|
||||
+ tbench
|
||||
|
||||
When you finish the test, you will get ``selftest.tbench.csv`` and PNG images.
The ``selftest.tbench.csv`` file contains the raw data and the drop of the comparative test.
The PNG images show the performance, energy and performance per watt for each test.
Open ``selftest.tbench.csv``:
|
||||
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ Governor | Round | Des-perf | Freq | Load | Performance | Energy | Performance Per Watt |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ Unit | | | GHz | | MB/s | J | MB/J |
|
||||
+=================================================+==============+==========+=========+==========+=============+=========+======================+
|
||||
+ amd-pstate-ondemand | 1 | | | | 2504.05 | 1563.67 | 158.5378 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-ondemand | 2 | | | | 2243.64 | 1430.32 | 155.2941 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-ondemand | 3 | | | | 2183.88 | 1401.32 | 154.2860 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-ondemand | Average | | | | 2310.52 | 1465.1 | 156.1268 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-schedutil | 1 | 165.329 | 1.62257 | 99.798 | 2136.54 | 1395.26 | 151.5971 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-schedutil | 2 | 166 | 1.49761 | 99.9993 | 2100.56 | 1380.5 | 150.6377 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-schedutil | 3 | 166 | 1.47806 | 99.9993 | 2084.12 | 1375.76 | 149.9737 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-schedutil | Average | 165.776 | 1.53275 | 99.9322 | 2107.07 | 1383.84 | 150.7399 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand | 1 | | | | 2529.9 | 1564.4 | 160.0997 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand | 2 | | | | 2249.76 | 1432.97 | 155.4297 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand | 3 | | | | 2181.46 | 1406.88 | 153.5060 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand | Average | | | | 2320.37 | 1468.08 | 156.4741 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil | 1 | | | | 2137.64 | 1385.24 | 152.7723 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil | 2 | | | | 2107.05 | 1372.23 | 152.0138 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil | 3 | | | | 2085.86 | 1365.35 | 151.2433 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil | Average | | | | 2110.18 | 1374.27 | 152.0136 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand VS acpi-cpufreq-schedutil | Comprison(%) | | | | -9.0584 | -6.3899 | -2.8506 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-ondemand VS amd-pstate-schedutil | Comprison(%) | | | | 8.8053 | -5.5463 | -3.4503 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand VS amd-pstate-ondemand | Comprison(%) | | | | -0.4245 | -0.2029 | -0.2219 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil VS amd-pstate-schedutil | Comprison(%) | | | | -0.1473 | 0.6963 | -0.8378 |
|
||||
+-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
|
||||
|
||||
+ gitsource
|
||||
|
||||
When you finish the test, you will get ``selftest.gitsource.csv`` and PNG images.
The ``selftest.gitsource.csv`` file contains the raw data and the drop of the comparative test.
The PNG images show the performance, energy and performance per watt for each test.
Open ``selftest.gitsource.csv``:
|
||||
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ Governor | Round | Des-perf | Freq | Load | Time | Energy | Performance Per Watt |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ Unit | | | GHz | | s | J | 1/J |
|
||||
+=================================================+==============+==========+==========+==========+=============+=========+======================+
|
||||
+ amd-pstate-ondemand | 1 | 50.119 | 2.10509 | 23.3076 | 475.69 | 865.78 | 0.001155027 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-ondemand | 2 | 94.8006 | 1.98771 | 56.6533 | 467.1 | 839.67 | 0.001190944 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-ondemand | 3 | 76.6091 | 2.53251 | 43.7791 | 467.69 | 855.85 | 0.001168429 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-ondemand | Average | 73.8429 | 2.20844 | 41.2467 | 470.16 | 853.767 | 0.001171279 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-schedutil | 1 | 165.919 | 1.62319 | 98.3868 | 464.17 | 866.8 | 0.001153668 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-schedutil | 2 | 165.97 | 1.31309 | 99.5712 | 480.15 | 880.4 | 0.001135847 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-schedutil | 3 | 165.973 | 1.28448 | 99.9252 | 481.79 | 867.02 | 0.001153375 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-schedutil | Average | 165.954 | 1.40692 | 99.2944 | 475.37 | 871.407 | 0.001147569 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand | 1 | | | | 2379.62 | 742.96 | 0.001345967 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand | 2 | | | | 441.74 | 817.49 | 0.001223256 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand | 3 | | | | 455.48 | 820.01 | 0.001219497 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand | Average | | | | 425.613 | 793.487 | 0.001260260 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil | 1 | | | | 459.69 | 838.54 | 0.001192548 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil | 2 | | | | 466.55 | 830.89 | 0.001203528 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil | 3 | | | | 470.38 | 837.32 | 0.001194286 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil | Average | | | | 465.54 | 835.583 | 0.001196769 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand VS acpi-cpufreq-schedutil | Comprison(%) | | | | 9.3810 | 5.3051 | -5.0379 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ amd-pstate-ondemand VS amd-pstate-schedutil | Comprison(%) | 124.7392 | -36.2934 | 140.7329 | 1.1081 | 2.0661 | -2.0242 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-ondemand VS amd-pstate-ondemand | Comprison(%) | | | | 10.4665 | 7.5968 | -7.0605 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
+ acpi-cpufreq-schedutil VS amd-pstate-schedutil | Comprison(%) | | | | 2.1115 | 4.2873 | -4.1110 |
|
||||
+-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
|
||||
|
||||
Reference
|
||||
===========
|
||||
|
@ -2,8 +2,6 @@
|
||||
Documentation for /proc/sys/fs/
|
||||
===============================
|
||||
|
||||
kernel version 2.2.10
|
||||
|
||||
Copyright (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
|
||||
|
||||
Copyright (c) 2009, Shen Feng<shen@cn.fujitsu.com>
|
||||
@ -12,58 +10,40 @@ For general info and legal blurb, please look in intro.rst.
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
This file contains documentation for the sysctl files in
|
||||
/proc/sys/fs/ and is valid for Linux kernel version 2.2.
|
||||
This file contains documentation for the sysctl files and directories
|
||||
in ``/proc/sys/fs/``.
|
||||
|
||||
The files in this directory can be used to tune and monitor
|
||||
miscellaneous and general things in the operation of the Linux
|
||||
kernel. Since some of the files _can_ be used to screw up your
|
||||
kernel. Since some of the files *can* be used to screw up your
|
||||
system, it is advisable to read both documentation and source
|
||||
before actually making adjustments.
|
||||
|
||||
1. /proc/sys/fs
|
||||
===============
|
||||
|
||||
Currently, these files are in /proc/sys/fs:
|
||||
Currently, these files might (depending on your configuration)
|
||||
show up in ``/proc/sys/fs``:
|
||||
|
||||
- aio-max-nr
|
||||
- aio-nr
|
||||
- dentry-state
|
||||
- dquot-max
|
||||
- dquot-nr
|
||||
- file-max
|
||||
- file-nr
|
||||
- inode-max
|
||||
- inode-nr
|
||||
- inode-state
|
||||
- nr_open
|
||||
- overflowuid
|
||||
- overflowgid
|
||||
- pipe-user-pages-hard
|
||||
- pipe-user-pages-soft
|
||||
- protected_fifos
|
||||
- protected_hardlinks
|
||||
- protected_regular
|
||||
- protected_symlinks
|
||||
- suid_dumpable
|
||||
- super-max
|
||||
- super-nr
|
||||
.. contents:: :local:
|
||||
|
||||
|
||||
aio-nr & aio-max-nr
|
||||
-------------------
|
||||
|
||||
aio-nr is the running total of the number of events specified on the
|
||||
io_setup system call for all currently active aio contexts. If aio-nr
|
||||
reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
|
||||
raising aio-max-nr does not result in the pre-allocation or re-sizing
|
||||
of any kernel data structures.
|
||||
``aio-nr`` shows the current system-wide number of asynchronous io
|
||||
requests. ``aio-max-nr`` allows you to change the maximum value
|
||||
``aio-nr`` can grow to. If ``aio-nr`` reaches ``aio-max-nr`` then
|
||||
``io_setup`` will fail with ``EAGAIN``. Note that raising
|
||||
``aio-max-nr`` does not result in the
|
||||
pre-allocation or re-sizing of any kernel data structures.
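
As a small illustration of how this limit surfaces to applications, the sketch below
calls the raw ``io_setup(2)``/``io_destroy(2)`` syscalls directly and reports the
``EAGAIN`` case described above::

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <linux/aio_abi.h>

  int main(void)
  {
      aio_context_t ctx = 0;

      /* ask for 128 in-kernel events; this allocation counts against aio-nr */
      if (syscall(SYS_io_setup, 128, &ctx) < 0) {
          if (errno == EAGAIN)
              fprintf(stderr, "aio-nr would exceed aio-max-nr\n");
          else
              fprintf(stderr, "io_setup: %s\n", strerror(errno));
          return 1;
      }

      /* ... submit I/O with io_submit(), reap it with io_getevents() ... */

      syscall(SYS_io_destroy, ctx);   /* releases the events from aio-nr */
      return 0;
  }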
|
||||
|
||||
|
||||
dentry-state
|
||||
------------
|
||||
|
||||
From linux/include/linux/dcache.h::
|
||||
This file shows the values in ``struct dentry_stat``, as defined in
|
||||
``linux/include/linux/dcache.h``::
|
||||
|
||||
struct dentry_stat_t dentry_stat {
|
||||
int nr_dentry;
|
||||
@ -76,55 +56,73 @@ From linux/include/linux/dcache.h::
|
||||
|
||||
Dentries are dynamically allocated and deallocated.
|
||||
|
||||
nr_dentry shows the total number of dentries allocated (active
|
||||
+ unused). nr_unused shows the number of dentries that are not
|
||||
``nr_dentry`` shows the total number of dentries allocated (active
|
||||
+ unused). ``nr_unused`` shows the number of dentries that are not
|
||||
actively used, but are saved in the LRU list for future reuse.
|
||||
|
||||
Age_limit is the age in seconds after which dcache entries
|
||||
can be reclaimed when memory is short and want_pages is
|
||||
nonzero when shrink_dcache_pages() has been called and the
|
||||
``age_limit`` is the age in seconds after which dcache entries
|
||||
can be reclaimed when memory is short and ``want_pages`` is
|
||||
nonzero when ``shrink_dcache_pages()`` has been called and the
|
||||
dcache isn't pruned yet.
|
||||
|
||||
nr_negative shows the number of unused dentries that are also
|
||||
``nr_negative`` shows the number of unused dentries that are also
|
||||
negative dentries which do not map to any files. Instead,
|
||||
they help speeding up rejection of non-existing files provided
|
||||
by the users.
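
A minimal sketch of reading these counters from user space, assuming the usual
six-field layout of ``/proc/sys/fs/dentry-state`` that mirrors the struct shown above::

  #include <stdio.h>

  int main(void)
  {
      long nr_dentry, nr_unused, age_limit, want_pages, nr_negative, dummy;
      FILE *f = fopen("/proc/sys/fs/dentry-state", "r");

      if (!f)
          return 1;
      /* six fields, mirroring struct dentry_stat_t */
      if (fscanf(f, "%ld %ld %ld %ld %ld %ld", &nr_dentry, &nr_unused,
                 &age_limit, &want_pages, &nr_negative, &dummy) == 6)
          printf("dentries: %ld total, %ld unused, %ld negative\n",
                 nr_dentry, nr_unused, nr_negative);
      fclose(f);
      return 0;
  }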
|
||||
|
||||
|
||||
dquot-max & dquot-nr
|
||||
--------------------
|
||||
|
||||
The file dquot-max shows the maximum number of cached disk
|
||||
quota entries.
|
||||
|
||||
The file dquot-nr shows the number of allocated disk quota
|
||||
entries and the number of free disk quota entries.
|
||||
|
||||
If the number of free cached disk quotas is very low and
|
||||
you have some awesome number of simultaneous system users,
|
||||
you might want to raise the limit.
|
||||
|
||||
|
||||
file-max & file-nr
|
||||
------------------
|
||||
|
||||
The value in file-max denotes the maximum number of file-
|
||||
The value in ``file-max`` denotes the maximum number of file-
|
||||
handles that the Linux kernel will allocate. When you get lots
|
||||
of error messages about running out of file handles, you might
|
||||
want to increase this limit.
|
||||
|
||||
Historically, the kernel was able to allocate file handles
|
||||
dynamically, but not to free them again. The three values in
|
||||
file-nr denote the number of allocated file handles, the number
|
||||
``file-nr`` denote the number of allocated file handles, the number
|
||||
of allocated but unused file handles, and the maximum number of
|
||||
file handles. Linux 2.6 always reports 0 as the number of free
|
||||
file handles. Linux 2.6 and later always reports 0 as the number of free
|
||||
file handles -- this is not an error, it just means that the
|
||||
number of allocated file handles exactly matches the number of
|
||||
used file handles.
|
||||
|
||||
Attempts to allocate more file descriptors than file-max are
|
||||
reported with printk, look for "VFS: file-max limit <number>
|
||||
reached".
|
||||
Attempts to allocate more file descriptors than ``file-max`` are
|
||||
reported with ``printk``, look for::
|
||||
|
||||
VFS: file-max limit <number> reached
|
||||
|
||||
in the kernel logs.
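
For example, a short illustrative program that reads the three ``file-nr`` fields and
compares usage against ``file-max``::

  #include <stdio.h>

  int main(void)
  {
      unsigned long allocated, unused, max;
      FILE *f = fopen("/proc/sys/fs/file-nr", "r");

      if (!f)
          return 1;
      /* allocated handles, free handles (always 0 since Linux 2.6), file-max */
      if (fscanf(f, "%lu %lu %lu", &allocated, &unused, &max) == 3)
          printf("%lu of %lu file handles allocated\n", allocated, max);
      fclose(f);
      return 0;
  }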
|
||||
|
||||
|
||||
inode-nr & inode-state
|
||||
----------------------
|
||||
|
||||
As with file handles, the kernel allocates the inode structures
|
||||
dynamically, but can't free them yet.
|
||||
|
||||
The file ``inode-nr`` contains the first two items from
|
||||
``inode-state``, so we'll skip to that file...
|
||||
|
||||
``inode-state`` contains three actual numbers and four dummies.
|
||||
The actual numbers are, in order of appearance, ``nr_inodes``,
|
||||
``nr_free_inodes`` and ``preshrink``.
|
||||
|
||||
``nr_inodes`` stands for the number of inodes the system has
|
||||
allocated.
|
||||
|
||||
``nr_free_inodes`` represents the number of free inodes (?) and
|
||||
preshrink is nonzero when the
|
||||
system needs to prune the inode list instead of allocating
|
||||
more.
|
||||
|
||||
|
||||
mount-max
|
||||
---------
|
||||
|
||||
This denotes the maximum number of mounts that may exist
|
||||
in a mount namespace.
|
||||
|
||||
|
||||
nr_open
|
||||
@ -132,39 +130,10 @@ nr_open
|
||||
|
||||
This denotes the maximum number of file-handles a process can
|
||||
allocate. Default value is 1024*1024 (1048576) which should be
|
||||
enough for most machines. Actual limit depends on RLIMIT_NOFILE
|
||||
enough for most machines. Actual limit depends on ``RLIMIT_NOFILE``
|
||||
resource limit.
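
The interaction with ``RLIMIT_NOFILE`` can be seen with a few lines of C; attempting to
raise the hard limit above the value in ``nr_open`` is rejected (sketch below, error
handling elided)::

  #include <stdio.h>
  #include <sys/resource.h>

  int main(void)
  {
      struct rlimit rl;

      if (getrlimit(RLIMIT_NOFILE, &rl) != 0)
          return 1;
      printf("soft: %llu hard: %llu\n",
             (unsigned long long)rl.rlim_cur,
             (unsigned long long)rl.rlim_max);

      /* lifting the hard limit above nr_open fails with EPERM */
      rl.rlim_max = rl.rlim_cur = 1 << 30;
      if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
          perror("setrlimit");
      return 0;
  }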
|
||||
|
||||
|
||||
inode-max, inode-nr & inode-state
|
||||
---------------------------------
|
||||
|
||||
As with file handles, the kernel allocates the inode structures
|
||||
dynamically, but can't free them yet.
|
||||
|
||||
The value in inode-max denotes the maximum number of inode
|
||||
handlers. This value should be 3-4 times larger than the value
|
||||
in file-max, since stdin, stdout and network sockets also
|
||||
need an inode struct to handle them. When you regularly run
|
||||
out of inodes, you need to increase this value.
|
||||
|
||||
The file inode-nr contains the first two items from
|
||||
inode-state, so we'll skip to that file...
|
||||
|
||||
Inode-state contains three actual numbers and four dummies.
|
||||
The actual numbers are, in order of appearance, nr_inodes,
|
||||
nr_free_inodes and preshrink.
|
||||
|
||||
Nr_inodes stands for the number of inodes the system has
|
||||
allocated, this can be slightly more than inode-max because
|
||||
Linux allocates them one pageful at a time.
|
||||
|
||||
Nr_free_inodes represents the number of free inodes (?) and
|
||||
preshrink is nonzero when the nr_inodes > inode-max and the
|
||||
system needs to prune the inode list instead of allocating
|
||||
more.
|
||||
|
||||
|
||||
overflowgid & overflowuid
|
||||
-------------------------
|
||||
|
||||
@ -192,7 +161,7 @@ pipe-user-pages-soft
|
||||
Maximum total number of pages a non-privileged user may allocate for pipes
|
||||
before the pipe size gets limited to a single page. Once this limit is reached,
|
||||
new pipes will be limited to a single page in size for this user in order to
|
||||
limit total memory usage, and trying to increase them using fcntl() will be
|
||||
limit total memory usage, and trying to increase them using ``fcntl()`` will be
|
||||
denied until usage goes below the limit again. The default value allows to
|
||||
allocate up to 1024 pipes at their default size. When set to 0, no limit is
|
||||
applied.
|
||||
@ -207,7 +176,7 @@ file.
|
||||
|
||||
When set to "0", writing to FIFOs is unrestricted.
|
||||
|
||||
When set to "1" don't allow O_CREAT open on FIFOs that we don't own
|
||||
When set to "1" don't allow ``O_CREAT`` open on FIFOs that we don't own
|
||||
in world writable sticky directories, unless they are owned by the
|
||||
owner of the directory.
|
||||
|
||||
@ -221,7 +190,7 @@ protected_hardlinks
|
||||
|
||||
A long-standing class of security issues is the hardlink-based
|
||||
time-of-check-time-of-use race, most commonly seen in world-writable
|
||||
directories like /tmp. The common method of exploitation of this flaw
|
||||
directories like ``/tmp``. The common method of exploitation of this flaw
|
||||
is to cross privilege boundaries when following a given hardlink (i.e. a
|
||||
root process follows a hardlink created by another user). Additionally,
|
||||
on systems without separated partitions, this stops unauthorized users
|
||||
@ -239,13 +208,13 @@ This protection is based on the restrictions in Openwall and grsecurity.
|
||||
protected_regular
|
||||
-----------------
|
||||
|
||||
This protection is similar to protected_fifos, but it
|
||||
This protection is similar to `protected_fifos`_, but it
|
||||
avoids writes to an attacker-controlled regular file, where a program
|
||||
expected to create one.
|
||||
|
||||
When set to "0", writing to regular files is unrestricted.
|
||||
|
||||
When set to "1" don't allow O_CREAT open on regular files that we
|
||||
When set to "1" don't allow ``O_CREAT`` open on regular files that we
|
||||
don't own in world writable sticky directories, unless they are
|
||||
owned by the owner of the directory.
|
||||
|
||||
@ -257,7 +226,7 @@ protected_symlinks
|
||||
|
||||
A long-standing class of security issues is the symlink-based
|
||||
time-of-check-time-of-use race, most commonly seen in world-writable
|
||||
directories like /tmp. The common method of exploitation of this flaw
|
||||
directories like ``/tmp``. The common method of exploitation of this flaw
|
||||
is to cross privilege boundaries when following a given symlink (i.e. a
|
||||
root process follows a symlink belonging to another user). For a likely
|
||||
incomplete list of hundreds of examples across the years, please see:
|
||||
@ -272,23 +241,25 @@ follower match, or when the directory owner matches the symlink's owner.
|
||||
This protection is based on the restrictions in Openwall and grsecurity.
|
||||
|
||||
|
||||
suid_dumpable:
|
||||
--------------
|
||||
suid_dumpable
|
||||
-------------
|
||||
|
||||
This value can be used to query and set the core dump mode for setuid
|
||||
or otherwise protected/tainted binaries. The modes are
|
||||
|
||||
= ========== ===============================================================
|
||||
0 (default) traditional behaviour. Any process which has changed
|
||||
0 (default) Traditional behaviour. Any process which has changed
|
||||
privilege levels or is execute only will not be dumped.
|
||||
1 (debug) all processes dump core when possible. The core dump is
|
||||
1 (debug) All processes dump core when possible. The core dump is
|
||||
owned by the current user and no security is applied. This is
|
||||
intended for system debugging situations only.
|
||||
Ptrace is unchecked.
|
||||
This is insecure as it allows regular users to examine the
|
||||
memory contents of privileged processes.
|
||||
2 (suidsafe) any binary which normally would not be dumped is dumped
|
||||
anyway, but only if the "core_pattern" kernel sysctl is set to
|
||||
2 (suidsafe) Any binary which normally would not be dumped is dumped
|
||||
anyway, but only if the ``core_pattern`` kernel sysctl (see
|
||||
:ref:`Documentation/admin-guide/sysctl/kernel.rst <core_pattern>`)
|
||||
is set to
|
||||
either a pipe handler or a fully qualified path. (For more
|
||||
details on this limitation, see CVE-2006-2451.) This mode is
|
||||
appropriate when administrators are attempting to debug
|
||||
@ -301,36 +272,11 @@ or otherwise protected/tainted binaries. The modes are
|
||||
= ========== ===============================================================
|
||||
|
||||
|
||||
super-max & super-nr
|
||||
--------------------
|
||||
|
||||
These numbers control the maximum number of superblocks, and
|
||||
thus the maximum number of mounted filesystems the kernel
|
||||
can have. You only need to increase super-max if you need to
|
||||
mount more filesystems than the current value in super-max
|
||||
allows you to.
|
||||
|
||||
|
||||
aio-nr & aio-max-nr
|
||||
-------------------
|
||||
|
||||
aio-nr shows the current system-wide number of asynchronous io
|
||||
requests. aio-max-nr allows you to change the maximum value
|
||||
aio-nr can grow to.
|
||||
|
||||
|
||||
mount-max
|
||||
---------
|
||||
|
||||
This denotes the maximum number of mounts that may exist
|
||||
in a mount namespace.
|
||||
|
||||
|
||||
|
||||
2. /proc/sys/fs/binfmt_misc
|
||||
===========================
|
||||
|
||||
Documentation for the files in /proc/sys/fs/binfmt_misc is
|
||||
Documentation for the files in ``/proc/sys/fs/binfmt_misc`` is
|
||||
in Documentation/admin-guide/binfmt-misc.rst.
|
||||
|
||||
|
||||
@ -343,28 +289,32 @@ creation of a user space library that implements the POSIX message queues
|
||||
API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
|
||||
Interfaces specification.)
|
||||
|
||||
The "mqueue" filesystem contains values for determining/setting the amount of
|
||||
resources used by the file system.
|
||||
The "mqueue" filesystem contains values for determining/setting the
|
||||
amount of resources used by the file system.
|
||||
|
||||
/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
|
||||
maximum number of message queues allowed on the system.
|
||||
``/proc/sys/fs/mqueue/queues_max`` is a read/write file for
|
||||
setting/getting the maximum number of message queues allowed on the
|
||||
system.
|
||||
|
||||
/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
|
||||
maximum number of messages in a queue value. In fact it is the limiting value
|
||||
for another (user) limit which is set in mq_open invocation. This attribute of
|
||||
a queue must be less or equal then msg_max.
|
||||
``/proc/sys/fs/mqueue/msg_max`` is a read/write file for
|
||||
setting/getting the maximum number of messages in a queue value. In
|
||||
fact it is the limiting value for another (user) limit which is set in
|
||||
``mq_open`` invocation. This attribute of a queue must be less than
|
||||
or equal to ``msg_max``.
|
||||
|
||||
/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
|
||||
maximum message size value (it is every message queue's attribute set during
|
||||
its creation).
|
||||
``/proc/sys/fs/mqueue/msgsize_max`` is a read/write file for
|
||||
setting/getting the maximum message size value (it is an attribute of
|
||||
every message queue, set during its creation).
|
||||
|
||||
/proc/sys/fs/mqueue/msg_default is a read/write file for setting/getting the
|
||||
default number of messages in a queue value if attr parameter of mq_open(2) is
|
||||
NULL. If it exceed msg_max, the default value is initialized msg_max.
|
||||
``/proc/sys/fs/mqueue/msg_default`` is a read/write file for
|
||||
setting/getting the default number of messages in a queue value if the
|
||||
``attr`` parameter of ``mq_open(2)`` is ``NULL``. If it exceeds
|
||||
``msg_max``, the default value is initialized to ``msg_max``.
|
||||
|
||||
/proc/sys/fs/mqueue/msgsize_default is a read/write file for setting/getting
|
||||
the default message size value if attr parameter of mq_open(2) is NULL. If it
|
||||
exceed msgsize_max, the default value is initialized msgsize_max.
|
||||
``/proc/sys/fs/mqueue/msgsize_default`` is a read/write file for
|
||||
setting/getting the default message size value if the ``attr``
|
||||
parameter of ``mq_open(2)`` is ``NULL``. If it exceeds
|
||||
``msgsize_max``, the default value is initialized to ``msgsize_max``.
|
||||
|
||||
4. /proc/sys/fs/epoll - Configuration options for the epoll interface
|
||||
=====================================================================
|
||||
@ -378,7 +328,7 @@ Every epoll file descriptor can store a number of files to be monitored
|
||||
for event readiness. Each one of these monitored files constitutes a "watch".
|
||||
This configuration option sets the maximum number of "watches" that are
|
||||
allowed for each user.
|
||||
Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
|
||||
on a 64bit one.
|
||||
The current default value for max_user_watches is the 1/25 (4%) of the
|
||||
available low memory, divided for the "watch" cost in bytes.
|
||||
Each "watch" costs roughly 90 bytes on a 32-bit kernel, and roughly 160 bytes
|
||||
on a 64-bit one.
|
||||
The current default value for ``max_user_watches`` is 4% of the
|
||||
available low memory, divided by the "watch" cost in bytes.
|
||||
|
@ -139,6 +139,8 @@ Highest valid capability of the running kernel. Exports
|
||||
``CAP_LAST_CAP`` from the kernel.
|
||||
|
||||
|
||||
.. _core_pattern:
|
||||
|
||||
core_pattern
|
||||
============
|
||||
|
||||
@ -174,6 +176,7 @@ core_pattern
|
||||
%f executable filename
|
||||
%E executable path
|
||||
%c maximum size of core file by resource limit RLIMIT_CORE
|
||||
%C CPU the task ran on
|
||||
%<OTHER> both are dropped
|
||||
======== ==========================================
|
||||
|
||||
@ -667,6 +670,15 @@ This is the default behavior.
|
||||
an oops event is detected.
|
||||
|
||||
|
||||
oops_limit
|
||||
==========
|
||||
|
||||
Number of kernel oopses after which the kernel should panic when
|
||||
``panic_on_oops`` is not set. Setting this to 0 disables checking
|
||||
the count. Setting this to 1 has the same effect as setting
|
||||
``panic_on_oops=1``. The default value is 10000.
|
||||
|
||||
|
||||
osrelease, ostype & version
|
||||
===========================
|
||||
|
||||
@ -1314,6 +1326,29 @@ watchdog work to be queued by the watchdog timer function, otherwise the NMI
|
||||
watchdog — if enabled — can detect a hard lockup condition.
|
||||
|
||||
|
||||
split_lock_mitigate (x86 only)
|
||||
==============================
|
||||
|
||||
On x86, each "split lock" imposes a system-wide performance penalty. On larger
|
||||
systems, large numbers of split locks from unprivileged users can result in
|
||||
denials of service to well-behaved and potentially more important users.
|
||||
|
||||
The kernel mitigates these bad users by detecting split locks and imposing
|
||||
penalties: forcing them to wait and only allowing one core to execute split
|
||||
locks at a time.
|
||||
|
||||
These mitigations can make those bad applications unbearably slow. Setting
|
||||
split_lock_mitigate=0 may restore some application performance, but will also
|
||||
increase system exposure to denial of service attacks from split lock users.
|
||||
|
||||
= ===================================================================
|
||||
0 Disable the mitigation mode - just warns about the split lock in
  the kernel log and exposes the system to denials of service from
  the split lockers.
|
||||
1 Enable the mitigation mode (this is the default) - penalizes the split
|
||||
lockers with intentional performance degradation.
|
||||
= ===================================================================
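
For reference, the construct being detected is simply an atomic operation whose operand
crosses a cache line boundary; the deliberately bad user-space example below is
illustrative only::

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      /* 4-byte value placed so that it straddles a 64-byte cache line */
      _Alignas(64) static unsigned char buf[128];
      uint32_t *p = (uint32_t *)(buf + 62);

      /* compiles to a locked read-modify-write on x86; this is a "split lock" */
      __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);
      printf("value: %u\n", *p);
      return 0;
  }

Whether running such a program merely logs a warning or also slows the offending task
down depends on the ``split_lock_detect=`` boot parameter and this sysctl.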
|
||||
|
||||
|
||||
stack_erasing
|
||||
=============
|
||||
|
||||
@ -1500,6 +1535,16 @@ entry will default to 2 instead of 0.
|
||||
2 Unprivileged calls to ``bpf()`` are disabled
|
||||
= =============================================================
|
||||
|
||||
|
||||
warn_limit
|
||||
==========
|
||||
|
||||
Number of kernel warnings after which the kernel should panic when
|
||||
``panic_on_warn`` is not set. Setting this to 0 disables checking
|
||||
the warning count. Setting this to 1 has the same effect as setting
|
||||
``panic_on_warn=1``. The default value is 0.
|
||||
|
||||
|
||||
watchdog
|
||||
========
|
||||
|
||||
|
@ -14,18 +14,20 @@ Orion family
|
||||
|
||||
Flavors:
|
||||
- 88F5082
|
||||
- 88F5181
|
||||
- 88F5181L
|
||||
- 88F5182
|
||||
- 88F5181 a.k.a Orion-1
|
||||
- 88F5181L a.k.a Orion-VoIP
|
||||
- 88F5182 a.k.a Orion-NAS
|
||||
|
||||
- Datasheet: https://web.archive.org/web/20210124231420/http://csclub.uwaterloo.ca/~board/ts7800/MV88F5182-datasheet.pdf
|
||||
- Programmer's User Guide: https://web.archive.org/web/20210124231536/http://csclub.uwaterloo.ca/~board/ts7800/MV88F5182-opensource-manual.pdf
|
||||
- User Manual: https://web.archive.org/web/20210124231631/http://csclub.uwaterloo.ca/~board/ts7800/MV88F5182-usermanual.pdf
|
||||
- Functional Errata: https://web.archive.org/web/20210704165540/https://www.digriz.org.uk/ts78xx/88F5182_Functional_Errata.pdf
|
||||
- 88F5281
|
||||
- 88F5281 a.k.a Orion-2
|
||||
|
||||
- Datasheet: https://web.archive.org/web/20131028144728/http://www.ocmodshop.com/images/reviews/networking/qnap_ts409u/marvel_88f5281_data_sheet.pdf
|
||||
- 88F6183
|
||||
- 88F6183 a.k.a Orion-1-90
|
||||
Homepage:
|
||||
https://web.archive.org/web/20080607215437/http://www.marvell.com/products/media/index.jsp
|
||||
Core:
|
||||
Feroceon 88fr331 (88f51xx) or 88fr531-vd (88f52xx) ARMv5 compatible
|
||||
Linux kernel mach directory:
|
||||
|
@ -163,7 +163,7 @@ FPDT Section 5.2.23 (signature == "FPDT")
|
||||
|
||||
**Firmware Performance Data Table**
|
||||
|
||||
Optional, not currently supported.
|
||||
Optional, useful for boot performance profiling.
|
||||
|
||||
GTDT Section 5.2.24 (signature == "GTDT")
|
||||
|
||||
|
@ -121,8 +121,9 @@ Header notes:
|
||||
to the base of DRAM, since memory below it is not
|
||||
accessible via the linear mapping
|
||||
1
|
||||
2MB aligned base may be anywhere in physical
|
||||
memory
|
||||
2MB aligned base such that all image_size bytes
|
||||
counted from the start of the image are within
|
||||
the 48-bit addressable range of physical memory
|
||||
Bits 4-63 Reserved.
|
||||
============= ===============================================================
|
||||
|
||||
@ -348,7 +349,7 @@ Before jumping into the kernel, the following conditions must be met:
|
||||
|
||||
- HWFGWTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b01.
|
||||
|
||||
For CPUs with the Scalable Matrix Extension FA64 feature (FEAT_SME_FA64)
|
||||
For CPUs with the Scalable Matrix Extension FA64 feature (FEAT_SME_FA64):
|
||||
|
||||
- If EL3 is present:
|
||||
|
||||
|
@ -275,6 +275,15 @@ HWCAP2_EBF16
|
||||
HWCAP2_SVE_EBF16
|
||||
Functionality implied by ID_AA64ZFR0_EL1.BF16 == 0b0010.
|
||||
|
||||
HWCAP2_CSSC
|
||||
Functionality implied by ID_AA64ISAR2_EL1.CSSC == 0b0001.
|
||||
|
||||
HWCAP2_RPRFM
|
||||
Functionality implied by ID_AA64ISAR2_EL1.RPRFM == 0b0001.
|
||||
|
||||
HWCAP2_SVE2P1
|
||||
Functionality implied by ID_AA64ZFR0_EL1.SVEver == 0b0010.
|
||||
|
||||
4. Unused AT_HWCAP bits
|
||||
-----------------------
|
||||
|
||||
|
@ -120,6 +120,8 @@ stable kernels.
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A710 | #2224489 | ARM64_ERRATUM_2224489 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A715 | #2645198 | ARM64_ERRATUM_2645198 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-X2 | #2119858 | ARM64_ERRATUM_2119858 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-X2 | #2224489 | ARM64_ERRATUM_2224489 |
|
||||
|
@ -52,6 +52,7 @@ model features for SVE is included in Appendix A.
|
||||
HWCAP2_SVEBITPERM
|
||||
HWCAP2_SVESHA3
|
||||
HWCAP2_SVESM4
|
||||
HWCAP2_SVE2P1
|
||||
|
||||
This list may be extended over time as the SVE architecture evolves.
|
||||
|
||||
|
@ -142,7 +142,7 @@ Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
|
||||
of inline encryption using the kernel crypto API. blk-crypto-fallback is built
|
||||
into the block layer, so it works on any block device without any special setup.
|
||||
Essentially, when a bio with an encryption context is submitted to a
|
||||
request_queue that doesn't support that encryption context, the block layer will
|
||||
block_device that doesn't support that encryption context, the block layer will
|
||||
handle en/decryption of the bio using blk-crypto-fallback.
|
||||
|
||||
For encryption, the data cannot be encrypted in-place, as callers usually rely
|
||||
@ -187,7 +187,7 @@ API presented to users of the block layer
|
||||
|
||||
``blk_crypto_config_supported()`` allows users to check ahead of time whether
|
||||
inline encryption with particular crypto settings will work on a particular
|
||||
request_queue -- either via hardware or via blk-crypto-fallback. This function
|
||||
block_device -- either via hardware or via blk-crypto-fallback. This function
|
||||
takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
|
||||
the actual bytes of the key and instead just contains the algorithm, data unit
|
||||
size, etc. This function can be useful if blk-crypto-fallback is disabled.
|
||||
@ -195,7 +195,7 @@ size, etc. This function can be useful if blk-crypto-fallback is disabled.
|
||||
``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
|
||||
|
||||
Users must call ``blk_crypto_start_using_key()`` before actually starting to use
|
||||
a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()``
|
||||
a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()``
|
||||
was called earlier). This is needed to initialize blk-crypto-fallback if it
|
||||
will be needed. This must not be called from the data path, as this may have to
|
||||
allocate resources, which may deadlock in that case.
|
||||
@ -207,7 +207,7 @@ for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
|
||||
later, as that happens automatically when the bio is freed or reset.
|
||||
|
||||
Finally, when done using inline encryption with a blk_crypto_key on a
|
||||
request_queue, users must call ``blk_crypto_evict_key()``. This ensures that
|
||||
block_device, users must call ``blk_crypto_evict_key()``. This ensures that
|
||||
the key is evicted from all keyslots it may be programmed into and unlinked from
|
||||
any kernel data structures it may be linked into.
|
||||
|
||||
@ -221,9 +221,9 @@ as follows:
|
||||
5. ``blk_crypto_evict_key()`` (after all I/O has completed)
|
||||
6. Zeroize the blk_crypto_key (this has no dedicated function)
|
||||
|
||||
If a blk_crypto_key is being used on multiple request_queues, then
|
||||
If a blk_crypto_key is being used on multiple block_devices, then
|
||||
``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
|
||||
and ``blk_crypto_evict_key()`` must be called on each request_queue.
|
||||
and ``blk_crypto_evict_key()`` must be called on each block_device.
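
A condensed, non-authoritative sketch of that sequence from a hypothetical in-kernel
user ("example_fs"); the argument lists below are paraphrased, so check
``include/linux/blk-crypto.h`` for the exact prototypes::

  /* hypothetical example; error handling and key derivation omitted */
  static int example_fs_setup_key(struct block_device *bdev,
                                  struct blk_crypto_key *key,
                                  const u8 *raw_key,
                                  enum blk_crypto_mode_num mode,
                                  unsigned int dun_bytes,
                                  unsigned int data_unit_size)
  {
      int err;

      /* steps 1-2: (optionally) check support, then initialize the key */
      err = blk_crypto_init_key(key, raw_key, mode, dun_bytes, data_unit_size);
      if (err)
          return err;

      /* step 3: declare intent to use the key on this block_device */
      return blk_crypto_start_using_key(bdev, key);
  }

  static void example_fs_submit(struct bio *bio,
                                const struct blk_crypto_key *key, u64 dun)
  {
      u64 dun_array[BLK_CRYPTO_DUN_ARRAY_SIZE] = { dun };

      /* step 4: attach the encryption context, then submit as usual */
      bio_crypt_set_ctx(bio, key, dun_array, GFP_NOIO);
      submit_bio(bio);
  }

  static void example_fs_remove_key(struct block_device *bdev,
                                    struct blk_crypto_key *key)
  {
      /* step 5: evict the key once all I/O using it has completed ... */
      blk_crypto_evict_key(bdev, key);
      /* step 6: ... then zeroize the raw key material (no dedicated helper) */
      memzero_explicit(key, sizeof(*key));
  }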
|
||||
|
||||
API presented to device drivers
|
||||
===============================
|
||||
|
@ -298,3 +298,48 @@ A: NO.
|
||||
|
||||
The BTF_ID macro does not cause a function to become part of the ABI
|
||||
any more than does the EXPORT_SYMBOL_GPL macro.
|
||||
|
||||
Q: What is the compatibility story for special BPF types in map values?
|
||||
-----------------------------------------------------------------------
|
||||
Q: Users are allowed to embed bpf_spin_lock, bpf_timer fields in their BPF map
|
||||
values (when using BTF support for BPF maps). This allows to use helpers for
|
||||
such objects on these fields inside map values. Users are also allowed to embed
|
||||
pointers to some kernel types (with __kptr and __kptr_ref BTF tags). Will the
|
||||
kernel preserve backwards compatibility for these features?
|
||||
|
||||
A: It depends. For bpf_spin_lock, bpf_timer: YES, for kptr and everything else:
|
||||
NO, but see below.
|
||||
|
||||
For struct types that have been added already, like bpf_spin_lock and bpf_timer,
|
||||
the kernel will preserve backwards compatibility, as they are part of UAPI.
|
||||
|
||||
For kptrs, they are also part of UAPI, but only with respect to the kptr
|
||||
mechanism. The types that you can use with a __kptr and __kptr_ref tagged
|
||||
pointer in your struct are NOT part of the UAPI contract. The supported types can
|
||||
and will change across kernel releases. However, operations like accessing kptr
|
||||
fields and bpf_kptr_xchg() helper will continue to be supported across kernel
|
||||
releases for the supported types.
|
||||
|
||||
For any other supported struct type, unless explicitly stated in this document
|
||||
and added to bpf.h UAPI header, such types can and will arbitrarily change their
|
||||
size, type, and alignment, or any other user visible API or ABI detail across
|
||||
kernel releases. The users must adapt their BPF programs to the new changes and
|
||||
update them to make sure their programs continue to work correctly.
|
||||
|
||||
NOTE: BPF subsystem specially reserves the 'bpf\_' prefix for type names, in
|
||||
order to introduce more special fields in the future. Hence, user programs must
|
||||
avoid defining types with 'bpf\_' prefix to not be broken in future releases.
|
||||
In other words, no backwards compatibility is guaranteed if one is using a type
|
||||
in BTF with 'bpf\_' prefix.
|
||||
|
||||
Q: What is the compatibility story for special BPF types in allocated objects?
|
||||
------------------------------------------------------------------------------
|
||||
Q: Same as above, but for allocated objects (i.e. objects allocated using
|
||||
bpf_obj_new for user defined types). Will the kernel preserve backwards
|
||||
compatibility for these features?
|
||||
|
||||
A: NO.
|
||||
|
||||
Unlike map value types, there are no stability guarantees for this case. The
|
||||
whole API to work with allocated objects and any support for special fields
|
||||
inside them is unstable (since it is exposed through kfuncs).
|
||||
|
@ -44,6 +44,33 @@ is a guarantee that the reported issue will be overlooked.**
|
||||
Submitting patches
|
||||
==================
|
||||
|
||||
Q: How do I run BPF CI on my changes before sending them out for review?
|
||||
------------------------------------------------------------------------
|
||||
A: BPF CI is GitHub based and hosted at https://github.com/kernel-patches/bpf.
|
||||
While GitHub also provides a CLI that can be used to accomplish the same
|
||||
results, here we focus on the UI based workflow.
|
||||
|
||||
The following steps lay out how to start a CI run for your patches:
|
||||
|
||||
- Create a fork of the aforementioned repository in your own account (one time
|
||||
action)
|
||||
|
||||
- Clone the fork locally, check out a new branch tracking either the bpf-next
|
||||
or bpf branch, and apply your to-be-tested patches on top of it
|
||||
|
||||
- Push the local branch to your fork and create a pull request against
|
||||
kernel-patches/bpf's bpf-next_base or bpf_base branch, respectively
|
||||
|
||||
Shortly after the pull request has been created, the CI workflow will run. Note
|
||||
that capacity is shared with patches submitted upstream being checked and so
|
||||
depending on utilization the run can take a while to finish.
|
||||
|
||||
Note furthermore that both base branches (bpf-next_base and bpf_base) will be
|
||||
updated as patches are pushed to the respective upstream branches they track. As
|
||||
such, an attempt will automatically be made to rebase your patch set as well.
|
||||
This behavior can result in a CI run being aborted and restarted with the new
|
||||
base line.
|
||||
|
||||
Q: To which mailing list do I need to submit my BPF patches?
|
||||
------------------------------------------------------------
|
||||
A: Please submit your BPF patches to the bpf kernel mailing list:
|
||||
|
485
Documentation/bpf/bpf_iterators.rst
Normal file
485
Documentation/bpf/bpf_iterators.rst
Normal file
@ -0,0 +1,485 @@
|
||||
=============
|
||||
BPF Iterators
|
||||
=============
|
||||
|
||||
|
||||
----------
|
||||
Motivation
|
||||
----------
|
||||
|
||||
There are a few existing ways to dump kernel data into user space. The most
|
||||
popular one is the ``/proc`` system. For example, ``cat /proc/net/tcp6`` dumps
|
||||
all tcp6 sockets in the system, and ``cat /proc/net/netlink`` dumps all netlink
|
||||
sockets in the system. However, their output format tends to be fixed, and if
|
||||
users want more information about these sockets, they have to patch the kernel,
|
||||
which often takes time to publish upstream and release. The same is true for popular
|
||||
tools like `ss <https://man7.org/linux/man-pages/man8/ss.8.html>`_ where any
|
||||
additional information needs a kernel patch.
|
||||
|
||||
To solve this problem, the `drgn
|
||||
<https://www.kernel.org/doc/html/latest/bpf/drgn.html>`_ tool is often used to
|
||||
dig out the kernel data with no kernel change. However, the main drawback for
|
||||
drgn is performance, as it cannot do pointer tracing inside the kernel. In
|
||||
addition, drgn cannot validate a pointer value and may read invalid data if the
|
||||
pointer becomes invalid inside the kernel.
|
||||
|
||||
The BPF iterator solves the above problem by providing flexibility on what data
|
||||
(e.g., tasks, bpf_maps, etc.) to collect by calling BPF programs for each kernel
|
||||
data object.
|
||||
|
||||
----------------------
|
||||
How BPF Iterators Work
|
||||
----------------------
|
||||
|
||||
A BPF iterator is a type of BPF program that allows users to iterate over
|
||||
specific types of kernel objects. Unlike traditional BPF tracing programs that
|
||||
allow users to define callbacks that are invoked at particular points of
|
||||
execution in the kernel, BPF iterators allow users to define callbacks that
|
||||
should be executed for every entry in a variety of kernel data structures.
|
||||
|
||||
For example, users can define a BPF iterator that iterates over every task on
|
||||
the system and dumps the total amount of CPU runtime currently used by each of
|
||||
them. Another BPF task iterator may instead dump the cgroup information for each
|
||||
task. Such flexibility is the core value of BPF iterators.
|
||||
|
||||
A BPF program is always loaded into the kernel at the behest of a user space
|
||||
process. A user space process loads a BPF program by opening and initializing
|
||||
the program skeleton as required and then invoking a syscall to have the BPF
|
||||
program verified and loaded by the kernel.
|
||||
|
||||
In traditional tracing programs, a program is activated by having user space
|
||||
obtain a ``bpf_link`` to the program with ``bpf_program__attach()``. Once
|
||||
activated, the program callback will be invoked whenever the tracepoint is
|
||||
triggered in the main kernel. For BPF iterator programs, a ``bpf_link`` to the
|
||||
program is obtained using ``bpf_link_create()``, and the program callback is
|
||||
invoked by issuing system calls from user space.
|
||||
|
||||
Next, let us see how you can use the iterators to iterate on kernel objects and
|
||||
read data.
|
||||
|
||||
------------------------
|
||||
How to Use BPF iterators
|
||||
------------------------
|
||||
|
||||
BPF selftests are a great resource to illustrate how to use the iterators. In
|
||||
this section, we’ll walk through a BPF selftest which shows how to load and use
|
||||
a BPF iterator program. To begin, we’ll look at `bpf_iter.c
|
||||
<https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/prog_tests/bpf_iter.c>`_,
|
||||
which illustrates how to load and trigger BPF iterators on the user space side.
|
||||
Later, we’ll look at a BPF program that runs in kernel space.
|
||||
|
||||
Loading a BPF iterator in the kernel from user space typically involves the
|
||||
following steps:
|
||||
|
||||
* The BPF program is loaded into the kernel through ``libbpf``. Once the kernel
|
||||
has verified and loaded the program, it returns a file descriptor (fd) to user
|
||||
space.
|
||||
* Obtain a ``link_fd`` to the BPF program by calling ``bpf_link_create()``
  with the BPF program file descriptor received from the kernel.
* Next, obtain a BPF iterator file descriptor (``bpf_iter_fd``) by calling
  ``bpf_iter_create()`` with the ``link_fd`` obtained in the previous step.
|
||||
* Trigger the iteration by calling ``read(bpf_iter_fd)`` until no data is
|
||||
available.
|
||||
* Close the iterator fd using ``close(bpf_iter_fd)``.
|
||||
* If needed to reread the data, get a new ``bpf_iter_fd`` and do the read again.
|
||||
|
||||
The following are a few examples of selftest BPF iterator programs:
|
||||
|
||||
* `bpf_iter_tcp4.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c>`_
|
||||
* `bpf_iter_task_vma.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c>`_
|
||||
* `bpf_iter_task_file.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c>`_
|
||||
|
||||
Let us look at ``bpf_iter_task_file.c``, which runs in kernel space:
|
||||
|
||||
Here is the definition of ``bpf_iter__task_file`` in `vmlinux.h
|
||||
<https://facebookmicrosites.github.io/bpf/blog/2020/02/19/bpf-portability-and-co-re.html#btf>`_.
|
||||
Any struct name in ``vmlinux.h`` in the format ``bpf_iter__<iter_name>``
|
||||
represents a BPF iterator. The suffix ``<iter_name>`` represents the type of
|
||||
iterator.
|
||||
|
||||
::
|
||||
|
||||
struct bpf_iter__task_file {
|
||||
union {
|
||||
struct bpf_iter_meta *meta;
|
||||
};
|
||||
union {
|
||||
struct task_struct *task;
|
||||
};
|
||||
u32 fd;
|
||||
union {
|
||||
struct file *file;
|
||||
};
|
||||
};
|
||||
|
||||
In the above code, the field 'meta' contains the metadata, which is the same for
|
||||
all BPF iterator programs. The rest of the fields are specific to different
|
||||
iterators. For example, for task_file iterators, the kernel layer provides the
|
||||
'task', 'fd' and 'file' field values. The 'task' and 'file' are `reference
|
||||
counted
|
||||
<https://facebookmicrosites.github.io/bpf/blog/2018/08/31/object-lifetime.html#file-descriptors-and-reference-counters>`_,
|
||||
so they won't go away when the BPF program runs.
|
||||
|
||||
Here is a snippet from the ``bpf_iter_task_file.c`` file:
|
||||
|
||||
::
|
||||
|
||||
SEC("iter/task_file")
|
||||
int dump_task_file(struct bpf_iter__task_file *ctx)
|
||||
{
|
||||
struct seq_file *seq = ctx->meta->seq;
|
||||
struct task_struct *task = ctx->task;
|
||||
struct file *file = ctx->file;
|
||||
__u32 fd = ctx->fd;
|
||||
|
||||
if (task == NULL || file == NULL)
|
||||
return 0;
|
||||
|
||||
if (ctx->meta->seq_num == 0) {
|
||||
count = 0;
|
||||
BPF_SEQ_PRINTF(seq, " tgid gid fd file\n");
|
||||
}
|
||||
|
||||
if (tgid == task->tgid && task->tgid != task->pid)
|
||||
count++;
|
||||
|
||||
if (last_tgid != task->tgid) {
|
||||
last_tgid = task->tgid;
|
||||
unique_tgid_count++;
|
||||
}
|
||||
|
||||
BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
|
||||
(long)file->f_op);
|
||||
return 0;
|
||||
}
|
||||
|
||||
In the above example, the section name ``SEC(iter/task_file)`` indicates that
|
||||
the program is a BPF iterator program to iterate all files from all tasks. The
|
||||
context of the program is ``bpf_iter__task_file`` struct.
|
||||
|
||||
The user space program invokes the BPF iterator program running in the kernel
|
||||
by issuing a ``read()`` syscall. Once invoked, the BPF
|
||||
program can export data to user space using a variety of BPF helper functions.
|
||||
You can use either ``bpf_seq_printf()`` (and BPF_SEQ_PRINTF helper macro) or
|
||||
``bpf_seq_write()`` function based on whether you need formatted output or just
|
||||
binary data, respectively. For binary-encoded data, the user space applications
|
||||
can process the data from ``bpf_seq_write()`` as needed. For the formatted data,
|
||||
you can use ``cat <path>`` to print the results similar to ``cat
|
||||
/proc/net/netlink`` after pinning the BPF iterator to the bpffs mount. Later,
|
||||
use ``rm -f <path>`` to remove the pinned iterator.
|
||||
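As a sketch of the binary path, a BPF iterator program could emit fixed-size
records with ``bpf_seq_write()`` instead of formatted text; the record layout
below is only an illustration, and a matching user space decoder is assumed:

::

  struct file_record {
          __u32 tgid;
          __u32 fd;
  };

  /* Inside the iterator program, after the NULL checks on task and file: */
  struct file_record rec = {
          .tgid = task->tgid,
          .fd = fd,
  };

  bpf_seq_write(seq, &rec, sizeof(rec));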
|
||||
For example, you can use the following command to create a BPF iterator from the
|
||||
``bpf_iter_ipv6_route.o`` object file and pin it to the ``/sys/fs/bpf/my_route``
|
||||
path:
|
||||
|
||||
::
|
||||
|
||||
$ bpftool iter pin ./bpf_iter_ipv6_route.o /sys/fs/bpf/my_route
|
||||
|
||||
And then print out the results using the following command:
|
||||
|
||||
::
|
||||
|
||||
$ cat /sys/fs/bpf/my_route
|
||||
|
||||
|
||||
-------------------------------------------------------
|
||||
Implement Kernel Support for BPF Iterator Program Types
|
||||
-------------------------------------------------------
|
||||
|
||||
To implement a BPF iterator in the kernel, the developer must make a one-time
|
||||
change to the following key data structure defined in the `bpf.h
|
||||
<https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/include/linux/bpf.h>`_
|
||||
file.
|
||||
|
||||
::

    struct bpf_iter_reg {
            const char *target;
            bpf_iter_attach_target_t attach_target;
            bpf_iter_detach_target_t detach_target;
            bpf_iter_show_fdinfo_t show_fdinfo;
            bpf_iter_fill_link_info_t fill_link_info;
            bpf_iter_get_func_proto_t get_func_proto;
            u32 ctx_arg_info_size;
            u32 feature;
            struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
            const struct bpf_iter_seq_info *seq_info;
    };
|
||||
|
||||
After filling the data structure fields, call ``bpf_iter_reg_target()`` to
|
||||
register the iterator to the main BPF iterator subsystem.
|
||||
|
||||
The following is the breakdown for each field in struct ``bpf_iter_reg``.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 50
|
||||
:header-rows: 1
|
||||
|
||||
* - Fields
|
||||
- Description
|
||||
* - target
|
||||
- Specifies the name of the BPF iterator. For example: ``bpf_map``,
|
||||
``bpf_map_elem``. The name should be different from other ``bpf_iter`` target names in the kernel.
|
||||
* - attach_target and detach_target
|
||||
- Allows for target specific ``link_create`` action since some targets
|
||||
may need special processing. Called during the user space link_create stage.
|
||||
* - show_fdinfo and fill_link_info
|
||||
- Called to fill target specific information when user tries to get link
|
||||
info associated with the iterator.
|
||||
* - get_func_proto
|
||||
- Permits a BPF iterator to access BPF helpers specific to the iterator.
|
||||
* - ctx_arg_info_size and ctx_arg_info
|
||||
- Specifies the verifier states for BPF program arguments associated with
|
||||
the bpf iterator.
|
||||
* - feature
|
||||
- Specifies certain action requests in the kernel BPF iterator
|
||||
infrastructure. Currently, only BPF_ITER_RESCHED is supported. This means
|
||||
that the kernel function cond_resched() is called to avoid other kernel
|
||||
subsystem (e.g., rcu) misbehaving.
|
||||
   * - seq_info
     - Specifies the ``seq_file`` operations for the iterator and the helpers
       used to initialize or free the private data for the corresponding
       ``seq_file``.
|
||||
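The following is a simplified sketch, not taken verbatim from the kernel, of
how a hypothetical ``foo`` iterator could be registered. The ``foo_seq_info``
operations and the ``bpf_iter__foo`` context struct are assumed to be defined
elsewhere:

::

  static struct bpf_iter_reg foo_reg_info = {
          .target                 = "foo",
          .feature                = BPF_ITER_RESCHED,
          .ctx_arg_info_size      = 1,
          .ctx_arg_info           = {
                  { offsetof(struct bpf_iter__foo, foo),
                    PTR_TO_BTF_ID_OR_NULL },
          },
          .seq_info               = &foo_seq_info,
  };

  static int __init foo_iter_init(void)
  {
          /* Register the iterator with the main BPF iterator subsystem. */
          return bpf_iter_reg_target(&foo_reg_info);
  }
  late_initcall(foo_iter_init);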
|
||||
|
||||
`Click here
|
||||
<https://lore.kernel.org/bpf/20210212183107.50963-2-songliubraving@fb.com/>`_
|
||||
to see an implementation of the ``task_vma`` BPF iterator in the kernel.
|
||||
|
||||
---------------------------------
|
||||
Parameterizing BPF Task Iterators
|
||||
---------------------------------
|
||||
|
||||
By default, BPF iterators walk through all the objects of the specified types
|
||||
(processes, cgroups, maps, etc.) across the entire system to read relevant
|
||||
kernel data. But often, there are cases where we only care about a much smaller
|
||||
subset of iterable kernel objects, such as only iterating tasks within a
|
||||
specific process. Therefore, BPF iterator programs support filtering out objects
|
||||
from iteration by allowing user space to configure the iterator program when it
|
||||
is attached.
|
||||
|
||||
--------------------------
|
||||
BPF Task Iterator Program
|
||||
--------------------------
|
||||
|
||||
The following code is a BPF iterator program to print files and task information
|
||||
through the ``seq_file`` of the iterator. It is a standard BPF iterator program
|
||||
that visits every file of an iterator. We will use this BPF program in our
|
||||
example later.
|
||||
|
||||
::
|
||||
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
SEC("iter/task_file")
|
||||
int dump_task_file(struct bpf_iter__task_file *ctx)
|
||||
{
|
||||
struct seq_file *seq = ctx->meta->seq;
|
||||
struct task_struct *task = ctx->task;
|
||||
struct file *file = ctx->file;
|
||||
__u32 fd = ctx->fd;
|
||||
if (task == NULL || file == NULL)
|
||||
return 0;
|
||||
if (ctx->meta->seq_num == 0) {
|
||||
BPF_SEQ_PRINTF(seq, " tgid pid fd file\n");
|
||||
}
|
||||
BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
|
||||
(long)file->f_op);
|
||||
return 0;
|
||||
}
|
||||
|
||||
----------------------------------------
|
||||
Creating a File Iterator with Parameters
|
||||
----------------------------------------
|
||||
|
||||
Now, let us look at how to create an iterator that includes only files of a
|
||||
process.
|
||||
|
||||
First, fill the ``bpf_iter_attach_opts`` struct as shown below:
|
||||
|
||||
::
|
||||
|
||||
LIBBPF_OPTS(bpf_iter_attach_opts, opts);
|
||||
union bpf_iter_link_info linfo;
|
||||
memset(&linfo, 0, sizeof(linfo));
|
||||
linfo.task.pid = getpid();
|
||||
opts.link_info = &linfo;
|
||||
opts.link_info_len = sizeof(linfo);
|
||||
|
||||
``linfo.task.pid``, if it is non-zero, directs the kernel to create an iterator
|
||||
that only includes opened files for the process with the specified ``pid``. In
|
||||
this example, we will only be iterating files for our process. If
|
||||
``linfo.task.pid`` is zero, the iterator will visit every opened file of every
|
||||
process. Similarly, ``linfo.task.tid`` directs the kernel to create an iterator
|
||||
that visits opened files of a specific thread, not a process. In this example,
|
||||
``linfo.task.tid`` is different from ``linfo.task.pid`` only if the thread has a
|
||||
separate file descriptor table. In most circumstances, all process threads share
|
||||
a single file descriptor table.
|
||||
|
||||
Now, in the user space program, pass a pointer to this struct to
``bpf_program__attach_iter()``.
|
||||
|
||||
::
|
||||
|
||||
    link = bpf_program__attach_iter(prog, &opts);
    iter_fd = bpf_iter_create(bpf_link__fd(link));
|
||||
|
||||
If both *tid* and *pid* are zero, an iterator created from this struct
``bpf_iter_attach_opts`` will include every opened file of every task in the
system (in the current *pid* namespace, actually). This is the same as passing
NULL as the second argument to ``bpf_program__attach_iter()``.
|
||||
|
||||
The whole program looks like the following code:
|
||||
|
||||
::
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <bpf/bpf.h>
|
||||
#include <bpf/libbpf.h>
|
||||
#include "bpf_iter_task_ex.skel.h"
|
||||
|
||||
static int do_read_opts(struct bpf_program *prog, struct bpf_iter_attach_opts *opts)
|
||||
{
|
||||
struct bpf_link *link;
|
||||
char buf[16] = {};
|
||||
int iter_fd = -1, len;
|
||||
int ret = 0;
|
||||
|
||||
link = bpf_program__attach_iter(prog, opts);
|
||||
if (!link) {
|
||||
fprintf(stderr, "bpf_program__attach_iter() fails\n");
|
||||
return -1;
|
||||
}
|
||||
iter_fd = bpf_iter_create(bpf_link__fd(link));
|
||||
if (iter_fd < 0) {
|
||||
fprintf(stderr, "bpf_iter_create() fails\n");
|
||||
ret = -1;
|
||||
goto free_link;
|
||||
}
|
||||
        /* Do not check the contents; just ensure read() completes without error. */
|
||||
while ((len = read(iter_fd, buf, sizeof(buf) - 1)) > 0) {
|
||||
buf[len] = 0;
|
||||
printf("%s", buf);
|
||||
}
|
||||
printf("\n");
|
||||
free_link:
|
||||
if (iter_fd >= 0)
|
||||
close(iter_fd);
|
||||
bpf_link__destroy(link);
|
||||
        return ret;
|
||||
}
|
||||
|
||||
static void test_task_file(void)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_iter_attach_opts, opts);
|
||||
struct bpf_iter_task_ex *skel;
|
||||
union bpf_iter_link_info linfo;
|
||||
skel = bpf_iter_task_ex__open_and_load();
|
||||
if (skel == NULL)
|
||||
return;
|
||||
memset(&linfo, 0, sizeof(linfo));
|
||||
linfo.task.pid = getpid();
|
||||
opts.link_info = &linfo;
|
||||
opts.link_info_len = sizeof(linfo);
|
||||
printf("PID %d\n", getpid());
|
||||
do_read_opts(skel->progs.dump_task_file, &opts);
|
||||
bpf_iter_task_ex__destroy(skel);
|
||||
}
|
||||
|
||||
int main(int argc, const char * const * argv)
|
||||
{
|
||||
test_task_file();
|
||||
return 0;
|
||||
}
|
||||
|
||||
The following lines are the output of the program.
|
||||
::
|
||||
|
||||
PID 1859
|
||||
|
||||
tgid pid fd file
|
||||
1859 1859 0 ffffffff82270aa0
|
||||
1859 1859 1 ffffffff82270aa0
|
||||
1859 1859 2 ffffffff82270aa0
|
||||
1859 1859 3 ffffffff82272980
|
||||
1859 1859 4 ffffffff8225e120
|
||||
1859 1859 5 ffffffff82255120
|
||||
1859 1859 6 ffffffff82254f00
|
||||
1859 1859 7 ffffffff82254d80
|
||||
1859 1859 8 ffffffff8225abe0
|
||||
|
||||
------------------
|
||||
Without Parameters
|
||||
------------------
|
||||
|
||||
Let us look at how a BPF iterator without parameters skips files of other
|
||||
processes in the system. In this case, the BPF program has to check the pid or
|
||||
the tid of tasks, or it will receive every opened file in the system (in the
|
||||
current *pid* namespace, actually). So, we usually add a global variable in the
|
||||
BPF program to pass a *pid* to the BPF program.
|
||||
|
||||
The BPF program would look like the following block.
|
||||
|
||||
::
|
||||
|
||||
......
|
||||
int target_pid = 0;
|
||||
|
||||
SEC("iter/task_file")
|
||||
int dump_task_file(struct bpf_iter__task_file *ctx)
|
||||
{
|
||||
......
|
||||
if (task->tgid != target_pid) /* Check task->pid instead to check thread IDs */
|
||||
return 0;
|
||||
BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
|
||||
(long)file->f_op);
|
||||
return 0;
|
||||
}
|
||||
|
||||
The user space program would look like the following block:
|
||||
|
||||
::
|
||||
|
||||
......
|
||||
static void test_task_file(void)
|
||||
{
|
||||
......
|
||||
skel = bpf_iter_task_ex__open_and_load();
|
||||
if (skel == NULL)
|
||||
return;
|
||||
skel->bss->target_pid = getpid(); /* process ID. For thread id, use gettid() */
|
||||
memset(&linfo, 0, sizeof(linfo));
|
||||
linfo.task.pid = getpid();
|
||||
opts.link_info = &linfo;
|
||||
opts.link_info_len = sizeof(linfo);
|
||||
......
|
||||
}
|
||||
|
||||
``target_pid`` is a global variable in the BPF program. The user space program
should initialize the variable with a process ID so that the BPF program skips
the opened files of other processes. When you parametrize a BPF iterator, the
iterator calls the BPF program fewer times, which can save significant
resources.
|
||||
|
||||
---------------------------
|
||||
Parametrizing VMA Iterators
|
||||
---------------------------
|
||||
|
||||
By default, a BPF VMA iterator includes every VMA in every process. However,
you can still specify a process or a thread to include only its VMAs. Unlike
files, a thread cannot have a separate address space (since Linux 2.6.0-test6),
so using *tid* here makes no difference from using *pid*.
|
||||
|
||||
----------------------------
|
||||
Parametrizing Task Iterators
|
||||
----------------------------
|
||||
|
||||
A BPF task iterator with *pid* includes all tasks (threads) of a process. The
|
||||
BPF program receives these tasks one after another. You can specify a BPF task
|
||||
iterator with *tid* parameter to include only the tasks that match the given
|
||||
*tid*.
|
@ -1062,4 +1062,9 @@ format.::
|
||||
7. Testing
|
||||
==========
|
||||
|
||||
Kernel bpf selftest `test_btf.c` provides extensive set of BTF-related tests.
|
||||
The kernel BPF selftest `tools/testing/selftests/bpf/prog_tests/btf.c`_
|
||||
provides an extensive set of BTF-related tests.
|
||||
|
||||
.. Links
|
||||
.. _tools/testing/selftests/bpf/prog_tests/btf.c:
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/prog_tests/btf.c
|
||||
|
@ -24,11 +24,13 @@ that goes into great technical depth about the BPF Architecture.
|
||||
maps
|
||||
bpf_prog_run
|
||||
classic_vs_extended.rst
|
||||
bpf_iterators
|
||||
bpf_licensing
|
||||
test_debug
|
||||
clang-notes
|
||||
linux-notes
|
||||
other
|
||||
redirect
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
|
@ -122,11 +122,11 @@ BPF_END 0xd0 byte swap operations (see `Byte swap instructions`_ below)
|
||||
|
||||
``BPF_XOR | BPF_K | BPF_ALU`` means::
|
||||
|
||||
src_reg = (u32) src_reg ^ (u32) imm32
|
||||
dst_reg = (u32) dst_reg ^ (u32) imm32
|
||||
|
||||
``BPF_XOR | BPF_K | BPF_ALU64`` means::
|
||||
|
||||
src_reg = src_reg ^ imm32
|
||||
dst_reg = dst_reg ^ imm32
|
||||
|
||||
|
||||
Byte swap instructions
|
||||
|
@ -72,6 +72,30 @@ argument as its size. By default, without __sz annotation, the size of the type
|
||||
of the pointer is used. Without __sz annotation, a kfunc cannot accept a void
|
||||
pointer.
|
||||
|
||||
2.2.2 __k Annotation
|
||||
--------------------
|
||||
|
||||
This annotation is only understood for scalar arguments. It indicates that the
verifier must check that the scalar argument is a known constant which is not a
size parameter, and that the value of the constant is relevant to the safety of
the program.
|
||||
|
||||
An example is given below::
|
||||
|
||||
void *bpf_obj_new(u32 local_type_id__k, ...)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, bpf_obj_new uses the local_type_id argument to find out the size of that
type ID in the program's BTF and returns a sized pointer to it. Each type ID
will have a distinct size; hence, it is crucial to treat each such call as
distinct when the values don't match during verifier state pruning checks.
|
||||
|
||||
Hence, whenever a constant scalar argument is accepted by a kfunc which is not
a size parameter, and the value of the constant matters for program safety, the
__k suffix should be used.
|
||||
|
||||
.. _BPF_kfunc_nodef:
|
||||
|
||||
2.3 Using an existing kernel function
|
||||
@ -137,22 +161,20 @@ KF_ACQUIRE and KF_RET_NULL flags.
|
||||
--------------------------
|
||||
|
||||
The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It
|
||||
indicates that all pointer arguments will always have a guaranteed lifetime,
|
||||
and pointers to kernel objects are always passed to helpers in their unmodified
|
||||
form (as obtained from acquire kfuncs).
|
||||
indicates that all pointer arguments are valid, and that all pointers to
|
||||
BTF objects have been passed in their unmodified form (that is, at a zero
|
||||
offset, and without having been obtained from walking another pointer).
|
||||
|
||||
It can be used to enforce that a pointer to a refcounted object acquired from a
|
||||
kfunc or BPF helper is passed as an argument to this kfunc without any
|
||||
modifications (e.g. pointer arithmetic) such that it is trusted and points to
|
||||
the original object.
|
||||
There are two types of pointers to kernel objects which are considered "valid":
|
||||
|
||||
Meanwhile, it is also allowed to pass pointers to normal memory to such kfuncs;
those can have a non-zero offset.
|
||||
1. Pointers which are passed as tracepoint or struct_ops callback arguments.
|
||||
2. Pointers which were returned from a KF_ACQUIRE or KF_KPTR_GET kfunc.
|
||||
|
||||
This flag is often used for kfuncs that operate (change some property, perform
|
||||
some operation) on an object that was obtained using an acquire kfunc. Such
|
||||
kfuncs need an unchanged pointer to ensure the integrity of the operation being
|
||||
performed on the expected object.
|
||||
Pointers to non-BTF objects (e.g. scalar pointers) may also be passed to
|
||||
KF_TRUSTED_ARGS kfuncs, and may have a non-zero offset.
|
||||
|
||||
The definition of "valid" pointers is subject to change at any time, and has
|
||||
absolutely no ABI stability guarantees.
|
||||
|
||||
2.4.6 KF_SLEEPABLE flag
|
||||
-----------------------
|
||||
@ -169,6 +191,15 @@ rebooting or panicking. Due to this, additional restrictions apply to these
calls. At the moment they only require the CAP_SYS_BOOT capability, but more
can be added later.
|
||||
|
||||
2.4.8 KF_RCU flag
|
||||
-----------------
|
||||
|
||||
The KF_RCU flag is used for kfuncs which take an RCU pointer as their argument.
When used together with KF_ACQUIRE, it indicates the kfunc should have a
single argument which must be a trusted argument or a MEM_RCU pointer.
The argument may have a reference count of 0, and the kfunc must take this
into consideration.
|
||||
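As a sketch only, such a kfunc would carry the KF_RCU flag in its ID set when
it is registered; ``bpf_example_kfunc`` and the set name below are hypothetical::

        BTF_SET8_START(example_kfunc_ids)
        BTF_ID_FLAGS(func, bpf_example_kfunc, KF_RCU)
        BTF_SET8_END(example_kfunc_ids)

        static const struct btf_kfunc_id_set example_kfunc_set = {
                .owner = THIS_MODULE,
                .set   = &example_kfunc_ids,
        };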
|
||||
2.5 Registering the kfuncs
|
||||
--------------------------
|
||||
|
||||
@ -191,3 +222,201 @@ type. An example is shown below::
|
||||
return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_task_kfunc_set);
|
||||
}
|
||||
late_initcall(init_subsystem);
|
||||
|
||||
3. Core kfuncs
|
||||
==============
|
||||
|
||||
The BPF subsystem provides a number of "core" kfuncs that are potentially
|
||||
applicable to a wide variety of different possible use cases and programs.
|
||||
Those kfuncs are documented here.
|
||||
|
||||
3.1 struct task_struct * kfuncs
|
||||
-------------------------------
|
||||
|
||||
There are a number of kfuncs that allow ``struct task_struct *`` objects to be
|
||||
used as kptrs:
|
||||
|
||||
.. kernel-doc:: kernel/bpf/helpers.c
|
||||
:identifiers: bpf_task_acquire bpf_task_release
|
||||
|
||||
These kfuncs are useful when you want to acquire or release a reference to a
|
||||
``struct task_struct *`` that was passed as e.g. a tracepoint arg, or a
|
||||
struct_ops callback arg. For example:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/**
|
||||
* A trivial example tracepoint program that shows how to
|
||||
* acquire and release a struct task_struct * pointer.
|
||||
*/
|
||||
SEC("tp_btf/task_newtask")
|
||||
int BPF_PROG(task_acquire_release_example, struct task_struct *task, u64 clone_flags)
|
||||
{
|
||||
struct task_struct *acquired;
|
||||
|
||||
acquired = bpf_task_acquire(task);
|
||||
|
||||
/*
|
||||
* In a typical program you'd do something like store
|
||||
* the task in a map, and the map will automatically
|
||||
* release it later. Here, we release it manually.
|
||||
*/
|
||||
bpf_task_release(acquired);
|
||||
return 0;
|
||||
}
|
||||
|
||||
----
|
||||
|
||||
A BPF program can also look up a task from a pid. This can be useful if the
|
||||
caller doesn't have a trusted pointer to a ``struct task_struct *`` object that
|
||||
it can acquire a reference on with bpf_task_acquire().
|
||||
|
||||
.. kernel-doc:: kernel/bpf/helpers.c
|
||||
:identifiers: bpf_task_from_pid
|
||||
|
||||
Here is an example of it being used:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
SEC("tp_btf/task_newtask")
|
||||
int BPF_PROG(task_get_pid_example, struct task_struct *task, u64 clone_flags)
|
||||
{
|
||||
struct task_struct *lookup;
|
||||
|
||||
lookup = bpf_task_from_pid(task->pid);
|
||||
if (!lookup)
|
||||
/* A task should always be found, as %task is a tracepoint arg. */
|
||||
return -ENOENT;
|
||||
|
||||
if (lookup->pid != task->pid) {
|
||||
/* bpf_task_from_pid() looks up the task via its
|
||||
* globally-unique pid from the init_pid_ns. Thus,
|
||||
* the pid of the lookup task should always be the
|
||||
* same as the input task.
|
||||
*/
|
||||
bpf_task_release(lookup);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* bpf_task_from_pid() returns an acquired reference,
|
||||
* so it must be dropped before returning from the
|
||||
* tracepoint handler.
|
||||
*/
|
||||
bpf_task_release(lookup);
|
||||
return 0;
|
||||
}
|
||||
|
||||
3.2 struct cgroup * kfuncs
|
||||
--------------------------
|
||||
|
||||
``struct cgroup *`` objects also have acquire and release functions:
|
||||
|
||||
.. kernel-doc:: kernel/bpf/helpers.c
|
||||
:identifiers: bpf_cgroup_acquire bpf_cgroup_release
|
||||
|
||||
These kfuncs are used in exactly the same manner as bpf_task_acquire() and
|
||||
bpf_task_release() respectively, so we won't provide examples for them.
|
||||
|
||||
----
|
||||
|
||||
You may also acquire a reference to a ``struct cgroup`` kptr that's already
|
||||
stored in a map using bpf_cgroup_kptr_get():
|
||||
|
||||
.. kernel-doc:: kernel/bpf/helpers.c
|
||||
:identifiers: bpf_cgroup_kptr_get
|
||||
|
||||
Here's an example of how it can be used:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/* struct containing the struct task_struct kptr which is actually stored in the map. */
|
||||
struct __cgroups_kfunc_map_value {
|
||||
struct cgroup __kptr_ref * cgroup;
|
||||
};
|
||||
|
||||
/* The map containing struct __cgroups_kfunc_map_value entries. */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, int);
|
||||
__type(value, struct __cgroups_kfunc_map_value);
|
||||
__uint(max_entries, 1);
|
||||
} __cgroups_kfunc_map SEC(".maps");
|
||||
|
||||
/* ... */
|
||||
|
||||
/**
|
||||
* A simple example tracepoint program showing how a
|
||||
* struct cgroup kptr that is stored in a map can
|
||||
* be acquired using the bpf_cgroup_kptr_get() kfunc.
|
||||
*/
|
||||
SEC("tp_btf/cgroup_mkdir")
|
||||
int BPF_PROG(cgroup_kptr_get_example, struct cgroup *cgrp, const char *path)
|
||||
{
|
||||
struct cgroup *kptr;
|
||||
struct __cgroups_kfunc_map_value *v;
|
||||
s32 id = cgrp->self.id;
|
||||
|
||||
/* Assume a cgroup kptr was previously stored in the map. */
|
||||
v = bpf_map_lookup_elem(&__cgroups_kfunc_map, &id);
|
||||
if (!v)
|
||||
return -ENOENT;
|
||||
|
||||
/* Acquire a reference to the cgroup kptr that's already stored in the map. */
|
||||
kptr = bpf_cgroup_kptr_get(&v->cgroup);
|
||||
if (!kptr)
|
||||
/* If no cgroup was present in the map, it's because
|
||||
* we're racing with another CPU that removed it with
|
||||
* bpf_kptr_xchg() between the bpf_map_lookup_elem()
|
||||
* above, and our call to bpf_cgroup_kptr_get().
|
||||
* bpf_cgroup_kptr_get() internally safely handles this
|
||||
* race, and will return NULL if the task is no longer
|
||||
* present in the map by the time we invoke the kfunc.
|
||||
*/
|
||||
return -EBUSY;
|
||||
|
||||
/* Free the reference we just took above. Note that the
|
||||
* original struct cgroup kptr is still in the map. It will
|
||||
* be freed either at a later time if another context deletes
|
||||
* it from the map, or automatically by the BPF subsystem if
|
||||
* it's still present when the map is destroyed.
|
||||
*/
|
||||
bpf_cgroup_release(kptr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
----
|
||||
|
||||
Another kfunc available for interacting with ``struct cgroup *`` objects is
|
||||
bpf_cgroup_ancestor(). This allows callers to access the ancestor of a cgroup,
|
||||
and return it as a cgroup kptr.
|
||||
|
||||
.. kernel-doc:: kernel/bpf/helpers.c
|
||||
:identifiers: bpf_cgroup_ancestor
|
||||
|
||||
Eventually, BPF should be updated to allow this to happen with a normal memory
|
||||
load in the program itself. This is currently not possible without more work in
|
||||
the verifier. bpf_cgroup_ancestor() can be used as follows:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/**
|
||||
* Simple tracepoint example that illustrates how a cgroup's
|
||||
* ancestor can be accessed using bpf_cgroup_ancestor().
|
||||
*/
|
||||
SEC("tp_btf/cgroup_mkdir")
|
||||
int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path)
|
||||
{
|
||||
struct cgroup *parent;
|
||||
|
||||
/* The parent cgroup resides at the level before the current cgroup's level. */
|
||||
parent = bpf_cgroup_ancestor(cgrp, cgrp->level - 1);
|
||||
if (!parent)
|
||||
return -ENOENT;
|
||||
|
||||
bpf_printk("Parent id is %d", parent->self.id);
|
||||
|
||||
/* Return the parent cgroup that was acquired above. */
|
||||
bpf_cgroup_release(parent);
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,5 +1,7 @@
|
||||
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||
|
||||
.. _libbpf:
|
||||
|
||||
libbpf
|
||||
======
|
||||
|
||||
@ -7,6 +9,7 @@ libbpf
|
||||
:maxdepth: 1
|
||||
|
||||
API Documentation <https://libbpf.readthedocs.io/en/latest/api.html>
|
||||
program_types
|
||||
libbpf_naming_convention
|
||||
libbpf_build
|
||||
|
||||
|
203
Documentation/bpf/libbpf/program_types.rst
Normal file
@ -0,0 +1,203 @@
|
||||
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||
|
||||
.. _program_types_and_elf:
|
||||
|
||||
Program Types and ELF Sections
|
||||
==============================
|
||||
|
||||
The table below lists the program types, their attach types where relevant and the ELF section
|
||||
names supported by libbpf for them. The ELF section names follow these rules:
|
||||
|
||||
- ``type`` is an exact match, e.g. ``SEC("socket")``
|
||||
- ``type+`` means it can be either exact ``SEC("type")`` or well-formed ``SEC("type/extras")``
|
||||
with a '``/``' separator between ``type`` and ``extras``.
|
||||
|
||||
When ``extras`` are specified, they provide details of how to auto-attach the BPF program. The
|
||||
format of ``extras`` depends on the program type, e.g. ``SEC("tracepoint/<category>/<name>")``
|
||||
for tracepoints or ``SEC("usdt/<path>:<provider>:<name>")`` for USDT probes. The extras are
|
||||
described in more detail in the footnotes.
|
||||
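For illustration only, the following hypothetical programs show both forms of
section name; the tracepoint category and name used here are just examples:

::

  /* Exact type match. */
  SEC("socket")
  int socket_prog(struct __sk_buff *skb)
  {
          return 0;
  }

  /* Type plus auto-attach extras after the '/' separator. */
  SEC("tracepoint/syscalls/sys_enter_openat")
  int tp_prog(void *ctx)
  {
          return 0;
  }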
|
||||
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| Program Type | Attach Type | ELF Section Name | Sleepable |
|
||||
+===========================================+========================================+==================================+===========+
|
||||
| ``BPF_PROG_TYPE_CGROUP_DEVICE`` | ``BPF_CGROUP_DEVICE`` | ``cgroup/dev`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_CGROUP_SKB`` | | ``cgroup/skb`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET_EGRESS`` | ``cgroup_skb/egress`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET_INGRESS`` | ``cgroup_skb/ingress`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_CGROUP_SOCKOPT`` | ``BPF_CGROUP_GETSOCKOPT`` | ``cgroup/getsockopt`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_SETSOCKOPT`` | ``cgroup/setsockopt`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET4_BIND`` | ``cgroup/bind4`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET4_CONNECT`` | ``cgroup/connect4`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET4_GETPEERNAME`` | ``cgroup/getpeername4`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET4_GETSOCKNAME`` | ``cgroup/getsockname4`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET6_BIND`` | ``cgroup/bind6`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET6_CONNECT`` | ``cgroup/connect6`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET6_GETPEERNAME`` | ``cgroup/getpeername6`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET6_GETSOCKNAME`` | ``cgroup/getsockname6`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_UDP4_RECVMSG`` | ``cgroup/recvmsg4`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_UDP4_SENDMSG`` | ``cgroup/sendmsg4`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_UDP6_RECVMSG`` | ``cgroup/recvmsg6`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_UDP6_SENDMSG`` | ``cgroup/sendmsg6`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET4_POST_BIND`` | ``cgroup/post_bind4`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET6_POST_BIND`` | ``cgroup/post_bind6`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET_SOCK_CREATE`` | ``cgroup/sock_create`` | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``cgroup/sock`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_CGROUP_INET_SOCK_RELEASE`` | ``cgroup/sock_release`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_CGROUP_SYSCTL`` | ``BPF_CGROUP_SYSCTL`` | ``cgroup/sysctl`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_EXT`` | | ``freplace+`` [#fentry]_ | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_FLOW_DISSECTOR`` | ``BPF_FLOW_DISSECTOR`` | ``flow_dissector`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_KPROBE`` | | ``kprobe+`` [#kprobe]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``kretprobe+`` [#kprobe]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``ksyscall+`` [#ksyscall]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``kretsyscall+`` [#ksyscall]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``uprobe+`` [#uprobe]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``uprobe.s+`` [#uprobe]_ | Yes |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``uretprobe+`` [#uprobe]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``uretprobe.s+`` [#uprobe]_ | Yes |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``usdt+`` [#usdt]_ | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``kretprobe.multi+`` [#kpmulti]_ | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_LSM`` | ``BPF_LSM_CGROUP`` | ``lsm_cgroup+`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_LSM_MAC`` | ``lsm+`` [#lsm]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``lsm.s+`` [#lsm]_ | Yes |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_LWT_IN`` | | ``lwt_in`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_LWT_OUT`` | | ``lwt_out`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_LWT_SEG6LOCAL`` | | ``lwt_seg6local`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_LWT_XMIT`` | | ``lwt_xmit`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_PERF_EVENT`` | | ``perf_event`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE`` | | ``raw_tp.w+`` [#rawtp]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``raw_tracepoint.w+`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_RAW_TRACEPOINT`` | | ``raw_tp+`` [#rawtp]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``raw_tracepoint+`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SCHED_ACT`` | | ``action`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SCHED_CLS`` | | ``classifier`` | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``tc`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SK_LOOKUP`` | ``BPF_SK_LOOKUP`` | ``sk_lookup`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SK_MSG`` | ``BPF_SK_MSG_VERDICT`` | ``sk_msg`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SK_REUSEPORT`` | ``BPF_SK_REUSEPORT_SELECT_OR_MIGRATE`` | ``sk_reuseport/migrate`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_SK_REUSEPORT_SELECT`` | ``sk_reuseport`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SK_SKB`` | | ``sk_skb`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_SK_SKB_STREAM_PARSER`` | ``sk_skb/stream_parser`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_SK_SKB_STREAM_VERDICT`` | ``sk_skb/stream_verdict`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SOCKET_FILTER`` | | ``socket`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SOCK_OPS`` | ``BPF_CGROUP_SOCK_OPS`` | ``sockops`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_STRUCT_OPS`` | | ``struct_ops+`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_SYSCALL`` | | ``syscall`` | Yes |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_TRACEPOINT`` | | ``tp+`` [#tp]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``tracepoint+`` [#tp]_ | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_MODIFY_RETURN`` | ``fmod_ret+`` [#fentry]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``fmod_ret.s+`` [#fentry]_ | Yes |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_TRACE_FENTRY`` | ``fentry+`` [#fentry]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``fentry.s+`` [#fentry]_ | Yes |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_TRACE_FEXIT`` | ``fexit+`` [#fentry]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``fexit.s+`` [#fentry]_ | Yes |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_TRACE_ITER`` | ``iter+`` [#iter]_ | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``iter.s+`` [#iter]_ | Yes |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_TRACE_RAW_TP`` | ``tp_btf+`` [#fentry]_ | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP_CPUMAP`` | ``xdp.frags/cpumap`` | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``xdp/cpumap`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_XDP_DEVMAP`` | ``xdp.frags/devmap`` | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``xdp/devmap`` | |
|
||||
+ +----------------------------------------+----------------------------------+-----------+
|
||||
| | ``BPF_XDP`` | ``xdp.frags`` | |
|
||||
+ + +----------------------------------+-----------+
|
||||
| | | ``xdp`` | |
|
||||
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
|
||||
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#fentry] The ``fentry`` attach format is ``fentry[.s]/<function>``.
|
||||
.. [#kprobe] The ``kprobe`` attach format is ``kprobe/<function>[+<offset>]``. Valid
|
||||
characters for ``function`` are ``a-zA-Z0-9_.`` and ``offset`` must be a valid
|
||||
non-negative integer.
|
||||
.. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/<syscall>``.
|
||||
.. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/<path>:<function>[+<offset>]``.
|
||||
.. [#usdt] The ``usdt`` attach format is ``usdt/<path>:<provider>:<name>``.
|
||||
.. [#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/<pattern>`` where ``pattern``
|
||||
supports ``*`` and ``?`` wildcards. Valid characters for pattern are
|
||||
``a-zA-Z0-9_.*?``.
|
||||
.. [#lsm] The ``lsm`` attachment format is ``lsm[.s]/<hook>``.
|
||||
.. [#rawtp] The ``raw_tp`` attach format is ``raw_tracepoint[.w]/<tracepoint>``.
|
||||
.. [#tp] The ``tracepoint`` attach format is ``tracepoint/<category>/<name>``.
|
||||
.. [#iter] The ``iter`` attach format is ``iter[.s]/<struct-name>``.
|
262
Documentation/bpf/map_array.rst
Normal file
@ -0,0 +1,262 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
================================================
|
||||
BPF_MAP_TYPE_ARRAY and BPF_MAP_TYPE_PERCPU_ARRAY
|
||||
================================================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_ARRAY`` was introduced in kernel version 3.19
|
||||
- ``BPF_MAP_TYPE_PERCPU_ARRAY`` was introduced in version 4.6
|
||||
|
||||
``BPF_MAP_TYPE_ARRAY`` and ``BPF_MAP_TYPE_PERCPU_ARRAY`` provide generic array
|
||||
storage. The key type is an unsigned 32-bit integer (4 bytes) and the map is
|
||||
of constant size. The size of the array is defined in ``max_entries`` at
|
||||
creation time. All array elements are pre-allocated and zero initialized when
|
||||
created. ``BPF_MAP_TYPE_PERCPU_ARRAY`` uses a different memory region for each
|
||||
CPU whereas ``BPF_MAP_TYPE_ARRAY`` uses the same memory region. The value
|
||||
stored can be of any size; however, all array elements are aligned to 8
|
||||
bytes.
|
||||
|
||||
Since kernel 5.5, memory mapping may be enabled for ``BPF_MAP_TYPE_ARRAY`` by
|
||||
setting the flag ``BPF_F_MMAPABLE``. The map definition is page-aligned and
|
||||
starts on the first page. Sufficient page-sized and page-aligned blocks of
|
||||
memory are allocated to store all array values, starting on the second page,
|
||||
which in some cases will result in over-allocation of memory. The benefit of
|
||||
using this is increased performance and ease of use since userspace programs
|
||||
would not be required to use helper functions to access and mutate data.
|
||||
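As a sketch of what this enables, a user space program might map such an array
directly instead of calling ``bpf_map_lookup_elem()``; ``map_fd`` is assumed to
refer to a ``BPF_MAP_TYPE_ARRAY`` of ``long`` values created with
``BPF_F_MMAPABLE``:

.. code-block:: c

    #include <sys/mman.h>

    long *map_array(int map_fd, __u32 max_entries)
    {
            size_t len = max_entries * sizeof(long);
            long *values;

            /* The kernel rounds the backing storage up to whole pages. */
            values = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                          map_fd, 0);
            if (values == MAP_FAILED)
                    return NULL;

            values[0] += 1;   /* direct update, no syscall required */
            return values;
    }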
|
||||
Usage
|
||||
=====
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
|
||||
|
||||
Array elements can be retrieved using the ``bpf_map_lookup_elem()`` helper.
|
||||
This helper returns a pointer into the array element, so to avoid data races
|
||||
with userspace reading the value, the user must use primitives like
|
||||
``__sync_fetch_and_add()`` when updating the value in-place.
|
||||
|
||||
bpf_map_update_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
|
||||
|
||||
Array elements can be updated using the ``bpf_map_update_elem()`` helper.
|
||||
|
||||
``bpf_map_update_elem()`` returns 0 on success, or negative error in case of
|
||||
failure.
|
||||
|
||||
Since the array is of constant size, ``bpf_map_delete_elem()`` is not supported.
|
||||
To clear an array element, you may use ``bpf_map_update_elem()`` to insert a
|
||||
zero value to that index.
|
||||
|
||||
Per CPU Array
|
||||
-------------
|
||||
|
||||
Values stored in ``BPF_MAP_TYPE_ARRAY`` can be accessed by multiple programs
|
||||
across different CPUs. To restrict storage to a single CPU, you may use a
|
||||
``BPF_MAP_TYPE_PERCPU_ARRAY``.
|
||||
|
||||
When using a ``BPF_MAP_TYPE_PERCPU_ARRAY`` the ``bpf_map_update_elem()`` and
|
||||
``bpf_map_lookup_elem()`` helpers automatically access the slot for the current
|
||||
CPU.
|
||||
|
||||
bpf_map_lookup_percpu_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu)
|
||||
|
||||
The ``bpf_map_lookup_percpu_elem()`` helper can be used to lookup the array
|
||||
value for a specific CPU. It returns the value on success, or ``NULL`` if no entry was
|
||||
found or ``cpu`` is invalid.
|
||||
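A minimal sketch of reading one CPU's slot from BPF code, assuming ``my_map``
is a ``BPF_MAP_TYPE_PERCPU_ARRAY`` of ``long`` values declared as in the
examples further below:

.. code-block:: c

    __u32 key = 0, cpu = 0;
    long *value;

    /* Read the slot that belongs to CPU 0 rather than the current CPU. */
    value = bpf_map_lookup_percpu_elem(&my_map, &key, cpu);
    if (value)
            bpf_printk("cpu0 count: %ld", *value);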
|
||||
Concurrency
|
||||
-----------
|
||||
|
||||
Since kernel version 5.1, the BPF infrastructure provides ``struct bpf_spin_lock``
|
||||
to synchronize access.
|
||||
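For illustration, and assuming the struct and map names below, an array value
can embed a ``struct bpf_spin_lock`` to serialize in-place updates:

.. code-block:: c

    struct counter {
            struct bpf_spin_lock lock;
            long value;
    };

    struct {
            __uint(type, BPF_MAP_TYPE_ARRAY);
            __type(key, u32);
            __type(value, struct counter);
            __uint(max_entries, 1);
    } counters SEC(".maps");

    /* Inside a BPF program: */
    static void bump(void)
    {
            u32 key = 0;
            struct counter *c;

            c = bpf_map_lookup_elem(&counters, &key);
            if (!c)
                    return;

            bpf_spin_lock(&c->lock);
            c->value++;
            bpf_spin_unlock(&c->lock);
    }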
|
||||
Userspace
|
||||
---------
|
||||
|
||||
Access from userspace uses libbpf APIs with the same names as above, with
|
||||
the map identified by its ``fd``.
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
Please see the ``tools/testing/selftests/bpf`` directory for functional
|
||||
examples. The code samples below demonstrate API usage.
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
This snippet shows how to declare an array in a BPF program.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, u32);
|
||||
__type(value, long);
|
||||
__uint(max_entries, 256);
|
||||
} my_map SEC(".maps");
|
||||
|
||||
|
||||
This example BPF program shows how to access an array element.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_prog(struct __sk_buff *skb)
|
||||
{
|
||||
struct iphdr ip;
|
||||
int index;
|
||||
long *value;
|
||||
|
||||
if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip, sizeof(ip)) < 0)
|
||||
return 0;
|
||||
|
||||
index = ip.protocol;
|
||||
value = bpf_map_lookup_elem(&my_map, &index);
|
||||
if (value)
|
||||
__sync_fetch_and_add(value, skb->len);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Userspace
|
||||
---------
|
||||
|
||||
BPF_MAP_TYPE_ARRAY
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This snippet shows how to create an array, using ``bpf_map_create_opts`` to
|
||||
set flags.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
#include <bpf/libbpf.h>
|
||||
#include <bpf/bpf.h>
|
||||
|
||||
int create_array()
|
||||
{
|
||||
int fd;
|
||||
LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);
|
||||
|
||||
fd = bpf_map_create(BPF_MAP_TYPE_ARRAY,
|
||||
"example_array", /* name */
|
||||
sizeof(__u32), /* key size */
|
||||
sizeof(long), /* value size */
|
||||
256, /* max entries */
|
||||
&opts); /* create opts */
|
||||
return fd;
|
||||
}
|
||||
|
||||
This snippet shows how to initialize the elements of an array.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int initialize_array(int fd)
|
||||
{
|
||||
__u32 i;
|
||||
long value;
|
||||
int ret;
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
value = i;
|
||||
ret = bpf_map_update_elem(fd, &i, &value, BPF_ANY);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
This snippet shows how to retrieve an element value from an array.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int lookup(int fd)
|
||||
{
|
||||
__u32 index = 42;
|
||||
long value;
|
||||
int ret;
|
||||
|
||||
ret = bpf_map_lookup_elem(fd, &index, &value);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
/* use value here */
|
||||
assert(value == 42);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
BPF_MAP_TYPE_PERCPU_ARRAY
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This snippet shows how to initialize the elements of a per CPU array.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int initialize_array(int fd)
|
||||
{
|
||||
int ncpus = libbpf_num_possible_cpus();
|
||||
long values[ncpus];
|
||||
__u32 i, j;
|
||||
int ret;
|
||||
|
||||
for (i = 0; i < 256 ; i++) {
|
||||
for (j = 0; j < ncpus; j++)
|
||||
values[j] = i;
|
||||
ret = bpf_map_update_elem(fd, &i, &values, BPF_ANY);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
This snippet shows how to access the per CPU elements of an array value.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int lookup(int fd)
|
||||
{
|
||||
int ncpus = libbpf_num_possible_cpus();
|
||||
__u32 index = 42, j;
|
||||
long values[ncpus];
|
||||
int ret;
|
||||
|
||||
ret = bpf_map_lookup_elem(fd, &index, &values);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
for (j = 0; j < ncpus; j++) {
|
||||
/* Use per CPU value here */
|
||||
assert(values[j] == 42);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
Semantics
|
||||
=========
|
||||
|
||||
As shown in the example above, when accessing a ``BPF_MAP_TYPE_PERCPU_ARRAY``
|
||||
in userspace, each value is an array with ``ncpus`` elements.
|
||||
|
||||
When calling ``bpf_map_update_elem()``, the flag ``BPF_NOEXIST`` cannot be used
|
||||
for these maps.
|
174
Documentation/bpf/map_bloom_filter.rst
Normal file
@ -0,0 +1,174 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
=========================
|
||||
BPF_MAP_TYPE_BLOOM_FILTER
|
||||
=========================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_BLOOM_FILTER`` was introduced in kernel version 5.16
|
||||
|
||||
``BPF_MAP_TYPE_BLOOM_FILTER`` provides a BPF bloom filter map. Bloom
|
||||
filters are a space-efficient probabilistic data structure used to
|
||||
quickly test whether an element exists in a set. In a bloom filter,
|
||||
false positives are possible whereas false negatives are not.
|
||||
|
||||
The bloom filter map does not have keys, only values. When the bloom
|
||||
filter map is created, it must be created with a ``key_size`` of 0. The
|
||||
bloom filter map supports two operations:
|
||||
|
||||
- push: adding an element to the map
|
||||
- peek: determining whether an element is present in the map
|
||||
|
||||
BPF programs must use ``bpf_map_push_elem`` to add an element to the
|
||||
bloom filter map and ``bpf_map_peek_elem`` to query the map. These
|
||||
operations are exposed to userspace applications using the existing
|
||||
``bpf`` syscall in the following way:
|
||||
|
||||
- ``BPF_MAP_UPDATE_ELEM`` -> push
|
||||
- ``BPF_MAP_LOOKUP_ELEM`` -> peek
|
||||
|
||||
The ``max_entries`` size that is specified at map creation time is used
|
||||
to approximate a reasonable bitmap size for the bloom filter, and is not
|
||||
otherwise strictly enforced. If the user wishes to insert more entries
|
||||
into the bloom filter than ``max_entries``, this may lead to a higher
|
||||
false positive rate.
|
||||
|
||||
The number of hashes to use for the bloom filter is configurable using
|
||||
the lower 4 bits of ``map_extra`` in ``union bpf_attr`` at map creation
|
||||
time. If no number is specified, the default used will be 5 hash
|
||||
functions. In general, using more hashes decreases both the false
|
||||
positive rate and the speed of a lookup.
|
||||
|
||||
It is not possible to delete elements from a bloom filter map. A bloom
|
||||
filter map may be used as an inner map. The user is responsible for
|
||||
synchronising concurrent updates and lookups to ensure no false negative
|
||||
lookups occur.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
bpf_map_push_elem()
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
|
||||
|
||||
A ``value`` can be added to a bloom filter using the
|
||||
``bpf_map_push_elem()`` helper. The ``flags`` parameter must be set to
|
||||
``BPF_ANY`` when adding an entry to the bloom filter. This helper
|
||||
returns ``0`` on success, or negative error in case of failure.
|
||||
|
||||
bpf_map_peek_elem()
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_peek_elem(struct bpf_map *map, void *value)
|
||||
|
||||
The ``bpf_map_peek_elem()`` helper is used to determine whether
|
||||
``value`` is present in the bloom filter map. This helper returns ``0``
|
||||
if ``value`` is probably present in the map, or ``-ENOENT`` if ``value``
|
||||
is definitely not present in the map.
|
||||
|
||||
Userspace
|
||||
---------
|
||||
|
||||
bpf_map_update_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_update_elem (int fd, const void *key, const void *value, __u64 flags)
|
||||
|
||||
A userspace program can add a ``value`` to a bloom filter using libbpf's
|
||||
``bpf_map_update_elem`` function. The ``key`` parameter must be set to
|
||||
``NULL`` and ``flags`` must be set to ``BPF_ANY``. Returns ``0`` on
|
||||
success, or negative error in case of failure.
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_lookup_elem (int fd, const void *key, void *value)
|
||||
|
||||
A userspace program can determine the presence of ``value`` in a bloom
|
||||
filter using libbpf's ``bpf_map_lookup_elem`` function. The ``key``
|
||||
parameter must be set to ``NULL``. Returns ``0`` if ``value`` is
|
||||
probably present in the map, or ``-ENOENT`` if ``value`` is definitely
|
||||
not present in the map.
|
||||
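As a sketch of the user space side, assuming ``bloom_fd`` refers to a bloom
filter map that stores ``__u32`` values:

.. code-block:: c

    /* Returns 1 if value is probably in the filter, 0 if definitely not. */
    int probably_contains(int bloom_fd, __u32 value)
    {
            return bpf_map_lookup_elem(bloom_fd, NULL, &value) == 0;
    }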
|
||||
Examples
|
||||
========
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
This snippet shows how to declare a bloom filter in a BPF program:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_BLOOM_FILTER);
|
||||
__type(value, __u32);
|
||||
__uint(max_entries, 1000);
|
||||
__uint(map_extra, 3);
|
||||
} bloom_filter SEC(".maps");
|
||||
|
||||
This snippet shows how to determine presence of a value in a bloom
|
||||
filter in a BPF program:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *lookup(__u32 key)
|
||||
{
|
||||
if (bpf_map_peek_elem(&bloom_filter, &key) == 0) {
|
||||
/* Verify not a false positive and fetch an associated
|
||||
* value using a secondary lookup, e.g. in a hash table
|
||||
*/
|
||||
return bpf_map_lookup_elem(&hash_table, &key);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
Userspace
|
||||
---------
|
||||
|
||||
This snippet shows how to use libbpf to create a bloom filter map from
|
||||
userspace:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int create_bloom()
|
||||
{
|
||||
LIBBPF_OPTS(bpf_map_create_opts, opts,
|
||||
.map_extra = 3); /* number of hashes */
|
||||
|
||||
return bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER,
|
||||
"ipv6_bloom", /* name */
|
||||
0, /* key size, must be zero */
|
||||
sizeof(ipv6_addr), /* value size */
|
||||
10000, /* max entries */
|
||||
&opts); /* create options */
|
||||
}
|
||||
|
||||
This snippet shows how to add an element to a bloom filter from
|
||||
userspace:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int add_element(struct bpf_map *bloom_map, __u32 value)
|
||||
{
|
||||
int bloom_fd = bpf_map__fd(bloom_map);
|
||||
return bpf_map_update_elem(bloom_fd, NULL, &value, BPF_ANY);
|
||||
}
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
https://lwn.net/ml/bpf/20210831225005.2762202-1-joannekoong@fb.com/
|
109
Documentation/bpf/map_cgrp_storage.rst
Normal file
@ -0,0 +1,109 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Meta Platforms, Inc. and affiliates.
|
||||
|
||||
=========================
|
||||
BPF_MAP_TYPE_CGRP_STORAGE
|
||||
=========================
|
||||
|
||||
The ``BPF_MAP_TYPE_CGRP_STORAGE`` map type represents a local fixed-size
|
||||
storage for cgroups. It is only available with ``CONFIG_CGROUPS``.
|
||||
The programs are made available by the same Kconfig. The
|
||||
data for a particular cgroup can be retrieved by looking up the map
|
||||
with that cgroup.
|
||||
|
||||
This document describes the usage and semantics of the
|
||||
``BPF_MAP_TYPE_CGRP_STORAGE`` map type.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
The map key must be ``sizeof(int)`` representing a cgroup fd.
|
||||
To access the storage in a program, use ``bpf_cgrp_storage_get``::
|
||||
|
||||
void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags)
|
||||
|
||||
``flags`` can be 0 or ``BPF_LOCAL_STORAGE_GET_F_CREATE``, which indicates that
a new local storage will be created if one does not exist.
|
||||
|
||||
The local storage can be removed with ``bpf_cgrp_storage_delete``::
|
||||
|
||||
long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup)
|
||||
|
||||
The map is available to all program types.
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
A BPF program example with BPF_MAP_TYPE_CGRP_STORAGE::
|
||||
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
__type(key, int);
|
||||
__type(value, long);
|
||||
} cgrp_storage SEC(".maps");
|
||||
|
||||
SEC("tp_btf/sys_enter")
|
||||
int BPF_PROG(on_enter, struct pt_regs *regs, long id)
|
||||
{
|
||||
struct task_struct *task = bpf_get_current_task_btf();
|
||||
long *ptr;
|
||||
|
||||
ptr = bpf_cgrp_storage_get(&cgrp_storage, task->cgroups->dfl_cgrp, 0,
|
||||
BPF_LOCAL_STORAGE_GET_F_CREATE);
|
||||
if (ptr)
|
||||
__sync_fetch_and_add(ptr, 1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Userspace accessing map declared above::
|
||||
|
||||
    #include <bpf/bpf.h>
    #include <bpf/libbpf.h>

    long map_lookup(struct bpf_map *map, int cgrp_fd)
    {
            long value;

            if (!bpf_map_lookup_elem(bpf_map__fd(map), &cgrp_fd, &value))
                    return value;
            return 0;
    }
|
||||
|
||||
Difference Between BPF_MAP_TYPE_CGRP_STORAGE and BPF_MAP_TYPE_CGROUP_STORAGE
|
||||
============================================================================
|
||||
|
||||
The old cgroup storage map ``BPF_MAP_TYPE_CGROUP_STORAGE`` has been marked as
|
||||
deprecated (renamed to ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED``). The new
|
||||
``BPF_MAP_TYPE_CGRP_STORAGE`` map should be used instead. The following
illustrates the main differences between ``BPF_MAP_TYPE_CGRP_STORAGE`` and
``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED``.
|
||||
|
||||
(1). ``BPF_MAP_TYPE_CGRP_STORAGE`` can be used by all program types while
|
||||
``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED`` is available only to cgroup program types
|
||||
such as ``BPF_CGROUP_INET_INGRESS`` or ``BPF_CGROUP_SOCK_OPS``.
|
||||
|
||||
(2). ``BPF_MAP_TYPE_CGRP_STORAGE`` supports local storage for more than one
|
||||
cgroup while ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED`` only supports one cgroup
|
||||
which is attached by a BPF program.
|
||||
|
||||
(3). ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED`` allocates local storage at attach time so
|
||||
``bpf_get_local_storage()`` always returns non-NULL local storage.
|
||||
``BPF_MAP_TYPE_CGRP_STORAGE`` allocates local storage at runtime so
|
||||
it is possible that ``bpf_cgrp_storage_get()`` returns NULL local storage.
To avoid this, user space can call ``bpf_map_update_elem()`` to pre-allocate
local storage before a BPF program is attached.
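
As a hedged sketch of such pre-allocation (the function and variable names here
are placeholders, and ``cgrp_fd`` is assumed to be an open cgroup directory fd)::

    #include <bpf/bpf.h>
    #include <bpf/libbpf.h>

    int prealloc_cgrp_storage(struct bpf_map *map, int cgrp_fd)
    {
            long init_value = 0;

            /* Create the storage for this cgroup before the program is attached */
            return bpf_map_update_elem(bpf_map__fd(map), &cgrp_fd, &init_value,
                                       BPF_NOEXIST);
    }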
|
||||
|
||||
(4). ``BPF_MAP_TYPE_CGRP_STORAGE`` supports deleting local storage by a BPF program
|
||||
while ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED`` only deletes storage during
|
||||
program detach time.
|
||||
|
||||
So overall, ``BPF_MAP_TYPE_CGRP_STORAGE`` supports all ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED``
|
||||
functionality and beyond. It is recommended to use ``BPF_MAP_TYPE_CGRP_STORAGE``
|
||||
instead of ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED``.
|
177
Documentation/bpf/map_cpumap.rst
Normal file
@ -0,0 +1,177 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
===================
|
||||
BPF_MAP_TYPE_CPUMAP
|
||||
===================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_CPUMAP`` was introduced in kernel version 4.15
|
||||
|
||||
.. kernel-doc:: kernel/bpf/cpumap.c
|
||||
:doc: cpu map
|
||||
|
||||
An example use case for this map type is software-based Receive Side Scaling (RSS).
|
||||
|
||||
The CPUMAP represents the CPUs in the system indexed as the map-key, and the
|
||||
map-value is the config setting (per CPUMAP entry). Each CPUMAP entry has a dedicated
|
||||
kernel thread bound to the given CPU to represent the remote CPU execution unit.
|
||||
|
||||
Starting from Linux kernel version 5.9 the CPUMAP can run a second XDP program
|
||||
on the remote CPU. This allows an XDP program to split its processing across
|
||||
multiple CPUs. For example, a scenario where the initial CPU (that sees/receives
|
||||
the packets) needs to do minimal packet processing and the remote CPU (to which
|
||||
the packet is directed) can afford to spend more cycles processing the frame. The
|
||||
initial CPU is where the XDP redirect program is executed. The remote CPU
|
||||
receives raw ``xdp_frame`` objects.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
bpf_redirect_map()
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
|
||||
|
||||
Redirect the packet to the endpoint referenced by ``map`` at index ``key``.
|
||||
For ``BPF_MAP_TYPE_CPUMAP`` this map contains references to CPUs.
|
||||
|
||||
The lower two bits of ``flags`` are used as the return code if the map lookup
|
||||
fails. This is so that the return value can be one of the XDP program return
|
||||
codes up to ``XDP_TX``, as chosen by the caller.
|
||||
|
||||
User space
|
||||
----------
|
||||
.. note::
|
||||
CPUMAP entries can only be updated/looked up/deleted from user space and not
|
||||
from an eBPF program. Trying to call these functions from a kernel eBPF
|
||||
program will result in the program failing to load and a verifier warning.
|
||||
|
||||
bpf_map_update_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags);
|
||||
|
||||
CPU entries can be added or updated using the ``bpf_map_update_elem()``
|
||||
helper. This helper replaces existing elements atomically. The ``value`` parameter
|
||||
can be ``struct bpf_cpumap_val``.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct bpf_cpumap_val {
|
||||
__u32 qsize; /* queue size to remote target CPU */
|
||||
union {
|
||||
int fd; /* prog fd on map write */
|
||||
__u32 id; /* prog id on map read */
|
||||
} bpf_prog;
|
||||
};
|
||||
|
||||
The ``flags`` argument can be one of the following:

- ``BPF_ANY``: Create a new element or update an existing element.
- ``BPF_NOEXIST``: Create a new element only if it did not exist.
- ``BPF_EXIST``: Update an existing element.
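
As an illustrative sketch (not taken from the kernel sources; the names are
placeholders), the following adds an entry for a given CPU with a chosen queue
size and no per-entry program:

.. code-block:: c

    int add_cpu_entry(int cpu_map_fd, __u32 cpu, __u32 queue_size)
    {
            /* bpf_prog.fd is left at zero, so no XDP program runs on the remote CPU */
            struct bpf_cpumap_val val = {
                    .qsize = queue_size,
            };

            return bpf_map_update_elem(cpu_map_fd, &cpu, &val, BPF_ANY);
    }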
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_lookup_elem(int fd, const void *key, void *value);
|
||||
|
||||
CPU entries can be retrieved using the ``bpf_map_lookup_elem()``
|
||||
helper.
|
||||
|
||||
bpf_map_delete_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_delete_elem(int fd, const void *key);
|
||||
|
||||
CPU entries can be deleted using the ``bpf_map_delete_elem()``
|
||||
helper. This helper will return 0 on success, or negative error in case of
|
||||
failure.
|
||||
|
||||
Examples
|
||||
========
|
||||
Kernel
|
||||
------
|
||||
|
||||
The following code snippet shows how to declare a ``BPF_MAP_TYPE_CPUMAP`` called
|
||||
``cpu_map`` and how to redirect packets to a remote CPU using a round robin scheme.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_CPUMAP);
|
||||
__type(key, __u32);
|
||||
__type(value, struct bpf_cpumap_val);
|
||||
__uint(max_entries, 12);
|
||||
} cpu_map SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, __u32);
|
||||
__type(value, __u32);
|
||||
__uint(max_entries, 12);
|
||||
} cpus_available SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__type(key, __u32);
|
||||
__type(value, __u32);
|
||||
__uint(max_entries, 1);
|
||||
} cpus_iterator SEC(".maps");
|
||||
|
||||
SEC("xdp")
|
||||
int xdp_redir_cpu_round_robin(struct xdp_md *ctx)
|
||||
{
|
||||
__u32 key = 0;
|
||||
__u32 cpu_dest = 0;
|
||||
__u32 *cpu_selected, *cpu_iterator;
|
||||
__u32 cpu_idx;
|
||||
|
||||
cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key);
|
||||
if (!cpu_iterator)
|
||||
return XDP_ABORTED;
|
||||
cpu_idx = *cpu_iterator;
|
||||
|
||||
*cpu_iterator += 1;
|
||||
if (*cpu_iterator == bpf_num_possible_cpus())
|
||||
*cpu_iterator = 0;
|
||||
|
||||
cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
|
||||
if (!cpu_selected)
|
||||
return XDP_ABORTED;
|
||||
cpu_dest = *cpu_selected;
|
||||
|
||||
if (cpu_dest >= bpf_num_possible_cpus())
|
||||
return XDP_ABORTED;
|
||||
|
||||
return bpf_redirect_map(&cpu_map, cpu_dest, 0);
|
||||
}
|
||||
|
||||
User space
|
||||
----------
|
||||
|
||||
The following code snippet shows how to dynamically set the ``max_entries`` of a
CPUMAP to the maximum number of CPUs available on the system.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int set_max_cpu_entries(struct bpf_map *cpu_map)
|
||||
{
|
||||
if (bpf_map__set_max_entries(cpu_map, libbpf_num_possible_cpus()) < 0) {
|
||||
fprintf(stderr, "Failed to set max entries for cpu_map map: %s",
|
||||
strerror(errno));
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
References
|
||||
===========
|
||||
|
||||
- https://developers.redhat.com/blog/2021/05/13/receive-side-scaling-rss-with-ebpf-and-cpumap#redirecting_into_a_cpumap
|
238
Documentation/bpf/map_devmap.rst
Normal file
@ -0,0 +1,238 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
=================================================
|
||||
BPF_MAP_TYPE_DEVMAP and BPF_MAP_TYPE_DEVMAP_HASH
|
||||
=================================================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_DEVMAP`` was introduced in kernel version 4.14
|
||||
- ``BPF_MAP_TYPE_DEVMAP_HASH`` was introduced in kernel version 5.4
|
||||
|
||||
``BPF_MAP_TYPE_DEVMAP`` and ``BPF_MAP_TYPE_DEVMAP_HASH`` are BPF maps primarily
|
||||
used as backend maps for the XDP BPF helper call ``bpf_redirect_map()``.
|
||||
``BPF_MAP_TYPE_DEVMAP`` is backed by an array that uses the key as the index
to look up a reference to a net device, while ``BPF_MAP_TYPE_DEVMAP_HASH`` is
backed by a hash table that uses a key to look up a reference to a net device.
|
||||
The user provides either <``key``/ ``ifindex``> or <``key``/ ``struct bpf_devmap_val``>
|
||||
pairs to update the maps with new net devices.
|
||||
|
||||
.. note::
|
||||
   - The key to a hash map doesn't have to be an ``ifindex``.
   - While ``BPF_MAP_TYPE_DEVMAP_HASH`` allows the net devices to be packed densely,
     it comes at the cost of hashing the key on every lookup.
|
||||
|
||||
The setup and packet enqueue/send code is shared between the two types of
|
||||
devmap; only the lookup and insertion differ.
|
||||
|
||||
Usage
|
||||
=====
|
||||
Kernel BPF
|
||||
----------
|
||||
bpf_redirect_map()
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
|
||||
|
||||
Redirect the packet to the endpoint referenced by ``map`` at index ``key``.
|
||||
For ``BPF_MAP_TYPE_DEVMAP`` and ``BPF_MAP_TYPE_DEVMAP_HASH`` this map contains
|
||||
references to net devices (for forwarding packets through other ports).
|
||||
|
||||
The lower two bits of *flags* are used as the return code if the map lookup
|
||||
fails. This is so that the return value can be one of the XDP program return
|
||||
codes up to ``XDP_TX``, as chosen by the caller. The higher bits of ``flags``
|
||||
can be set to ``BPF_F_BROADCAST`` or ``BPF_F_EXCLUDE_INGRESS`` as defined
|
||||
below.
|
||||
|
||||
With ``BPF_F_BROADCAST`` the packet will be broadcast to all the interfaces
|
||||
in the map, with ``BPF_F_EXCLUDE_INGRESS`` the ingress interface will be excluded
|
||||
from the broadcast.
|
||||
|
||||
.. note::
|
||||
- The key is ignored if BPF_F_BROADCAST is set.
|
||||
- The broadcast feature can also be used to implement multicast forwarding:
|
||||
simply create multiple DEVMAPs, each one corresponding to a single multicast group.
|
||||
|
||||
This helper will return ``XDP_REDIRECT`` on success, or the value of the two
|
||||
lower bits of the ``flags`` argument if the map lookup fails.
|
||||
|
||||
More information about redirection can be found in :doc:`redirect`.
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
|
||||
|
||||
Net device entries can be retrieved using the ``bpf_map_lookup_elem()``
|
||||
helper.
|
||||
|
||||
User space
|
||||
----------
|
||||
.. note::
|
||||
DEVMAP entries can only be updated/deleted from user space and not
|
||||
from an eBPF program. Trying to call these functions from a kernel eBPF
|
||||
program will result in the program failing to load and a verifier warning.
|
||||
|
||||
bpf_map_update_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags);
|
||||
|
||||
Net device entries can be added or updated using the ``bpf_map_update_elem()``
|
||||
helper. This helper replaces existing elements atomically. The ``value`` parameter
|
||||
can be ``struct bpf_devmap_val`` or a simple ``int ifindex`` for backwards
|
||||
compatibility.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct bpf_devmap_val {
|
||||
__u32 ifindex; /* device index */
|
||||
union {
|
||||
int fd; /* prog fd on map write */
|
||||
__u32 id; /* prog id on map read */
|
||||
} bpf_prog;
|
||||
};
|
||||
|
||||
The ``flags`` argument can be one of the following:
|
||||
- ``BPF_ANY``: Create a new element or update an existing element.
|
||||
- ``BPF_NOEXIST``: Create a new element only if it did not exist.
|
||||
- ``BPF_EXIST``: Update an existing element.
|
||||
|
||||
DEVMAPs can associate a program with a device entry by adding a ``bpf_prog.fd``
|
||||
to ``struct bpf_devmap_val``. Programs are run after ``XDP_REDIRECT`` and have
|
||||
access to both Rx device and Tx device. The program associated with the ``fd``
|
||||
must have type XDP with expected attach type ``xdp_devmap``.
|
||||
When a program is associated with a device index, the program is run on an
|
||||
``XDP_REDIRECT`` and before the buffer is added to the per-cpu queue. Examples
|
||||
of how to attach/use xdp_devmap progs can be found in the kernel selftests:
|
||||
|
||||
- ``tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c``
|
||||
- ``tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c``
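
The following sketch (not taken from the selftests; ``devmap_fd`` and
``devmap_prog_fd`` are placeholder names) shows how user space might associate
such a program with a devmap entry:

.. code-block:: c

    int add_devmap_entry_with_prog(int devmap_fd, __u32 key, int ifindex,
                                   int devmap_prog_fd)
    {
            /* The program referenced by bpf_prog.fd runs after XDP_REDIRECT,
             * before the frame is queued for the target device.
             */
            struct bpf_devmap_val val = {
                    .ifindex = ifindex,
                    .bpf_prog.fd = devmap_prog_fd,
            };

            return bpf_map_update_elem(devmap_fd, &key, &val, BPF_ANY);
    }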
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_lookup_elem(int fd, const void *key, void *value);
|
||||
|
||||
Net device entries can be retrieved using the ``bpf_map_lookup_elem()``
|
||||
helper.
|
||||
|
||||
bpf_map_delete_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_delete_elem(int fd, const void *key);
|
||||
|
||||
Net device entries can be deleted using the ``bpf_map_delete_elem()``
|
||||
helper. This helper will return 0 on success, or negative error in case of
|
||||
failure.
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
The following code snippet shows how to declare a ``BPF_MAP_TYPE_DEVMAP``
|
||||
called ``tx_port``.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_DEVMAP);
|
||||
__type(key, __u32);
|
||||
__type(value, __u32);
|
||||
__uint(max_entries, 256);
|
||||
} tx_port SEC(".maps");
|
||||
|
||||
The following code snippet shows how to declare a ``BPF_MAP_TYPE_DEVMAP_HASH``
|
||||
called ``forward_map``.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
|
||||
__type(key, __u32);
|
||||
__type(value, struct bpf_devmap_val);
|
||||
__uint(max_entries, 32);
|
||||
} forward_map SEC(".maps");
|
||||
|
||||
.. note::
|
||||
|
||||
   The value type in the ``BPF_MAP_TYPE_DEVMAP_HASH`` above is a ``struct bpf_devmap_val``.
|
||||
|
||||
The following code snippet shows a simple xdp_redirect_map program. This program
|
||||
would work with a user space program that populates the devmap ``forward_map`` based
|
||||
on ingress ifindexes. The BPF program (below) is redirecting packets using the
|
||||
ingress ``ifindex`` as the ``key``.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
SEC("xdp")
|
||||
int xdp_redirect_map_func(struct xdp_md *ctx)
|
||||
{
|
||||
int index = ctx->ingress_ifindex;
|
||||
|
||||
return bpf_redirect_map(&forward_map, index, 0);
|
||||
}
|
||||
|
||||
The following code snippet shows a BPF program that is broadcasting packets to
|
||||
all the interfaces in the ``tx_port`` devmap.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
SEC("xdp")
|
||||
int xdp_redirect_map_func(struct xdp_md *ctx)
|
||||
{
|
||||
return bpf_redirect_map(&tx_port, 0, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
|
||||
}
|
||||
|
||||
User space
|
||||
----------
|
||||
|
||||
The following code snippet shows how to update a devmap called ``tx_port``.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
    int update_devmap(int ifindex, int redirect_ifindex)
    {
            int ret;

            ret = bpf_map_update_elem(bpf_map__fd(tx_port), &ifindex, &redirect_ifindex, 0);
            if (ret < 0) {
                    fprintf(stderr, "Failed to update devmap value: %s\n",
                            strerror(errno));
            }

            return ret;
    }
|
||||
|
||||
The following code snippet shows how to update a ``BPF_MAP_TYPE_DEVMAP_HASH`` called ``forward_map``.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
    int update_devmap(int ifindex, int redirect_ifindex)
    {
            struct bpf_devmap_val devmap_val = { .ifindex = redirect_ifindex };
            int ret;

            ret = bpf_map_update_elem(bpf_map__fd(forward_map), &ifindex, &devmap_val, 0);
            if (ret < 0) {
                    fprintf(stderr, "Failed to update devmap value: %s\n",
                            strerror(errno));
            }
            return ret;
    }
|
||||
|
||||
References
|
||||
===========
|
||||
|
||||
- https://lwn.net/Articles/728146/
|
||||
- https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=6f9d451ab1a33728adb72d7ff66a7b374d665176
|
||||
- https://elixir.bootlin.com/linux/latest/source/net/core/filter.c#L4106
|
@ -34,7 +34,14 @@ the ``BPF_F_NO_COMMON_LRU`` flag when calling ``bpf_map_create``.
|
||||
Usage
|
||||
=====
|
||||
|
||||
.. c:function::
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
bpf_map_update_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
|
||||
|
||||
Hash entries can be added or updated using the ``bpf_map_update_elem()``
|
||||
@ -49,14 +56,22 @@ parameter can be used to control the update behaviour:
|
||||
``bpf_map_update_elem()`` returns 0 on success, or negative error in
|
||||
case of failure.
|
||||
|
||||
.. c:function::
|
||||
bpf_map_lookup_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
|
||||
|
||||
Hash entries can be retrieved using the ``bpf_map_lookup_elem()``
|
||||
helper. This helper returns a pointer to the value associated with
|
||||
``key``, or ``NULL`` if no entry was found.
|
||||
|
||||
.. c:function::
|
||||
bpf_map_delete_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_delete_elem(struct bpf_map *map, const void *key)
|
||||
|
||||
Hash entries can be deleted using the ``bpf_map_delete_elem()``
|
||||
@ -70,7 +85,11 @@ For ``BPF_MAP_TYPE_PERCPU_HASH`` and ``BPF_MAP_TYPE_LRU_PERCPU_HASH``
|
||||
the ``bpf_map_update_elem()`` and ``bpf_map_lookup_elem()`` helpers
|
||||
automatically access the hash slot for the current CPU.
|
||||
|
||||
.. c:function::
|
||||
bpf_map_lookup_percpu_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu)
|
||||
|
||||
The ``bpf_map_lookup_percpu_elem()`` helper can be used to lookup the
|
||||
@ -89,7 +108,11 @@ See ``tools/testing/selftests/bpf/progs/test_spin_lock.c``.
|
||||
Userspace
|
||||
---------
|
||||
|
||||
.. c:function::
|
||||
bpf_map_get_next_key()
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_get_next_key(int fd, const void *cur_key, void *next_key)
|
||||
|
||||
In userspace, it is possible to iterate through the keys of a hash using
|
||||
|
197
Documentation/bpf/map_lpm_trie.rst
Normal file
@ -0,0 +1,197 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
=====================
|
||||
BPF_MAP_TYPE_LPM_TRIE
|
||||
=====================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_LPM_TRIE`` was introduced in kernel version 4.11
|
||||
|
||||
``BPF_MAP_TYPE_LPM_TRIE`` provides a longest prefix match algorithm that
|
||||
can be used to match IP addresses to a stored set of prefixes.
|
||||
Internally, data is stored in an unbalanced trie of nodes that uses
|
||||
``prefixlen,data`` pairs as its keys. The ``data`` is interpreted in
|
||||
network byte order, i.e. big endian, so ``data[0]`` stores the most
|
||||
significant byte.
|
||||
|
||||
LPM tries may be created with a maximum prefix length that is a multiple
|
||||
of 8, in the range from 8 to 2048. The key used for lookup and update
|
||||
operations is a ``struct bpf_lpm_trie_key``, extended by
|
||||
``max_prefixlen/8`` bytes.
|
||||
|
||||
- For IPv4 addresses the data length is 4 bytes
|
||||
- For IPv6 addresses the data length is 16 bytes
|
||||
|
||||
The value type stored in the LPM trie can be any user defined type.
|
||||
|
||||
.. note::
|
||||
When creating a map of type ``BPF_MAP_TYPE_LPM_TRIE`` you must set the
|
||||
``BPF_F_NO_PREALLOC`` flag.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
|
||||
|
||||
The longest prefix entry for a given data value can be found using the
|
||||
``bpf_map_lookup_elem()`` helper. This helper returns a pointer to the
|
||||
value associated with the longest matching ``key``, or ``NULL`` if no
|
||||
entry was found.
|
||||
|
||||
The ``key`` should have ``prefixlen`` set to ``max_prefixlen`` when
|
||||
performing longest prefix lookups. For example, when searching for the
|
||||
longest prefix match for an IPv4 address, ``prefixlen`` should be set to
|
||||
``32``.
|
||||
|
||||
bpf_map_update_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
|
||||
|
||||
Prefix entries can be added or updated using the ``bpf_map_update_elem()``
|
||||
helper. This helper replaces existing elements atomically.
|
||||
|
||||
``bpf_map_update_elem()`` returns ``0`` on success, or negative error in
|
||||
case of failure.
|
||||
|
||||
.. note::
|
||||
The flags parameter must be one of BPF_ANY, BPF_NOEXIST or BPF_EXIST,
|
||||
but the value is ignored, giving BPF_ANY semantics.
|
||||
|
||||
bpf_map_delete_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_delete_elem(struct bpf_map *map, const void *key)
|
||||
|
||||
Prefix entries can be deleted using the ``bpf_map_delete_elem()``
|
||||
helper. This helper will return 0 on success, or negative error in case
|
||||
of failure.
|
||||
|
||||
Userspace
|
||||
---------
|
||||
|
||||
Access from userspace uses libbpf APIs with the same names as above, with
|
||||
the map identified by ``fd``.
|
||||
|
||||
bpf_map_get_next_key()
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_get_next_key (int fd, const void *cur_key, void *next_key)
|
||||
|
||||
A userspace program can iterate through the entries in an LPM trie using
|
||||
libbpf's ``bpf_map_get_next_key()`` function. The first key can be
|
||||
fetched by calling ``bpf_map_get_next_key()`` with ``cur_key`` set to
|
||||
``NULL``. Subsequent calls will fetch the next key that follows the
|
||||
current key. ``bpf_map_get_next_key()`` returns ``0`` on success,
|
||||
``-ENOENT`` if ``cur_key`` is the last key in the trie, or negative
|
||||
error in case of failure.
|
||||
|
||||
``bpf_map_get_next_key()`` will iterate through the LPM trie elements
starting from the leftmost leaf. This means that iteration will return more
specific keys before less specific ones.
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
Please see ``tools/testing/selftests/bpf/test_lpm_map.c`` for examples
|
||||
of LPM trie usage from userspace. The code snippets below demonstrate
|
||||
API usage.
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
The following BPF code snippet shows how to declare a new LPM trie for IPv4
|
||||
address prefixes:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
struct ipv4_lpm_key {
|
||||
__u32 prefixlen;
|
||||
__u32 data;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_LPM_TRIE);
|
||||
__type(key, struct ipv4_lpm_key);
|
||||
__type(value, __u32);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
__uint(max_entries, 255);
|
||||
} ipv4_lpm_map SEC(".maps");
|
||||
|
||||
The following BPF code snippet shows how to lookup by IPv4 address:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *lookup(__u32 ipaddr)
|
||||
{
|
||||
struct ipv4_lpm_key key = {
|
||||
.prefixlen = 32,
|
||||
.data = ipaddr
|
||||
};
|
||||
|
||||
return bpf_map_lookup_elem(&ipv4_lpm_map, &key);
|
||||
}
|
||||
|
||||
Userspace
|
||||
---------
|
||||
|
||||
The following snippet shows how to insert an IPv4 prefix entry into an
|
||||
LPM trie:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int add_prefix_entry(int lpm_fd, __u32 addr, __u32 prefixlen, struct value *value)
|
||||
{
|
||||
struct ipv4_lpm_key ipv4_key = {
|
||||
.prefixlen = prefixlen,
|
||||
.data = addr
|
||||
};
|
||||
return bpf_map_update_elem(lpm_fd, &ipv4_key, value, BPF_ANY);
|
||||
}
|
||||
|
||||
The following snippet shows a userspace program walking through the entries
|
||||
of an LPM trie:
|
||||
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
#include <bpf/libbpf.h>
|
||||
#include <bpf/bpf.h>
|
||||
|
||||
void iterate_lpm_trie(int map_fd)
|
||||
{
|
||||
struct ipv4_lpm_key *cur_key = NULL;
|
||||
struct ipv4_lpm_key next_key;
|
||||
struct value value;
|
||||
int err;
|
||||
|
||||
for (;;) {
|
||||
err = bpf_map_get_next_key(map_fd, cur_key, &next_key);
|
||||
if (err)
|
||||
break;
|
||||
|
||||
bpf_map_lookup_elem(map_fd, &next_key, &value);
|
||||
|
||||
/* Use key and value here */
|
||||
|
||||
cur_key = &next_key;
|
||||
}
|
||||
}
|
130
Documentation/bpf/map_of_maps.rst
Normal file
@ -0,0 +1,130 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
========================================================
|
||||
BPF_MAP_TYPE_ARRAY_OF_MAPS and BPF_MAP_TYPE_HASH_OF_MAPS
|
||||
========================================================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_ARRAY_OF_MAPS`` and ``BPF_MAP_TYPE_HASH_OF_MAPS`` were
|
||||
introduced in kernel version 4.12
|
||||
|
||||
``BPF_MAP_TYPE_ARRAY_OF_MAPS`` and ``BPF_MAP_TYPE_HASH_OF_MAPS`` provide general
|
||||
purpose support for map in map storage. One level of nesting is supported, where
|
||||
an outer map contains instances of a single type of inner map, for example
|
||||
``array_of_maps->sock_map``.
|
||||
|
||||
When creating an outer map, an inner map instance is used to initialize the
|
||||
metadata that the outer map holds about its inner maps. This inner map has a
|
||||
separate lifetime from the outer map and can be deleted after the outer map has
|
||||
been created.
|
||||
|
||||
The outer map supports element lookup, update and delete from user space using
|
||||
the syscall API. A BPF program is only allowed to do element lookup in the outer
|
||||
map.
|
||||
|
||||
.. note::
|
||||
- Multi-level nesting is not supported.
|
||||
- Any BPF map type can be used as an inner map, except for
|
||||
``BPF_MAP_TYPE_PROG_ARRAY``.
|
||||
- A BPF program cannot update or delete outer map entries.
|
||||
|
||||
For ``BPF_MAP_TYPE_ARRAY_OF_MAPS`` the key is an unsigned 32-bit integer index
|
||||
into the array. The array is a fixed size with ``max_entries`` elements that are
|
||||
zero initialized when created.
|
||||
|
||||
For ``BPF_MAP_TYPE_HASH_OF_MAPS`` the key type can be chosen when defining the
|
||||
map. The kernel is responsible for allocating and freeing key/value pairs, up to
|
||||
the ``max_entries`` limit that you specify. Hash maps use pre-allocation of hash
table elements by default. The ``BPF_F_NO_PREALLOC`` flag can be used to disable
pre-allocation when the memory overhead is too high.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Kernel BPF Helper
|
||||
-----------------
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
|
||||
|
||||
Inner maps can be retrieved using the ``bpf_map_lookup_elem()`` helper. This
|
||||
helper returns a pointer to the inner map, or ``NULL`` if no entry was found.
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
Kernel BPF Example
|
||||
------------------
|
||||
|
||||
This snippet shows how to create and initialise an array of devmaps in a BPF
|
||||
program. Note that the outer array can only be modified from user space using
|
||||
the syscall API.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct inner_map {
|
||||
__uint(type, BPF_MAP_TYPE_DEVMAP);
|
||||
__uint(max_entries, 10);
|
||||
__type(key, __u32);
|
||||
__type(value, __u32);
|
||||
} inner_map1 SEC(".maps"), inner_map2 SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
|
||||
__uint(max_entries, 2);
|
||||
__type(key, __u32);
|
||||
__array(values, struct inner_map);
|
||||
} outer_map SEC(".maps") = {
|
||||
.values = { &inner_map1,
|
||||
&inner_map2 }
|
||||
};
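
A minimal sketch of how a BPF program might use the ``outer_map`` declared above
(looking up an inner DEVMAP and redirecting through it; the index values are
arbitrary):

.. code-block:: c

    SEC("xdp")
    int redirect_via_inner(struct xdp_md *ctx)
    {
            __u32 outer_key = 0;
            __u32 inner_key = 0;
            void *inner_map;

            /* Fetch the inner DEVMAP stored at index 0 of the outer array */
            inner_map = bpf_map_lookup_elem(&outer_map, &outer_key);
            if (!inner_map)
                    return XDP_DROP;

            /* Redirect through entry 0 of the inner DEVMAP, dropping on failure */
            return bpf_redirect_map(inner_map, inner_key, XDP_DROP);
    }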
|
||||
|
||||
See ``progs/test_btf_map_in_map.c`` in ``tools/testing/selftests/bpf`` for more
|
||||
examples of declarative initialisation of outer maps.
|
||||
|
||||
User Space
|
||||
----------
|
||||
|
||||
This snippet shows how to create an array based outer map:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int create_outer_array(int inner_fd) {
|
||||
LIBBPF_OPTS(bpf_map_create_opts, opts, .inner_map_fd = inner_fd);
|
||||
int fd;
|
||||
|
||||
fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS,
|
||||
"example_array", /* name */
|
||||
sizeof(__u32), /* key size */
|
||||
sizeof(__u32), /* value size */
|
||||
256, /* max entries */
|
||||
&opts); /* create opts */
|
||||
return fd;
|
||||
}
|
||||
|
||||
|
||||
This snippet shows how to add an inner map to an outer map:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int add_devmap(int outer_fd, int index, const char *name) {
|
||||
int fd;
|
||||
|
||||
fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP, name,
|
||||
sizeof(__u32), sizeof(__u32), 256, NULL);
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
|
||||
return bpf_map_update_elem(outer_fd, &index, &fd, BPF_ANY);
|
||||
}
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
- https://lore.kernel.org/netdev/20170322170035.923581-3-kafai@fb.com/
|
||||
- https://lore.kernel.org/netdev/20170322170035.923581-4-kafai@fb.com/
|
146
Documentation/bpf/map_queue_stack.rst
Normal file
@ -0,0 +1,146 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
=========================================
|
||||
BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_STACK
|
||||
=========================================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_QUEUE`` and ``BPF_MAP_TYPE_STACK`` were introduced
|
||||
in kernel version 4.20
|
||||
|
||||
``BPF_MAP_TYPE_QUEUE`` provides FIFO storage and ``BPF_MAP_TYPE_STACK``
|
||||
provides LIFO storage for BPF programs. These maps support peek, pop and
|
||||
push operations that are exposed to BPF programs through the respective
|
||||
helpers. These operations are exposed to userspace applications using
|
||||
the existing ``bpf`` syscall in the following way:
|
||||
|
||||
- ``BPF_MAP_LOOKUP_ELEM`` -> peek
|
||||
- ``BPF_MAP_LOOKUP_AND_DELETE_ELEM`` -> pop
|
||||
- ``BPF_MAP_UPDATE_ELEM`` -> push
|
||||
|
||||
``BPF_MAP_TYPE_QUEUE`` and ``BPF_MAP_TYPE_STACK`` do not support
|
||||
``BPF_F_NO_PREALLOC``.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
bpf_map_push_elem()
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
|
||||
|
||||
An element ``value`` can be added to a queue or stack using the
|
||||
``bpf_map_push_elem`` helper. The ``flags`` parameter must be set to
|
||||
``BPF_ANY`` or ``BPF_EXIST``. If ``flags`` is set to ``BPF_EXIST`` then,
|
||||
when the queue or stack is full, the oldest element will be removed to
|
||||
make room for ``value`` to be added. Returns ``0`` on success, or
|
||||
negative error in case of failure.
|
||||
|
||||
bpf_map_peek_elem()
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_peek_elem(struct bpf_map *map, void *value)
|
||||
|
||||
This helper fetches an element ``value`` from a queue or stack without
|
||||
removing it. Returns ``0`` on success, or negative error in case of
|
||||
failure.
|
||||
|
||||
bpf_map_pop_elem()
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_map_pop_elem(struct bpf_map *map, void *value)
|
||||
|
||||
This helper removes the head element from a queue or stack and copies it
into ``value``. Returns ``0`` on success, or negative error in case of failure.
|
||||
|
||||
|
||||
Userspace
|
||||
---------
|
||||
|
||||
bpf_map_update_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_update_elem (int fd, const void *key, const void *value, __u64 flags)
|
||||
|
||||
A userspace program can push ``value`` onto a queue or stack using libbpf's
|
||||
``bpf_map_update_elem`` function. The ``key`` parameter must be set to
|
||||
``NULL`` and ``flags`` must be set to ``BPF_ANY`` or ``BPF_EXIST``, with the
|
||||
same semantics as the ``bpf_map_push_elem`` kernel helper. Returns ``0`` on
|
||||
success, or negative error in case of failure.
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_lookup_elem (int fd, const void *key, void *value)
|
||||
|
||||
A userspace program can peek at the ``value`` at the head of a queue or stack
|
||||
using the libbpf ``bpf_map_lookup_elem`` function. The ``key`` parameter must be
|
||||
set to ``NULL``. Returns ``0`` on success, or negative error in case of
|
||||
failure.
|
||||
|
||||
bpf_map_lookup_and_delete_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_lookup_and_delete_elem (int fd, const void *key, void *value)
|
||||
|
||||
A userspace program can pop a ``value`` from the head of a queue or stack using
|
||||
the libbpf ``bpf_map_lookup_and_delete_elem`` function. The ``key`` parameter
|
||||
must be set to ``NULL``. Returns ``0`` on success, or negative error in case of
|
||||
failure.
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
This snippet shows how to declare a queue in a BPF program:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_QUEUE);
|
||||
__type(value, __u32);
|
||||
__uint(max_entries, 10);
|
||||
} queue SEC(".maps");
|
||||
|
||||
|
||||
Userspace
|
||||
---------
|
||||
|
||||
This snippet shows how to use libbpf's low-level API to create a queue from
|
||||
userspace:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int create_queue()
|
||||
{
|
||||
return bpf_map_create(BPF_MAP_TYPE_QUEUE,
|
||||
"sample_queue", /* name */
|
||||
0, /* key size, must be zero */
|
||||
sizeof(__u32), /* value size */
|
||||
10, /* max entries */
|
||||
NULL); /* create options */
|
||||
}
|
||||
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
https://lwn.net/ml/netdev/153986858555.9127.14517764371945179514.stgit@kernel/
|
155
Documentation/bpf/map_sk_storage.rst
Normal file
@ -0,0 +1,155 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
=======================
|
||||
BPF_MAP_TYPE_SK_STORAGE
|
||||
=======================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_SK_STORAGE`` was introduced in kernel version 5.2
|
||||
|
||||
``BPF_MAP_TYPE_SK_STORAGE`` is used to provide socket-local storage for BPF
|
||||
programs. A map of type ``BPF_MAP_TYPE_SK_STORAGE`` declares the type of storage
|
||||
to be provided and acts as the handle for accessing the socket-local
|
||||
storage. The values for maps of type ``BPF_MAP_TYPE_SK_STORAGE`` are stored
|
||||
locally with each socket instead of with the map. The kernel is responsible for
|
||||
allocating storage for a socket when requested and for freeing the storage when
|
||||
either the map or the socket is deleted.
|
||||
|
||||
.. note::
|
||||
- The key type must be ``int`` and ``max_entries`` must be set to ``0``.
|
||||
- The ``BPF_F_NO_PREALLOC`` flag must be used when creating a map for
|
||||
socket-local storage.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
bpf_sk_storage_get()
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
|
||||
|
||||
Socket-local storage can be retrieved using the ``bpf_sk_storage_get()``
|
||||
helper. The helper gets the storage from ``sk`` that is associated with ``map``.
|
||||
If the ``BPF_LOCAL_STORAGE_GET_F_CREATE`` flag is used then
|
||||
``bpf_sk_storage_get()`` will create the storage for ``sk`` if it does not
|
||||
already exist. ``value`` can be used together with
|
||||
``BPF_LOCAL_STORAGE_GET_F_CREATE`` to initialize the storage value, otherwise it
|
||||
will be zero initialized. Returns a pointer to the storage on success, or
|
||||
``NULL`` in case of failure.
|
||||
|
||||
.. note::
|
||||
- ``sk`` is a kernel ``struct sock`` pointer for LSM or tracing programs.
|
||||
- ``sk`` is a ``struct bpf_sock`` pointer for other program types.
|
||||
|
||||
bpf_sk_storage_delete()
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
|
||||
|
||||
Socket-local storage can be deleted using the ``bpf_sk_storage_delete()``
|
||||
helper. The helper deletes the storage from ``sk`` that is identified by
|
||||
``map``. Returns ``0`` on success, or negative error in case of failure.
|
||||
|
||||
User space
|
||||
----------
|
||||
|
||||
bpf_map_update_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_update_elem(int map_fd, const void *key, const void *value, __u64 flags)
|
||||
|
||||
Socket-local storage for the socket identified by ``key`` belonging to
|
||||
``map_fd`` can be added or updated using the ``bpf_map_update_elem()`` libbpf
|
||||
function. ``key`` must be a pointer to a valid ``fd`` in the user space
|
||||
program. The ``flags`` parameter can be used to control the update behaviour:
|
||||
|
||||
- ``BPF_ANY`` will create storage for ``fd`` or update existing storage.
|
||||
- ``BPF_NOEXIST`` will create storage for ``fd`` only if it did not already
|
||||
exist, otherwise the call will fail with ``-EEXIST``.
|
||||
- ``BPF_EXIST`` will update existing storage for ``fd`` if it already exists,
|
||||
otherwise the call will fail with ``-ENOENT``.
|
||||
|
||||
Returns ``0`` on success, or negative error in case of failure.
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_lookup_elem(int map_fd, const void *key, void *value)
|
||||
|
||||
Socket-local storage for the socket identified by ``key`` belonging to
|
||||
``map_fd`` can be retrieved using the ``bpf_map_lookup_elem()`` libbpf
|
||||
function. ``key`` must be a pointer to a valid ``fd`` in the user space
|
||||
program. Returns ``0`` on success, or negative error in case of failure.
|
||||
|
||||
bpf_map_delete_elem()
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_delete_elem(int map_fd, const void *key)
|
||||
|
||||
Socket-local storage for the socket identified by ``key`` belonging to
|
||||
``map_fd`` can be deleted using the ``bpf_map_delete_elem()`` libbpf
|
||||
function. Returns ``0`` on success, or negative error in case of failure.
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
|
||||
This snippet shows how to declare socket-local storage in a BPF program:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
__type(key, int);
|
||||
__type(value, struct my_storage);
|
||||
} socket_storage SEC(".maps");
|
||||
|
||||
This snippet shows how to retrieve socket-local storage in a BPF program:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
SEC("sockops")
|
||||
int _sockops(struct bpf_sock_ops *ctx)
|
||||
{
|
||||
struct my_storage *storage;
|
||||
struct bpf_sock *sk;
|
||||
|
||||
sk = ctx->sk;
|
||||
if (!sk)
|
||||
return 1;
|
||||
|
||||
storage = bpf_sk_storage_get(&socket_storage, sk, 0,
|
||||
BPF_LOCAL_STORAGE_GET_F_CREATE);
|
||||
if (!storage)
|
||||
return 1;
|
||||
|
||||
/* Use 'storage' here */
|
||||
|
||||
return 1;
|
||||
}
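
As an additional sketch (names are placeholders), user space can read the storage
for a socket it owns by passing that socket's file descriptor as the key:

.. code-block:: c

    int read_socket_storage(struct bpf_map *map, int sock_fd,
                            struct my_storage *out)
    {
            /* The key is a socket fd owned by the calling process */
            return bpf_map_lookup_elem(bpf_map__fd(map), &sock_fd, out);
    }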
|
||||
|
||||
|
||||
Please see the ``tools/testing/selftests/bpf`` directory for functional
|
||||
examples.
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
https://lwn.net/ml/netdev/20190426171103.61892-1-kafai@fb.com/
|
192
Documentation/bpf/map_xskmap.rst
Normal file
@ -0,0 +1,192 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
===================
|
||||
BPF_MAP_TYPE_XSKMAP
|
||||
===================
|
||||
|
||||
.. note::
|
||||
- ``BPF_MAP_TYPE_XSKMAP`` was introduced in kernel version 4.18
|
||||
|
||||
The ``BPF_MAP_TYPE_XSKMAP`` is used as a backend map for XDP BPF helper
|
||||
call ``bpf_redirect_map()`` and ``XDP_REDIRECT`` action, like 'devmap' and 'cpumap'.
|
||||
This map type redirects raw XDP frames to `AF_XDP`_ sockets (XSKs), a new type of
|
||||
address family in the kernel that allows redirection of frames from a driver to
|
||||
user space without having to traverse the full network stack. An AF_XDP socket
|
||||
binds to a single netdev queue. A mapping of XSKs to queues is shown below:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
+---------------------------------------------------+
|
||||
| xsk A | xsk B | xsk C |<---+ User space
|
||||
=========================================================|==========
|
||||
| Queue 0 | Queue 1 | Queue 2 | | Kernel
|
||||
+---------------------------------------------------+ |
|
||||
| Netdev eth0 | |
|
||||
+---------------------------------------------------+ |
|
||||
| +=============+ | |
|
||||
| | key | xsk | | |
|
||||
| +---------+ +=============+ | |
|
||||
| | | | 0 | xsk A | | |
|
||||
| | | +-------------+ | |
|
||||
| | | | 1 | xsk B | | |
|
||||
| | BPF |-- redirect -->+-------------+-------------+
|
||||
| | prog | | 2 | xsk C | |
|
||||
| | | +-------------+ |
|
||||
| | | |
|
||||
| | | |
|
||||
| +---------+ |
|
||||
| |
|
||||
+---------------------------------------------------+
|
||||
|
||||
.. note::
|
||||
An AF_XDP socket that is bound to a certain <netdev/queue_id> will *only*
|
||||
accept XDP frames from that <netdev/queue_id>. If an XDP program tries to redirect
|
||||
from a <netdev/queue_id> other than what the socket is bound to, the frame will
|
||||
not be received on the socket.
|
||||
|
||||
Typically an XSKMAP is created per netdev. This map contains an array of XSK File
|
||||
Descriptors (FDs). The number of array elements is typically set or adjusted using
|
||||
the ``max_entries`` map parameter. For AF_XDP ``max_entries`` is equal to the number
|
||||
of queues supported by the netdev.
|
||||
|
||||
.. note::
|
||||
Both the map key and map value size must be 4 bytes.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Kernel BPF
|
||||
----------
|
||||
bpf_redirect_map()
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
|
||||
|
||||
Redirect the packet to the endpoint referenced by ``map`` at index ``key``.
|
||||
For ``BPF_MAP_TYPE_XSKMAP`` this map contains references to XSK FDs
|
||||
for sockets attached to a netdev's queues.
|
||||
|
||||
.. note::
|
||||
If the map is empty at an index, the packet is dropped. This means that it is
|
||||
necessary to have an XDP program loaded with at least one XSK in the
|
||||
XSKMAP to be able to get any traffic to user space through the socket.
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
|
||||
|
||||
XSK entry references of type ``struct xdp_sock *`` can be retrieved using the
|
||||
``bpf_map_lookup_elem()`` helper.
|
||||
|
||||
User space
|
||||
----------
|
||||
.. note::
|
||||
XSK entries can only be updated/deleted from user space and not from
|
||||
a BPF program. Trying to call these functions from a kernel BPF program will
|
||||
result in the program failing to load and a verifier warning.
|
||||
|
||||
bpf_map_update_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags)
|
||||
|
||||
XSK entries can be added or updated using the ``bpf_map_update_elem()``
|
||||
helper. The ``key`` parameter is equal to the queue_id of the queue the XSK
is attaching to, and the ``value`` parameter is the FD of that socket.
|
||||
|
||||
Under the hood, the XSKMAP update function uses the XSK FD value to retrieve the
|
||||
associated ``struct xdp_sock`` instance.
|
||||
|
||||
The ``flags`` argument can be one of the following:

- ``BPF_ANY``: Create a new element or update an existing element.
- ``BPF_NOEXIST``: Create a new element only if it did not exist.
- ``BPF_EXIST``: Update an existing element.
|
||||
|
||||
bpf_map_lookup_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_lookup_elem(int fd, const void *key, void *value)
|
||||
|
||||
Returns ``struct xdp_sock *`` or negative error in case of failure.
|
||||
|
||||
bpf_map_delete_elem()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
.. code-block:: c
|
||||
|
||||
int bpf_map_delete_elem(int fd, const void *key)
|
||||
|
||||
XSK entries can be deleted using the ``bpf_map_delete_elem()``
|
||||
helper. This helper will return 0 on success, or negative error in case of
|
||||
failure.
|
||||
|
||||
.. note::
|
||||
When `libxdp`_ deletes an XSK it also removes the associated socket
|
||||
entry from the XSKMAP.
|
||||
|
||||
Examples
|
||||
========
|
||||
Kernel
|
||||
------
|
||||
|
||||
The following code snippet shows how to declare a ``BPF_MAP_TYPE_XSKMAP`` called
|
||||
``xsks_map`` and how to redirect packets to an XSK.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_XSKMAP);
|
||||
__type(key, __u32);
|
||||
__type(value, __u32);
|
||||
__uint(max_entries, 64);
|
||||
} xsks_map SEC(".maps");
|
||||
|
||||
|
||||
SEC("xdp")
|
||||
int xsk_redir_prog(struct xdp_md *ctx)
|
||||
{
|
||||
__u32 index = ctx->rx_queue_index;
|
||||
|
||||
if (bpf_map_lookup_elem(&xsks_map, &index))
|
||||
return bpf_redirect_map(&xsks_map, index, 0);
|
||||
return XDP_PASS;
|
||||
}
|
||||
|
||||
User space
|
||||
----------
|
||||
|
||||
The following code snippet shows how to update an XSKMAP with an XSK entry.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
int update_xsks_map(struct bpf_map *xsks_map, int queue_id, int xsk_fd)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = bpf_map_update_elem(bpf_map__fd(xsks_map), &queue_id, &xsk_fd, 0);
|
||||
if (ret < 0)
|
||||
fprintf(stderr, "Failed to update xsks_map: %s\n", strerror(errno));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
For an example of how to create AF_XDP sockets, please see the AF_XDP-example and
AF_XDP-forwarding programs in the `bpf-examples`_ directory in the `libxdp`_ repository.
For a detailed explanation of the AF_XDP interface please see:
|
||||
|
||||
- `libxdp-readme`_.
|
||||
- `AF_XDP`_ kernel documentation.
|
||||
|
||||
.. note::
|
||||
The most comprehensive resource for using XSKMAPs and AF_XDP is `libxdp`_.
|
||||
|
||||
.. _libxdp: https://github.com/xdp-project/xdp-tools/tree/master/lib/libxdp
|
||||
.. _AF_XDP: https://www.kernel.org/doc/html/latest/networking/af_xdp.html
|
||||
.. _bpf-examples: https://github.com/xdp-project/bpf-examples
|
||||
.. _libxdp-readme: https://github.com/xdp-project/xdp-tools/tree/master/lib/libxdp#using-af_xdp-sockets
|
@ -1,46 +1,19 @@
|
||||
|
||||
=========
|
||||
eBPF maps
|
||||
=========
|
||||
========
|
||||
BPF maps
|
||||
========
|
||||
|
||||
'maps' is a generic storage of different types for sharing data between kernel
|
||||
and userspace.
|
||||
BPF 'maps' provide generic storage of different types for sharing data between
|
||||
kernel and user space. There are several storage types available, including
|
||||
hash, array, bloom filter and radix-tree. Several of the map types exist to
|
||||
support specific BPF helpers that perform actions based on the map contents. The
|
||||
maps are accessed from BPF programs via BPF helpers which are documented in the
|
||||
`man-pages`_ for `bpf-helpers(7)`_.
|
||||
|
||||
The maps are accessed from user space via BPF syscall, which has commands:
|
||||
|
||||
- create a map with given type and attributes
|
||||
``map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)``
|
||||
using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
|
||||
returns process-local file descriptor or negative error
|
||||
|
||||
- lookup key in a given map
|
||||
``err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)``
|
||||
using attr->map_fd, attr->key, attr->value
|
||||
returns zero and stores found elem into value or negative error
|
||||
|
||||
- create or update key/value pair in a given map
|
||||
``err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)``
|
||||
using attr->map_fd, attr->key, attr->value
|
||||
returns zero or negative error
|
||||
|
||||
- find and delete element by key in a given map
|
||||
``err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)``
|
||||
using attr->map_fd, attr->key
|
||||
|
||||
- to delete map: close(fd)
|
||||
Exiting process will delete maps automatically
|
||||
|
||||
userspace programs use this syscall to create/access maps that eBPF programs
|
||||
are concurrently updating.
|
||||
|
||||
maps can have different types: hash, array, bloom filter, radix-tree, etc.
|
||||
|
||||
The map is defined by:
|
||||
|
||||
- type
|
||||
- max number of elements
|
||||
- key size in bytes
|
||||
- value size in bytes
|
||||
BPF maps are accessed from user space via the ``bpf`` syscall, which provides
|
||||
commands to create maps, lookup elements, update elements and delete
|
||||
elements. More details of the BPF syscall are available in
|
||||
:doc:`/userspace-api/ebpf/syscall` and in the `man-pages`_ for `bpf(2)`_.
|
||||
|
||||
Map Types
|
||||
=========
|
||||
@ -49,4 +22,60 @@ Map Types
|
||||
:maxdepth: 1
|
||||
:glob:
|
||||
|
||||
map_*
|
||||
map_*
|
||||
|
||||
Usage Notes
|
||||
===========
|
||||
|
||||
.. c:function::
|
||||
int bpf(int command, union bpf_attr *attr, u32 size)
|
||||
|
||||
Use the ``bpf()`` system call to perform the operation specified by
|
||||
``command``. The operation takes parameters provided in ``attr``. The ``size``
|
||||
argument is the size of the ``union bpf_attr`` in ``attr``.
|
||||
|
||||
**BPF_MAP_CREATE**
|
||||
|
||||
Create a map with the desired type and attributes in ``attr``:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
    int fd;
    union bpf_attr attr = {
            .map_type = BPF_MAP_TYPE_ARRAY,  /* mandatory */
            .key_size = sizeof(__u32),       /* mandatory */
            .value_size = sizeof(__u32),     /* mandatory */
            .max_entries = 256,              /* mandatory */
            .map_flags = BPF_F_MMAPABLE,
            .map_name = "example_array",
    };

    fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
|
||||
|
||||
Returns a process-local file descriptor on success, or negative error in case of
|
||||
failure. The map can be deleted by calling ``close(fd)``. Maps held by open
|
||||
file descriptors will be deleted automatically when a process exits.
|
||||
|
||||
.. note:: Valid characters for ``map_name`` are ``A-Z``, ``a-z``, ``0-9``,
|
||||
``'_'`` and ``'.'``.
|
||||
|
||||
**BPF_MAP_LOOKUP_ELEM**
|
||||
|
||||
Lookup key in a given map using ``attr->map_fd``, ``attr->key``,
|
||||
``attr->value``. Returns zero and stores found elem into ``attr->value`` on
|
||||
success, or negative error on failure.
|
||||
|
||||
**BPF_MAP_UPDATE_ELEM**
|
||||
|
||||
Create or update key/value pair in a given map using ``attr->map_fd``, ``attr->key``,
|
||||
``attr->value``. Returns zero on success or negative error on failure.
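
As an illustration, a minimal sketch (reusing the ``fd`` from the create example
above) that stores value ``1`` at key ``0``:

.. code-block:: c

    __u32 key = 0, value = 1;
    union bpf_attr update_attr = {
            .map_fd = fd,
            .key    = (__u64)(unsigned long)&key,
            .value  = (__u64)(unsigned long)&value,
            .flags  = BPF_ANY,
    };
    int err = bpf(BPF_MAP_UPDATE_ELEM, &update_attr, sizeof(update_attr));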
|
||||
|
||||
**BPF_MAP_DELETE_ELEM**
|
||||
|
||||
Find and delete element by key in a given map using ``attr->map_fd``,
|
||||
``attr->key``. Returns zero on success or negative error on failure.
|
||||
|
||||
.. Links:
|
||||
.. _man-pages: https://www.kernel.org/doc/man-pages/
|
||||
.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html
|
||||
.. _bpf-helpers(7): https://man7.org/linux/man-pages/man7/bpf-helpers.7.html
|
||||
|
@ -7,3 +7,6 @@ Program Types
|
||||
:glob:
|
||||
|
||||
prog_*
|
||||
|
||||
For a list of all program types, see :ref:`program_types_and_elf` in
|
||||
the :ref:`libbpf` documentation.
|
||||
|
81
Documentation/bpf/redirect.rst
Normal file
@ -0,0 +1,81 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
.. Copyright (C) 2022 Red Hat, Inc.
|
||||
|
||||
========
|
||||
Redirect
|
||||
========
|
||||
XDP_REDIRECT
|
||||
############
|
||||
Supported maps
|
||||
--------------
|
||||
|
||||
XDP_REDIRECT works with the following map types:
|
||||
|
||||
- ``BPF_MAP_TYPE_DEVMAP``
|
||||
- ``BPF_MAP_TYPE_DEVMAP_HASH``
|
||||
- ``BPF_MAP_TYPE_CPUMAP``
|
||||
- ``BPF_MAP_TYPE_XSKMAP``
|
||||
|
||||
For more information on these maps, please see the specific map documentation.
|
||||
|
||||
Process
|
||||
-------
|
||||
|
||||
.. kernel-doc:: net/core/filter.c
|
||||
:doc: xdp redirect
|
||||
|
||||
.. note::
|
||||
Not all drivers support transmitting frames after a redirect, and for
|
||||
those that do, not all of them support non-linear frames. Non-linear xdp
|
||||
bufs/frames are bufs/frames that contain more than one fragment.
|
||||
|
||||
Debugging packet drops
|
||||
----------------------
|
||||
Silent packet drops for XDP_REDIRECT can be debugged using:
|
||||
|
||||
- bpf_trace
|
||||
- perf record
|
||||
|
||||
bpf_trace
|
||||
^^^^^^^^^
|
||||
The following bpftrace command can be used to capture and count all XDP tracepoints:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
sudo bpftrace -e 'tracepoint:xdp:* { @cnt[probe] = count(); }'
|
||||
Attaching 12 probes...
|
||||
^C
|
||||
|
||||
@cnt[tracepoint:xdp:mem_connect]: 18
|
||||
@cnt[tracepoint:xdp:mem_disconnect]: 18
|
||||
@cnt[tracepoint:xdp:xdp_exception]: 19605
|
||||
@cnt[tracepoint:xdp:xdp_devmap_xmit]: 1393604
|
||||
@cnt[tracepoint:xdp:xdp_redirect]: 22292200
|
||||
|
||||
.. note::
|
||||
The various XDP tracepoints can be found in ``include/trace/events/xdp.h``.
|
||||
|
||||
The following bpftrace command can be used to extract the ``ERRNO`` being returned as
|
||||
part of the err parameter:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
sudo bpftrace -e \
|
||||
'tracepoint:xdp:xdp_redirect*_err {@redir_errno[-args->err] = count();}
|
||||
tracepoint:xdp:xdp_devmap_xmit {@devmap_errno[-args->err] = count();}'
|
||||
|
||||
perf record
|
||||
^^^^^^^^^^^
|
||||
The perf tool also supports recording tracepoints:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
perf record -a -e xdp:xdp_redirect_err \
|
||||
-e xdp:xdp_redirect_map_err \
|
||||
-e xdp:xdp_exception \
|
||||
-e xdp:xdp_devmap_xmit
|
||||
|
||||
References
|
||||
===========
|
||||
|
||||
- https://github.com/xdp-project/xdp-tutorial/tree/master/tracing02-xdp-monitor
|
@ -194,6 +194,24 @@ finally:
|
||||
else:
|
||||
version = release = "unknown version"
|
||||
|
||||
#
|
||||
# HACK: there seems to be no easy way for us to get at the version and
|
||||
# release information passed in from the makefile...so go pawing through the
|
||||
# command-line options and find it for ourselves.
|
||||
#
|
||||
def get_cline_version():
|
||||
c_version = c_release = ''
|
||||
for arg in sys.argv:
|
||||
if arg.startswith('version='):
|
||||
c_version = arg[8:]
|
||||
elif arg.startswith('release='):
|
||||
c_release = arg[8:]
|
||||
if c_version:
|
||||
if c_release:
|
||||
return c_version + '-' + c_release
|
||||
return c_version
|
||||
return version # Whatever we came up with before
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
#
|
||||
@ -247,7 +265,7 @@ highlight_language = 'none'
|
||||
# a list of builtin themes.
|
||||
|
||||
# Default theme
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
html_theme = 'alabaster'
|
||||
html_css_files = []
|
||||
|
||||
if "DOCS_THEME" in os.environ:
|
||||
@ -278,8 +296,12 @@ if html_theme == 'sphinx_rtd_theme' or html_theme == 'sphinx_rtd_dark_mode':
|
||||
# Add color-specific RTD normal mode
|
||||
html_css_files.append('theme_rtd_colors.css')
|
||||
|
||||
html_theme_options = {
|
||||
'navigation_depth': -1,
|
||||
}
|
||||
|
||||
except ImportError:
|
||||
html_theme = 'classic'
|
||||
html_theme = 'alabaster'
|
||||
|
||||
if "DOCS_CSS" in os.environ:
|
||||
css = os.environ["DOCS_CSS"].split(" ")
|
||||
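For orientation, here is a minimal sketch (with purely illustrative names) of
how a test case function is wrapped in ``struct kunit_case`` and grouped into a
``struct kunit_suite``:

.. code-block:: c

    #include <kunit/test.h>

    /* A test case: a function taking only the struct kunit context. */
    static void example_add_test(struct kunit *test)
    {
        KUNIT_EXPECT_EQ(test, 3, 1 + 2);
    }

    /* Each case is wrapped in a struct kunit_case via KUNIT_CASE(). */
    static struct kunit_case example_test_cases[] = {
        KUNIT_CASE(example_add_test),
        {}
    };

    /* Cases are grouped into a suite, which can then be run. */
    static struct kunit_suite example_test_suite = {
        .name = "example",
        .test_cases = example_test_cases,
    };
    kunit_test_suite(example_test_suite);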
@ -295,127 +317,29 @@ if major <= 1 and minor < 8:
|
||||
for l in html_css_files:
|
||||
html_context['css_files'].append('_static/' + l)
|
||||
|
||||
if html_theme == 'classic':
|
||||
if html_theme == 'alabaster':
|
||||
html_theme_options = {
|
||||
'rightsidebar': False,
|
||||
'stickysidebar': True,
|
||||
'collapsiblesidebar': True,
|
||||
'externalrefs': False,
|
||||
|
||||
'footerbgcolor': "white",
|
||||
'footertextcolor': "white",
|
||||
'sidebarbgcolor': "white",
|
||||
'sidebarbtncolor': "black",
|
||||
'sidebartextcolor': "black",
|
||||
'sidebarlinkcolor': "#686bff",
|
||||
'relbarbgcolor': "#133f52",
|
||||
'relbartextcolor': "white",
|
||||
'relbarlinkcolor': "white",
|
||||
'bgcolor': "white",
|
||||
'textcolor': "black",
|
||||
'headbgcolor': "#f2f2f2",
|
||||
'headtextcolor': "#20435c",
|
||||
'headlinkcolor': "#c60f0f",
|
||||
'linkcolor': "#355f7c",
|
||||
'visitedlinkcolor': "#355f7c",
|
||||
'codebgcolor': "#3f3f3f",
|
||||
'codetextcolor': "white",
|
||||
|
||||
'bodyfont': "serif",
|
||||
'headfont': "sans-serif",
|
||||
'description': get_cline_version(),
|
||||
'page_width': '65em',
|
||||
'sidebar_width': '15em',
|
||||
'font_size': 'inherit',
|
||||
'font_family': 'serif',
|
||||
}
|
||||
|
||||
sys.stderr.write("Using %s theme\n" % html_theme)
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
#html_theme_options = {}
|
||||
|
||||
# Add any paths that contain custom themes here, relative to this directory.
|
||||
#html_theme_path = []
|
||||
|
||||
# The name for this set of Sphinx documents. If None, it defaults to
|
||||
# "<project> v<release> documentation".
|
||||
#html_title = None
|
||||
|
||||
# A shorter title for the navigation bar. Default is the same as html_title.
|
||||
#html_short_title = None
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top
|
||||
# of the sidebar.
|
||||
#html_logo = None
|
||||
|
||||
# The name of an image file (within the static path) to use as favicon of the
|
||||
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
||||
# pixels large.
|
||||
#html_favicon = None
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['sphinx-static']
|
||||
|
||||
# Add any extra paths that contain custom files (such as robots.txt or
|
||||
# .htaccess) here, relative to this directory. These files are copied
|
||||
# directly to the root of the documentation.
|
||||
#html_extra_path = []
|
||||
|
||||
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
||||
# using the given strftime format.
|
||||
#html_last_updated_fmt = '%b %d, %Y'
|
||||
|
||||
# If true, SmartyPants will be used to convert quotes and dashes to
|
||||
# typographically correct entities.
|
||||
html_use_smartypants = False
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
# Note that the RTD theme ignores this.
|
||||
html_sidebars = { '**': ['searchbox.html', 'localtoc.html', 'sourcelink.html']}
|
||||
|
||||
# Additional templates that should be rendered to pages, maps page names to
|
||||
# template names.
|
||||
#html_additional_pages = {}
|
||||
|
||||
# If false, no module index is generated.
|
||||
#html_domain_indices = True
|
||||
|
||||
# If false, no index is generated.
|
||||
#html_use_index = True
|
||||
|
||||
# If true, the index is split into individual pages for each letter.
|
||||
#html_split_index = False
|
||||
|
||||
# If true, links to the reST sources are added to the pages.
|
||||
#html_show_sourcelink = True
|
||||
|
||||
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
||||
#html_show_sphinx = True
|
||||
|
||||
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
||||
#html_show_copyright = True
|
||||
|
||||
# If true, an OpenSearch description file will be output, and all pages will
|
||||
# contain a <link> tag referring to it. The value of this option must be the
|
||||
# base URL from which the finished HTML is served.
|
||||
#html_use_opensearch = ''
|
||||
|
||||
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
||||
#html_file_suffix = None
|
||||
|
||||
# Language to be used for generating the HTML full-text search index.
|
||||
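Here is a minimal sketch of a parameterized test using the array-based
generator macro; the names and parameter values are illustrative only:

.. code-block:: c

    #include <kunit/test.h>

    static const int example_params[] = { 1, 2, 4, 8 };

    /* Generates example_gen_params(), which walks the array above. */
    KUNIT_ARRAY_PARAM(example, example_params, NULL);

    static void example_param_test(struct kunit *test)
    {
        const int *param = test->param_value;

        KUNIT_EXPECT_GT(test, *param, 0);
    }

    static struct kunit_case example_param_cases[] = {
        /* The test runs once per parameter produced by the generator. */
        KUNIT_CASE_PARAM(example_param_test, example_gen_params),
        {}
    };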
# Sphinx supports the following languages:
|
||||
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja'
|
||||
# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr'
|
||||
#html_search_language = 'en'
|
||||
|
||||
# A dictionary with options for the search language support, empty by default.
|
||||
# Now only 'ja' uses this config value
|
||||
#html_search_options = {'type': 'default'}
|
||||
|
||||
# The name of a javascript file (relative to the configuration directory) that
|
||||
# implements a search results scorer. If empty, the default will be used.
|
||||
#html_search_scorer = 'scorer.js'
|
||||
# Note that the RTD theme ignores this
|
||||
html_sidebars = { '**': ["about.html", 'searchbox.html', 'localtoc.html', 'sourcelink.html']}
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = 'TheLinuxKerneldoc'
|
||||
@ -558,19 +482,6 @@ texinfo_documents = [
|
||||
'Miscellaneous'),
|
||||
]
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#texinfo_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#texinfo_domain_indices = True
|
||||
|
||||
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
||||
#texinfo_show_urls = 'footnote'
|
||||
|
||||
# If true, do not generate a @detailmenu in the "Top" node's menu.
|
||||
#texinfo_no_detailmenu = False
|
||||
|
||||
|
||||
# -- Options for Epub output ----------------------------------------------
|
||||
|
||||
# Bibliographic Dublin Core info.
|
||||
@ -579,67 +490,9 @@ epub_author = author
|
||||
epub_publisher = author
|
||||
epub_copyright = copyright
|
||||
|
||||
# The basename for the epub file. It defaults to the project name.
|
||||
#epub_basename = project
|
||||
|
||||
# The HTML theme for the epub output. Since the default themes are not
|
||||
# optimized for small screen space, using the same theme for HTML and epub
|
||||
# output is usually not wise. This defaults to 'epub', a theme designed to save
|
||||
# visual space.
|
||||
#epub_theme = 'epub'
|
||||
|
||||
# The language of the text. It defaults to the language option
|
||||
# or 'en' if the language is not set.
|
||||
#epub_language = ''
|
||||
|
||||
# The scheme of the identifier. Typical schemes are ISBN or URL.
|
||||
#epub_scheme = ''
|
||||
|
||||
# The unique identifier of the text. This can be a ISBN number
|
||||
# or the project homepage.
|
||||
#epub_identifier = ''
|
||||
|
||||
# A unique identification for the text.
|
||||
#epub_uid = ''
|
||||
|
||||
# A tuple containing the cover image and cover page html template filenames.
|
||||
#epub_cover = ()
|
||||
|
||||
# A sequence of (type, uri, title) tuples for the guide element of content.opf.
|
||||
#epub_guide = ()
|
||||
|
||||
# HTML files that should be inserted before the pages created by sphinx.
|
||||
# The format is a list of tuples containing the path and title.
|
||||
#epub_pre_files = []
|
||||
|
||||
# HTML files that should be inserted after the pages created by sphinx.
|
||||
# The format is a list of tuples containing the path and title.
|
||||
#epub_post_files = []
|
||||
|
||||
# A list of files that should not be packed into the epub file.
|
||||
epub_exclude_files = ['search.html']
|
||||
|
||||
# The depth of the table of contents in toc.ncx.
|
||||
#epub_tocdepth = 3
|
||||
|
||||
# Allow duplicate toc entries.
|
||||
#epub_tocdup = True
|
||||
|
||||
# Choose between 'default' and 'includehidden'.
|
||||
#epub_tocscope = 'default'
|
||||
|
||||
# Fix unsupported image types using the Pillow.
|
||||
#epub_fix_images = False
|
||||
|
||||
# Scale large images.
|
||||
#epub_max_image_width = 0
|
||||
|
||||
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
||||
#epub_show_urls = 'inline'
|
||||
|
||||
# If false, no index is generated.
|
||||
#epub_use_index = True
|
||||
|
||||
#=======
|
||||
# rst2pdf
|
||||
#
|
||||
|
@ -36,6 +36,9 @@ String Conversions
|
||||
String Manipulation
|
||||
-------------------
|
||||
|
||||
.. kernel-doc:: include/linux/fortify-string.h
|
||||
:internal:
|
||||
|
||||
.. kernel-doc:: lib/string.c
|
||||
:export:
|
||||
|
||||
@ -171,9 +174,6 @@ Division Functions
|
||||
.. kernel-doc:: include/linux/math64.h
|
||||
:internal:
|
||||
|
||||
.. kernel-doc:: lib/math/div64.c
|
||||
:functions: div_s64_rem div64_u64_rem div64_u64 div64_s64
|
||||
|
||||
.. kernel-doc:: lib/math/gcd.c
|
||||
:export:
|
||||
|
||||
|
@ -191,7 +191,7 @@ Here is a sample module which implements a basic per cpu counter using
|
||||
|
||||
static void __exit test_exit(void)
|
||||
{
|
||||
del_timer_sync(&test_timer);
|
||||
timer_shutdown_sync(&test_timer);
|
||||
}
|
||||
|
||||
module_init(test_init);
|
||||
|
@ -20,18 +20,15 @@ Author: Dominik Brodowski <linux@brodo.de>
|
||||
|
||||
Mailing List
|
||||
------------
|
||||
There is a CPU frequency changing CVS commit and general list where
|
||||
you can report bugs, problems or submit patches. To post a message,
|
||||
send an email to linux-pm@vger.kernel.org.
|
||||
There is a CPU frequency general list where you can report bugs,
|
||||
problems or submit patches. To post a message, send an email to
|
||||
linux-pm@vger.kernel.org.
|
||||
|
||||
Links
|
||||
-----
|
||||
the FTP archives:
|
||||
* ftp://ftp.linux.org.uk/pub/linux/cpufreq/
|
||||
|
||||
how to access the CVS repository:
|
||||
* http://cvs.arm.linux.org.uk/
|
||||
|
||||
the CPUFreq Mailing list:
|
||||
* http://vger.kernel.org/vger-lists.html#linux-pm
|
||||
|
||||
|
@ -172,7 +172,7 @@ Here are schematics of how these functions are called when operated from
|
||||
other part of the kernel. Note that the .setkey() call might happen
|
||||
before or after any of these schematics happen, but must not happen
|
||||
while any of these are in flight. Please note that calling .init()
|
||||
followed immediately by .finish() is also a perfectly valid
|
||||
followed immediately by .final() is also a perfectly valid
|
||||
transformation.
|
||||
|
||||
::
|
||||
|
@ -131,9 +131,9 @@ from the kernel crypto API. If the buffer is too small for the message
|
||||
digest, the flag MSG_TRUNC is set by the kernel.
|
||||
|
||||
In order to set a message digest key, the calling application must use
|
||||
the setsockopt() option of ALG_SET_KEY. If the key is not set the HMAC
|
||||
operation is performed without the initial HMAC state change caused by
|
||||
the key.
|
||||
the setsockopt() option of ALG_SET_KEY or ALG_SET_KEY_BY_KEY_SERIAL. If the
|
||||
key is not set, the HMAC operation is performed without the initial HMAC state
|
||||
change caused by the key.
|
||||
|
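For illustration only, here is a minimal sketch of setting an HMAC key with
``ALG_SET_KEY`` on an AF_ALG hash socket. The algorithm name and key bytes are
placeholders and error handling is omitted:

.. code-block:: c

    #include <string.h>
    #include <sys/socket.h>
    #include <linux/if_alg.h>

    int main(void)
    {
        struct sockaddr_alg sa = {
            .salg_family = AF_ALG,
            .salg_type   = "hash",
            .salg_name   = "hmac(sha256)",
        };
        static const unsigned char key[] = "placeholder-key";
        int tfmfd, opfd;

        tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
        bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa));

        /* Without this call, the HMAC runs without the initial state
         * change caused by the key. */
        setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key) - 1);

        opfd = accept(tfmfd, NULL, 0); /* op socket used for send()/recv() */
        return opfd < 0;
    }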
||||
Symmetric Cipher API
|
||||
--------------------
|
||||
@ -382,6 +382,15 @@ mentioned optname:
|
||||
|
||||
- the RNG cipher type to provide the seed
|
||||
|
||||
- ALG_SET_KEY_BY_KEY_SERIAL -- Setting the key via keyring key_serial_t.
|
||||
This operation behaves the same as ALG_SET_KEY. The decrypted
|
||||
data is copied from a keyring key and used as the
|
||||
key for symmetric encryption.
|
||||
|
||||
The passed in key_serial_t must have the KEY_(POS|USR|GRP|OTH)_SEARCH
|
||||
permission set, otherwise -EPERM is returned. Supports key types: user,
|
||||
logon, encrypted, and trusted.
|
||||
|
||||
- ALG_SET_AEAD_AUTHSIZE -- Setting the authentication tag size for
|
||||
AEAD ciphers. For an encryption operation, the authentication tag of
|
||||
the given size will be generated. For a decryption operation, the
|
||||
|
@ -80,8 +80,8 @@ have the number 1 and the number then must increase by 1 for each additional
|
||||
subtest within the same test at the same nesting level.
|
||||
|
||||
The description is a description of the test, generally the name of
|
||||
the test, and can be any string of words (can't include #). The
|
||||
description is optional, but recommended.
|
||||
the test, and can be any string of characters other than # or a
|
||||
newline. The description is optional, but recommended.
|
||||
|
||||
The directive and any diagnostic data are optional. If either is present, they
|
||||
must follow a hash sign, "#".
|
||||
|
@ -4,16 +4,17 @@
|
||||
KUnit Architecture
|
||||
==================
|
||||
|
||||
The KUnit architecture can be divided into two parts:
|
||||
The KUnit architecture is divided into two parts:
|
||||
|
||||
- `In-Kernel Testing Framework`_
|
||||
- `kunit_tool (Command Line Test Harness)`_
|
||||
- `kunit_tool (Command-line Test Harness)`_
|
||||
|
||||
In-Kernel Testing Framework
|
||||
===========================
|
||||
|
||||
The kernel testing library supports KUnit tests written in C using
|
||||
KUnit. KUnit tests are kernel code. KUnit does several things:
|
||||
KUnit. These KUnit tests are kernel code. KUnit performs the following
|
||||
tasks:
|
||||
|
||||
- Organizes tests
|
||||
- Reports test results
|
||||
@ -22,19 +23,17 @@ KUnit. KUnit tests are kernel code. KUnit does several things:
|
||||
Test Cases
|
||||
----------
|
||||
|
||||
The fundamental unit in KUnit is the test case. The KUnit test cases are
|
||||
grouped into KUnit suites. A KUnit test case is a function with type
|
||||
signature ``void (*)(struct kunit *test)``.
|
||||
These test case functions are wrapped in a struct called
|
||||
struct kunit_case.
|
||||
The test case is the fundamental unit in KUnit. KUnit test cases are organised
|
||||
into suites. A KUnit test case is a function with type signature
|
||||
``void (*)(struct kunit *test)``. These test case functions are wrapped in a
|
||||
struct called struct kunit_case.
|
||||
|
||||
.. note:
|
||||
``generate_params`` is optional for non-parameterized tests.
|
||||
|
||||
Each KUnit test case gets a ``struct kunit`` context
|
||||
object passed to it that tracks a running test. The KUnit assertion
|
||||
macros and other KUnit utilities use the ``struct kunit`` context
|
||||
object. As an exception, there are two fields:
|
||||
Each KUnit test case receives a ``struct kunit`` context object that tracks a
|
||||
running test. The KUnit assertion macros and other KUnit utilities use the
|
||||
``struct kunit`` context object. As an exception, there are two fields:
|
||||
|
||||
- ``->priv``: The setup functions can use it to store arbitrary test
|
||||
user data.
|
||||
@ -77,12 +76,13 @@ Executor
|
||||
|
||||
The KUnit executor can list and run built-in KUnit tests on boot.
|
||||
The Test suites are stored in a linker section
|
||||
called ``.kunit_test_suites``. For code, see:
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/asm-generic/vmlinux.lds.h?h=v5.15#n945.
|
||||
called ``.kunit_test_suites``. For the code, see ``KUNIT_TABLE()`` macro
|
||||
definition in
|
||||
`include/asm-generic/vmlinux.lds.h <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/asm-generic/vmlinux.lds.h?h=v6.0#n950>`_.
|
||||
The linker section consists of an array of pointers to
|
||||
``struct kunit_suite``, and is populated by the ``kunit_test_suites()``
|
||||
macro. To run all tests compiled into the kernel, the KUnit executor
|
||||
iterates over the linker section array.
|
||||
macro. The KUnit executor iterates over the linker section array in order to
|
||||
run all the tests that are compiled into the kernel.
|
||||
|
||||
.. kernel-figure:: kunit_suitememorydiagram.svg
|
||||
:alt: KUnit Suite Memory
|
||||
@ -90,17 +90,17 @@ iterates over the linker section array.
|
||||
KUnit Suite Memory Diagram
|
||||
|
||||
On the kernel boot, the KUnit executor uses the start and end addresses
|
||||
of this section to iterate over and run all tests. For code, see:
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/kunit/executor.c
|
||||
|
||||
of this section to iterate over and run all tests. For the implementation of the
|
||||
executor, see
|
||||
`lib/kunit/executor.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/kunit/executor.c>`_.
|
||||
When built as a module, the ``kunit_test_suites()`` macro defines a
|
||||
``module_init()`` function, which runs all the tests in the compilation
|
||||
unit instead of utilizing the executor.
|
||||
|
||||
So that some classes of errors in KUnit tests do not affect other tests
|
||||
or parts of the kernel, each KUnit case executes in a separate thread
|
||||
context. For code, see:
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/kunit/try-catch.c?h=v5.15#n58
|
||||
context. See the ``kunit_try_catch_run()`` function in
|
||||
`lib/kunit/try-catch.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/kunit/try-catch.c?h=v5.15#n58>`_.
|
||||
|
||||
Assertion Macros
|
||||
----------------
|
||||
@ -111,37 +111,36 @@ All expectations/assertions are formatted as:
|
||||
|
||||
- ``{EXPECT|ASSERT}`` determines whether the check is an assertion or an
|
||||
expectation.
|
||||
In the event of a failure, the testing flow differs as follows:
|
||||
|
||||
- For an expectation, if the check fails, marks the test as failed
|
||||
and logs the failure.
|
||||
- For expectations, the test is marked as failed and the failure is logged.
|
||||
|
||||
- An assertion, on failure, causes the test case to terminate
|
||||
immediately.
|
||||
- Failing assertions, on the other hand, result in the test case being
|
||||
terminated immediately.
|
||||
|
||||
- Assertions call function:
|
||||
- Assertions call the function:
|
||||
``void __noreturn kunit_abort(struct kunit *)``.
|
||||
|
||||
- ``kunit_abort`` calls function:
|
||||
- ``kunit_abort`` calls the function:
|
||||
``void __noreturn kunit_try_catch_throw(struct kunit_try_catch *try_catch)``.
|
||||
|
||||
- ``kunit_try_catch_throw`` calls function:
|
||||
- ``kunit_try_catch_throw`` calls the function:
|
||||
``void kthread_complete_and_exit(struct completion *, long) __noreturn;``
|
||||
and terminates the special thread context.
|
||||
|
||||
- ``<op>`` denotes a check with options: ``TRUE`` (supplied property
|
||||
has the boolean value “true”), ``EQ`` (two supplied properties are
|
||||
has the boolean value "true"), ``EQ`` (two supplied properties are
|
||||
equal), ``NOT_ERR_OR_NULL`` (supplied pointer is not null and does not
|
||||
contain an “err” value).
|
||||
contain an "err" value).
|
||||
|
||||
- ``[_MSG]`` prints a custom message on failure.
|
||||
|
||||
Test Result Reporting
|
||||
---------------------
|
||||
KUnit prints test results in KTAP format. KTAP is based on TAP14, see:
|
||||
https://github.com/isaacs/testanything.github.io/blob/tap14/tap-version-14-specification.md.
|
||||
KTAP (yet to be standardized format) works with KUnit and Kselftest.
|
||||
The KUnit executor prints KTAP results to dmesg, and debugfs
|
||||
(if configured).
|
||||
KUnit prints the test results in KTAP format. KTAP is based on TAP14, see
|
||||
Documentation/dev-tools/ktap.rst.
|
||||
KTAP works with KUnit and Kselftest. The KUnit executor prints KTAP results to
|
||||
dmesg, and debugfs (if configured).
|
||||
|
||||
Parameterized Tests
|
||||
-------------------
|
||||
@ -150,33 +149,35 @@ Each KUnit parameterized test is associated with a collection of
|
||||
parameters. The test is invoked multiple times, once for each parameter
|
||||
value and the parameter is stored in the ``param_value`` field.
|
||||
The test case includes a KUNIT_CASE_PARAM() macro that accepts a
|
||||
generator function.
|
||||
The generator function is passed the previous parameter and returns the next
|
||||
parameter. It also provides a macro to generate common-case generators based on
|
||||
arrays.
|
||||
generator function. The generator function is passed the previous parameter
|
||||
and returns the next parameter. It also includes a macro for generating
|
||||
array-based common-case generators.
|
||||
|
||||
kunit_tool (Command Line Test Harness)
|
||||
kunit_tool (Command-line Test Harness)
|
||||
======================================
|
||||
|
||||
kunit_tool is a Python script ``(tools/testing/kunit/kunit.py)``
|
||||
that can be used to configure, build, exec, parse and run (runs other
|
||||
commands in order) test results. You can either run KUnit tests using
|
||||
kunit_tool or can include KUnit in kernel and parse manually.
|
||||
``kunit_tool`` is a Python script, found in ``tools/testing/kunit/kunit.py``. It
|
||||
is used to configure, build, execute and parse test results, or to run all of
|
||||
these steps in the correct order (i.e., configure, build, execute and parse).
|
||||
You have two options for running KUnit tests: either build the kernel with KUnit
|
||||
enabled and manually parse the results (see
|
||||
Documentation/dev-tools/kunit/run_manual.rst) or use ``kunit_tool``
|
||||
(see Documentation/dev-tools/kunit/run_wrapper.rst).
|
||||
|
||||
- ``configure`` command generates the kernel ``.config`` from a
|
||||
``.kunitconfig`` file (and any architecture-specific options).
|
||||
For some architectures, additional config options are specified in the
|
||||
``qemu_config`` Python script
|
||||
(For example: ``tools/testing/kunit/qemu_configs/powerpc.py``).
|
||||
The Python scripts in the ``qemu_configs`` folder
|
||||
(for example, ``tools/testing/kunit/qemu_configs/powerpc.py``) contain
|
||||
additional configuration options for specific architectures.
|
||||
It parses both the existing ``.config`` and the ``.kunitconfig`` files
|
||||
and ensures that ``.config`` is a superset of ``.kunitconfig``.
|
||||
If this is not the case, it will combine the two and run
|
||||
``make olddefconfig`` to regenerate the ``.config`` file. It then
|
||||
verifies that ``.config`` is now a superset. This checks if all
|
||||
Kconfig dependencies are correctly specified in ``.kunitconfig``.
|
||||
``kunit_config.py`` includes the parsing Kconfigs code. The code which
|
||||
runs ``make olddefconfig`` is a part of ``kunit_kernel.py``. You can
|
||||
invoke this command via: ``./tools/testing/kunit/kunit.py config`` and
|
||||
to ensure that ``.config`` is a superset of ``.kunitconfig``.
|
||||
If not, it will combine the two and run ``make olddefconfig`` to regenerate
|
||||
the ``.config`` file. It then checks to see if ``.config`` has become a superset.
|
||||
This verifies that all the Kconfig dependencies are correctly specified in the
|
||||
file ``.kunitconfig``. The ``kunit_config.py`` script contains the code for parsing
|
||||
Kconfigs. The code which runs ``make olddefconfig`` is part of the
|
||||
``kunit_kernel.py`` script. You can invoke this command through:
|
||||
``./tools/testing/kunit/kunit.py config`` and
|
||||
generate a ``.config`` file.
|
||||
- ``build`` runs ``make`` on the kernel tree with required options
|
||||
(depends on the architecture and some options, for example: build_dir)
|
||||
@ -184,8 +185,8 @@ kunit_tool or can include KUnit in kernel and parse manually.
|
||||
To build a KUnit kernel from the current ``.config``, you can use the
|
||||
``build`` argument: ``./tools/testing/kunit/kunit.py build``.
|
||||
- ``exec`` command executes kernel results either directly (using
|
||||
User-mode Linux configuration), or via an emulator such
|
||||
as QEMU. It reads results from the log via standard
|
||||
User-mode Linux configuration), or through an emulator such
|
||||
as QEMU. It reads results from the log using standard
|
||||
output (stdout), and passes them to ``parse`` to be parsed.
|
||||
If you already have built a kernel with built-in KUnit tests,
|
||||
you can run the kernel and display the test results with the ``exec``
|
||||
|
@ -16,7 +16,6 @@ KUnit - Linux Kernel Unit Testing
|
||||
api/index
|
||||
style
|
||||
faq
|
||||
tips
|
||||
running_tips
|
||||
|
||||
This section details the kernel unit testing framework.
|
||||
@ -100,14 +99,11 @@ Read also :ref:`kinds-of-tests`.
|
||||
How do I use it?
|
||||
================
|
||||
|
||||
* Documentation/dev-tools/kunit/start.rst - for KUnit new users.
|
||||
* Documentation/dev-tools/kunit/architecture.rst - KUnit architecture.
|
||||
* Documentation/dev-tools/kunit/run_wrapper.rst - run kunit_tool.
|
||||
* Documentation/dev-tools/kunit/run_manual.rst - run tests without kunit_tool.
|
||||
* Documentation/dev-tools/kunit/usage.rst - write tests.
|
||||
* Documentation/dev-tools/kunit/tips.rst - best practices with
|
||||
examples.
|
||||
* Documentation/dev-tools/kunit/api/index.rst - KUnit APIs
|
||||
used for testing.
|
||||
* Documentation/dev-tools/kunit/faq.rst - KUnit common questions and
|
||||
answers.
|
||||
You can find a step-by-step guide to writing and running KUnit tests in
|
||||
Documentation/dev-tools/kunit/start.rst
|
||||
|
||||
Alternatively, feel free to look through the rest of the KUnit documentation,
|
||||
or to experiment with tools/testing/kunit/kunit.py and the example test under
|
||||
lib/kunit/kunit-example-test.c
|
||||
|
||||
Happy testing!
|
||||
|
@ -294,13 +294,11 @@ Congrats! You just wrote your first KUnit test.
|
||||
Next Steps
|
||||
==========
|
||||
|
||||
* Documentation/dev-tools/kunit/architecture.rst - KUnit architecture.
|
||||
* Documentation/dev-tools/kunit/run_wrapper.rst - run kunit_tool.
|
||||
* Documentation/dev-tools/kunit/run_manual.rst - run tests without kunit_tool.
|
||||
* Documentation/dev-tools/kunit/usage.rst - write tests.
|
||||
* Documentation/dev-tools/kunit/tips.rst - best practices with
|
||||
examples.
|
||||
* Documentation/dev-tools/kunit/api/index.rst - KUnit APIs
|
||||
used for testing.
|
||||
* Documentation/dev-tools/kunit/faq.rst - KUnit common questions and
|
||||
answers.
|
||||
If you're interested in using some of the more advanced features of kunit.py,
|
||||
take a look at Documentation/dev-tools/kunit/run_wrapper.rst
|
||||
|
||||
If you'd like to run tests without using kunit.py, check out
|
||||
Documentation/dev-tools/kunit/run_manual.rst
|
||||
|
||||
For more information on writing KUnit tests (including some common techniques
|
||||
for testing different things), see Documentation/dev-tools/kunit/usage.rst
|
||||
|
@ -1,190 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
============================
|
||||
Tips For Writing KUnit Tests
|
||||
============================
|
||||
|
||||
Exiting early on failed expectations
|
||||
------------------------------------
|
||||
|
||||
``KUNIT_EXPECT_EQ`` and friends will mark the test as failed and continue
|
||||
execution. In some cases, it's unsafe to continue and you can use the
|
||||
``KUNIT_ASSERT`` variant to exit on failure.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void example_test_user_alloc_function(struct kunit *test)
|
||||
{
|
||||
void *object = alloc_some_object_for_me();
|
||||
|
||||
/* Make sure we got a valid pointer back. */
|
||||
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, object);
|
||||
do_something_with_object(object);
|
||||
}
|
||||
|
||||
Allocating memory
|
||||
-----------------
|
||||
|
||||
Where you would use ``kzalloc``, you should prefer ``kunit_kzalloc`` instead.
|
||||
KUnit will ensure the memory is freed once the test completes.
|
||||
|
||||
This is particularly useful since it lets you use the ``KUNIT_ASSERT_EQ``
|
||||
macros to exit early from a test without having to worry about remembering to
|
||||
call ``kfree``.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void example_test_allocation(struct kunit *test)
|
||||
{
|
||||
char *buffer = kunit_kzalloc(test, 16, GFP_KERNEL);
|
||||
/* Ensure allocation succeeded. */
|
||||
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buffer);
|
||||
|
||||
KUNIT_ASSERT_STREQ(test, buffer, "");
|
||||
}
|
||||
|
||||
|
||||
Testing static functions
|
||||
------------------------
|
||||
|
||||
If you don't want to expose functions or variables just for testing, one option
|
||||
is to conditionally ``#include`` the test file at the end of your .c file, e.g.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/* In my_file.c */
|
||||
|
||||
static int do_interesting_thing();
|
||||
|
||||
#ifdef CONFIG_MY_KUNIT_TEST
|
||||
#include "my_kunit_test.c"
|
||||
#endif
|
||||
|
||||
Injecting test-only code
|
||||
------------------------
|
||||
|
||||
Similarly to the above, it can be useful to add test-specific logic.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/* In my_file.h */
|
||||
|
||||
#ifdef CONFIG_MY_KUNIT_TEST
|
||||
/* Defined in my_kunit_test.c */
|
||||
void test_only_hook(void);
|
||||
#else
|
||||
void test_only_hook(void) { }
|
||||
#endif
|
||||
|
||||
This test-only code can be made more useful by accessing the current kunit
|
||||
test, see below.
|
||||
|
||||
Accessing the current test
|
||||
--------------------------
|
||||
|
||||
In some cases, you need to call test-only code from outside the test file, e.g.
|
||||
like in the example above or if you're providing a fake implementation of an
|
||||
ops struct.
|
||||
There is a ``kunit_test`` field in ``task_struct``, so you can access it via
|
||||
``current->kunit_test``.
|
||||
|
||||
Here's a slightly in-depth example of how one could implement "mocking":
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
#include <linux/sched.h> /* for current */
|
||||
|
||||
struct test_data {
|
||||
int foo_result;
|
||||
int want_foo_called_with;
|
||||
};
|
||||
|
||||
static int fake_foo(int arg)
|
||||
{
|
||||
struct kunit *test = current->kunit_test;
|
||||
struct test_data *test_data = test->priv;
|
||||
|
||||
KUNIT_EXPECT_EQ(test, test_data->want_foo_called_with, arg);
|
||||
return test_data->foo_result;
|
||||
}
|
||||
|
||||
static void example_simple_test(struct kunit *test)
|
||||
{
|
||||
/* Assume priv is allocated in the suite's .init */
|
||||
struct test_data *test_data = test->priv;
|
||||
|
||||
test_data->foo_result = 42;
|
||||
test_data->want_foo_called_with = 1;
|
||||
|
||||
/* In a real test, we'd probably pass a pointer to fake_foo somewhere
|
||||
* like an ops struct, etc. instead of calling it directly. */
|
||||
KUNIT_EXPECT_EQ(test, fake_foo(1), 42);
|
||||
}
|
||||
|
||||
|
||||
Note: here we're able to get away with using ``test->priv``, but if you wanted
|
||||
something more flexible you could use a named ``kunit_resource``, see
|
||||
Documentation/dev-tools/kunit/api/test.rst.
|
||||
|
||||
Failing the current test
|
||||
------------------------
|
||||
|
||||
But sometimes, you might just want to fail the current test. In that case, we
|
||||
have ``kunit_fail_current_test(fmt, args...)`` which is defined in ``<kunit/test-bug.h>`` and
|
||||
doesn't require pulling in ``<kunit/test.h>``.
|
||||
|
||||
E.g. say we had an option to enable some extra debug checks on some data structure:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
#include <kunit/test-bug.h>
|
||||
|
||||
#ifdef CONFIG_EXTRA_DEBUG_CHECKS
|
||||
static void validate_my_data(struct data *data)
|
||||
{
|
||||
if (is_valid(data))
|
||||
return;
|
||||
|
||||
kunit_fail_current_test("data %p is invalid", data);
|
||||
|
||||
/* Normal, non-KUnit, error reporting code here. */
|
||||
}
|
||||
#else
|
||||
static void my_debug_function(void) { }
|
||||
#endif
|
||||
|
||||
|
||||
Customizing error messages
|
||||
--------------------------
|
||||
|
||||
Each of the ``KUNIT_EXPECT`` and ``KUNIT_ASSERT`` macros have a ``_MSG`` variant.
|
||||
These take a format string and arguments to provide additional context to the automatically generated error messages.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
char some_str[41];
|
||||
generate_sha1_hex_string(some_str);
|
||||
|
||||
/* Before. Not easy to tell why the test failed. */
|
||||
KUNIT_EXPECT_EQ(test, strlen(some_str), 40);
|
||||
|
||||
/* After. Now we see the offending string. */
|
||||
KUNIT_EXPECT_EQ_MSG(test, strlen(some_str), 40, "some_str='%s'", some_str);
|
||||
|
||||
Alternatively, one can take full control over the error message by using ``KUNIT_FAIL()``, e.g.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/* Before */
|
||||
KUNIT_EXPECT_EQ(test, some_setup_function(), 0);
|
||||
|
||||
/* After: full control over the failure message. */
|
||||
if (some_setup_function())
|
||||
KUNIT_FAIL(test, "Failed to setup thing for testing");
|
||||
|
||||
Next Steps
|
||||
==========
|
||||
* Optional: see the Documentation/dev-tools/kunit/usage.rst page for a more
|
||||
in-depth explanation of KUnit.
|
@ -112,11 +112,45 @@ terminates the test case if the condition is not satisfied. For example:
|
||||
KUNIT_EXPECT_LE(test, a[i], a[i + 1]);
|
||||
}
|
||||
|
||||
In this example, the method under test should return pointer to a value. If the
|
||||
pointer returns null or an errno, we want to stop the test since the following
|
||||
expectation could crash the test case. `ASSERT_NOT_ERR_OR_NULL(...)` allows us
|
||||
to bail out of the test case if the appropriate conditions are not satisfied to
|
||||
complete the test.
|
||||
In this example, we need to be able to allocate an array to test the ``sort()``
|
||||
function. So we use ``KUNIT_ASSERT_NOT_ERR_OR_NULL()`` to abort the test if
|
||||
there's an allocation error.
|
||||
|
||||
.. note::
|
||||
In other test frameworks, ``ASSERT`` macros are often implemented by calling
|
||||
``return`` so they only work from the test function. In KUnit, we stop the
|
||||
current kthread on failure, so you can call them from anywhere.
|
||||
|
||||
Customizing error messages
|
||||
--------------------------
|
||||
|
||||
Each of the ``KUNIT_EXPECT`` and ``KUNIT_ASSERT`` macros have a ``_MSG``
|
||||
variant. These take a format string and arguments to provide additional
|
||||
context to the automatically generated error messages.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
char some_str[41];
|
||||
generate_sha1_hex_string(some_str);
|
||||
|
||||
/* Before. Not easy to tell why the test failed. */
|
||||
KUNIT_EXPECT_EQ(test, strlen(some_str), 40);
|
||||
|
||||
/* After. Now we see the offending string. */
|
||||
KUNIT_EXPECT_EQ_MSG(test, strlen(some_str), 40, "some_str='%s'", some_str);
|
||||
|
||||
Alternatively, one can take full control over the error message by using
|
||||
``KUNIT_FAIL()``, e.g.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/* Before */
|
||||
KUNIT_EXPECT_EQ(test, some_setup_function(), 0);
|
||||
|
||||
/* After: full control over the failure message. */
|
||||
if (some_setup_function())
|
||||
KUNIT_FAIL(test, "Failed to setup thing for testing");
|
||||
|
||||
|
||||
Test Suites
|
||||
~~~~~~~~~~~
|
||||
@ -546,24 +580,6 @@ By reusing the same ``cases`` array from above, we can write the test as a
|
||||
{}
|
||||
};
|
||||
|
||||
Exiting Early on Failed Expectations
|
||||
------------------------------------
|
||||
|
||||
We can use ``KUNIT_EXPECT_EQ`` to mark the test as failed and continue
|
||||
execution. In some cases, it is unsafe to continue. We can use the
|
||||
``KUNIT_ASSERT`` variant to exit on failure.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
void example_test_user_alloc_function(struct kunit *test)
|
||||
{
|
||||
void *object = alloc_some_object_for_me();
|
||||
|
||||
/* Make sure we got a valid pointer back. */
|
||||
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, object);
|
||||
do_something_with_object(object);
|
||||
}
|
||||
|
||||
Allocating Memory
|
||||
-----------------
|
||||
|
||||
@ -625,17 +641,23 @@ as shown in next section: *Accessing The Current Test*.
|
||||
Accessing The Current Test
|
||||
--------------------------
|
||||
|
||||
In some cases, we need to call test-only code from outside the test file.
|
||||
For example, see example in section *Injecting Test-Only Code* or if
|
||||
we are providing a fake implementation of an ops struct. Using
|
||||
``kunit_test`` field in ``task_struct``, we can access it via
|
||||
``current->kunit_test``.
|
||||
In some cases, we need to call test-only code from outside the test file. This
|
||||
is helpful, for example, when providing a fake implementation of a function, or
|
||||
to fail any current test from within an error handler.
|
||||
We can do this via the ``kunit_test`` field in ``task_struct``, which we can
|
||||
access using the ``kunit_get_current_test()`` function in ``kunit/test-bug.h``.
|
||||
|
||||
The example below includes how to implement "mocking":
|
||||
``kunit_get_current_test()`` is safe to call even if KUnit is not enabled. If
|
||||
KUnit is not enabled, was built as a module (``CONFIG_KUNIT=m``), or no test is
|
||||
running in the current task, it will return ``NULL``. This compiles down to
|
||||
either a no-op or a static key check, so will have a negligible performance
|
||||
impact when no test is running.
|
||||
|
||||
The example below uses this to implement a "mock" implementation of a function, ``foo``:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
#include <linux/sched.h> /* for current */
|
||||
#include <kunit/test-bug.h> /* for kunit_get_current_test */
|
||||
|
||||
struct test_data {
|
||||
int foo_result;
|
||||
@ -644,7 +666,7 @@ The example below includes how to implement "mocking":
|
||||
|
||||
static int fake_foo(int arg)
|
||||
{
|
||||
struct kunit *test = current->kunit_test;
|
||||
struct kunit *test = kunit_get_current_test();
|
||||
struct test_data *test_data = test->priv;
|
||||
|
||||
KUNIT_EXPECT_EQ(test, test_data->want_foo_called_with, arg);
|
||||
@ -675,7 +697,7 @@ Each test can have multiple resources which have string names providing the same
|
||||
flexibility as a ``priv`` member, but also, for example, allowing helper
|
||||
functions to create resources without conflicting with each other. It is also
|
||||
possible to define a clean up function for each resource, making it easy to
|
||||
avoid resource leaks. For more information, see Documentation/dev-tools/kunit/api/test.rst.
|
||||
avoid resource leaks. For more information, see Documentation/dev-tools/kunit/api/resource.rst.
|
||||
|
||||
Failing The Current Test
|
||||
------------------------
|
||||
@ -703,3 +725,9 @@ structures as shown below:
|
||||
static void my_debug_function(void) { }
|
||||
#endif
|
||||
|
||||
``kunit_fail_current_test()`` is safe to call even if KUnit is not enabled. If
|
||||
KUnit is not enabled, was built as a module (``CONFIG_KUNIT=m``), or no test is
|
||||
running in the current task, it will do nothing. This compiles down to either a
|
||||
no-op or a static key check, so will have a negligible performance impact when
|
||||
no test is running.
|
||||
|
||||
|