Merge branch 'linus' into perf/urgent, to synchronize with upstream
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
commit
fdff7c21ea
5
.mailmap
5
.mailmap
@ -139,6 +139,7 @@ Juha Yrjola <at solidboot.com>
|
||||
Juha Yrjola <juha.yrjola@nokia.com>
|
||||
Juha Yrjola <juha.yrjola@solidboot.com>
|
||||
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
|
||||
Kamil Konieczny <k.konieczny@samsung.com> <k.konieczny@partner.samsung.com>
|
||||
Kay Sievers <kay.sievers@vrfy.org>
|
||||
Kenneth W Chen <kenneth.w.chen@intel.com>
|
||||
Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
|
||||
@ -210,6 +211,10 @@ Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
||||
Patrick Mochel <mochel@digitalimplant.org>
|
||||
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
|
||||
Paul Burton <paulburton@kernel.org> <paul.burton@mips.com>
|
||||
Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.ibm.com>
|
||||
Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.vnet.ibm.com>
|
||||
Paul E. McKenney <paulmck@kernel.org> <paul.mckenney@linaro.org>
|
||||
Paul E. McKenney <paulmck@kernel.org> <paulmck@us.ibm.com>
|
||||
Peter A Jonsson <pj@ludd.ltu.se>
|
||||
Peter Oruba <peter@oruba.de>
|
||||
Peter Oruba <peter.oruba@amd.com>
|
||||
|
@ -25,11 +25,11 @@ Description:
|
||||
lsm: [[subj_user=] [subj_role=] [subj_type=]
|
||||
[obj_user=] [obj_role=] [obj_type=]]
|
||||
option: [[appraise_type=]] [template=] [permit_directio]
|
||||
[appraise_flag=]
|
||||
[appraise_flag=] [keyrings=]
|
||||
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
|
||||
[FIRMWARE_CHECK]
|
||||
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
|
||||
[KEXEC_CMDLINE]
|
||||
[KEXEC_CMDLINE] [KEY_CHECK]
|
||||
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
|
||||
[[^]MAY_EXEC]
|
||||
fsmagic:= hex value
|
||||
@ -42,6 +42,9 @@ Description:
|
||||
appraise_flag:= [check_blacklist]
|
||||
Currently, the blacklist check is only for files signed with an appended
|
||||
signature.
|
||||
keyrings:= list of keyrings
|
||||
(eg, .builtin_trusted_keys|.ima). Only valid
|
||||
when action is "measure" and func is KEY_CHECK.
|
||||
template:= name of a defined IMA template type
|
||||
(eg, ima-ng). Only valid when action is "measure".
|
||||
pcr:= decimal value
|
||||
@ -113,3 +116,12 @@ Description:
|
||||
Example of appraise rule allowing modsig appended signatures:
|
||||
|
||||
appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig
|
||||
|
||||
Example of measure rule using KEY_CHECK to measure all keys:
|
||||
|
||||
measure func=KEY_CHECK
|
||||
|
||||
Example of measure rule using KEY_CHECK to only measure
|
||||
keys added to .builtin_trusted_keys or .ima keyring:
|
||||
|
||||
measure func=KEY_CHECK keyrings=.builtin_trusted_keys|.ima
|
||||
|
63
Documentation/ABI/testing/sysfs-bus-mdio
Normal file
63
Documentation/ABI/testing/sysfs-bus-mdio
Normal file
@ -0,0 +1,63 @@
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
This folder contains global statistics as well as per
|
||||
MDIO bus address statistics.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/transfers
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of transfers for this MDIO bus.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/errors
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of transfer errors for this MDIO bus.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/writes
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of write transactions for this MDIO bus.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/reads
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of read transactions for this MDIO bus.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/transfers_<addr>
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of transfers for this MDIO bus address.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/errors_<addr>
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of transfer errors for this MDIO bus address.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/writes_<addr>
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of write transactions for this MDIO bus address.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/reads_<addr>
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of read transactions for this MDIO bus address.
|
@ -1,4 +1,7 @@
|
||||
.. _NMI_rcu_doc:
|
||||
|
||||
Using RCU to Protect Dynamic NMI Handlers
|
||||
=========================================
|
||||
|
||||
|
||||
Although RCU is usually used to protect read-mostly data structures,
|
||||
@ -9,7 +12,7 @@ work in "arch/x86/oprofile/nmi_timer_int.c" and in
|
||||
"arch/x86/kernel/traps.c".
|
||||
|
||||
The relevant pieces of code are listed below, each followed by a
|
||||
brief explanation.
|
||||
brief explanation::
|
||||
|
||||
static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
|
||||
{
|
||||
@ -18,12 +21,12 @@ brief explanation.
|
||||
|
||||
The dummy_nmi_callback() function is a "dummy" NMI handler that does
|
||||
nothing, but returns zero, thus saying that it did nothing, allowing
|
||||
the NMI handler to take the default machine-specific action.
|
||||
the NMI handler to take the default machine-specific action::
|
||||
|
||||
static nmi_callback_t nmi_callback = dummy_nmi_callback;
|
||||
|
||||
This nmi_callback variable is a global function pointer to the current
|
||||
NMI handler.
|
||||
NMI handler::
|
||||
|
||||
void do_nmi(struct pt_regs * regs, long error_code)
|
||||
{
|
||||
@ -53,11 +56,12 @@ anyway. However, in practice it is a good documentation aid, particularly
|
||||
for anyone attempting to do something similar on Alpha or on systems
|
||||
with aggressive optimizing compilers.
|
||||
|
||||
Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha,
|
||||
given that the code referenced by the pointer is read-only?
|
||||
Quick Quiz:
|
||||
Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
|
||||
|
||||
:ref:`Answer to Quick Quiz <answer_quick_quiz_NMI>`
|
||||
|
||||
Back to the discussion of NMI and RCU...
|
||||
Back to the discussion of NMI and RCU::
|
||||
|
||||
void set_nmi_callback(nmi_callback_t callback)
|
||||
{
|
||||
@ -68,7 +72,7 @@ The set_nmi_callback() function registers an NMI handler. Note that any
|
||||
data that is to be used by the callback must be initialized up -before-
|
||||
the call to set_nmi_callback(). On architectures that do not order
|
||||
writes, the rcu_assign_pointer() ensures that the NMI handler sees the
|
||||
initialized values.
|
||||
initialized values::
|
||||
|
||||
void unset_nmi_callback(void)
|
||||
{
|
||||
@ -82,7 +86,7 @@ up any data structures used by the old NMI handler until execution
|
||||
of it completes on all other CPUs.
|
||||
|
||||
One way to accomplish this is via synchronize_rcu(), perhaps as
|
||||
follows:
|
||||
follows::
|
||||
|
||||
unset_nmi_callback();
|
||||
synchronize_rcu();
|
||||
@ -98,24 +102,23 @@ to free up the handler's data as soon as synchronize_rcu() returns.
|
||||
Important note: for this to work, the architecture in question must
|
||||
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
||||
|
||||
.. _answer_quick_quiz_NMI:
|
||||
|
||||
Answer to Quick Quiz
|
||||
Answer to Quick Quiz:
|
||||
Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
|
||||
|
||||
Why might the rcu_dereference_sched() be necessary on Alpha, given
|
||||
that the code referenced by the pointer is read-only?
|
||||
The caller to set_nmi_callback() might well have
|
||||
initialized some data that is to be used by the new NMI
|
||||
handler. In this case, the rcu_dereference_sched() would
|
||||
be needed, because otherwise a CPU that received an NMI
|
||||
just after the new handler was set might see the pointer
|
||||
to the new NMI handler, but the old pre-initialized
|
||||
version of the handler's data.
|
||||
|
||||
Answer: The caller to set_nmi_callback() might well have
|
||||
initialized some data that is to be used by the new NMI
|
||||
handler. In this case, the rcu_dereference_sched() would
|
||||
be needed, because otherwise a CPU that received an NMI
|
||||
just after the new handler was set might see the pointer
|
||||
to the new NMI handler, but the old pre-initialized
|
||||
version of the handler's data.
|
||||
This same sad story can happen on other CPUs when using
|
||||
a compiler with aggressive pointer-value speculation
|
||||
optimizations.
|
||||
|
||||
This same sad story can happen on other CPUs when using
|
||||
a compiler with aggressive pointer-value speculation
|
||||
optimizations.
|
||||
|
||||
More important, the rcu_dereference_sched() makes it
|
||||
clear to someone reading the code that the pointer is
|
||||
being protected by RCU-sched.
|
||||
More important, the rcu_dereference_sched() makes it
|
||||
clear to someone reading the code that the pointer is
|
||||
being protected by RCU-sched.
|
@ -1,19 +1,21 @@
|
||||
Using RCU to Protect Read-Mostly Arrays
|
||||
.. _array_rcu_doc:
|
||||
|
||||
Using RCU to Protect Read-Mostly Arrays
|
||||
=======================================
|
||||
|
||||
Although RCU is more commonly used to protect linked lists, it can
|
||||
also be used to protect arrays. Three situations are as follows:
|
||||
|
||||
1. Hash Tables
|
||||
1. :ref:`Hash Tables <hash_tables>`
|
||||
|
||||
2. Static Arrays
|
||||
2. :ref:`Static Arrays <static_arrays>`
|
||||
|
||||
3. Resizeable Arrays
|
||||
3. :ref:`Resizable Arrays <resizable_arrays>`
|
||||
|
||||
Each of these three situations involves an RCU-protected pointer to an
|
||||
array that is separately indexed. It might be tempting to consider use
|
||||
of RCU to instead protect the index into an array, however, this use
|
||||
case is -not- supported. The problem with RCU-protected indexes into
|
||||
case is **not** supported. The problem with RCU-protected indexes into
|
||||
arrays is that compilers can play way too many optimization games with
|
||||
integers, which means that the rules governing handling of these indexes
|
||||
are far more trouble than they are worth. If RCU-protected indexes into
|
||||
@ -24,16 +26,20 @@ to be safely used.
|
||||
That aside, each of the three RCU-protected pointer situations is
|
||||
described in the following sections.
|
||||
|
||||
.. _hash_tables:
|
||||
|
||||
Situation 1: Hash Tables
|
||||
------------------------
|
||||
|
||||
Hash tables are often implemented as an array, where each array entry
|
||||
has a linked-list hash chain. Each hash chain can be protected by RCU
|
||||
as described in the listRCU.rst document. This approach also applies
|
||||
to other array-of-list situations, such as radix trees.
|
||||
|
||||
.. _static_arrays:
|
||||
|
||||
Situation 2: Static Arrays
|
||||
--------------------------
|
||||
|
||||
Static arrays, where the data (rather than a pointer to the data) is
|
||||
located in each array element, and where the array is never resized,
|
||||
@ -41,13 +47,17 @@ have not been used with RCU. Rik van Riel recommends using seqlock in
|
||||
this situation, which would also have minimal read-side overhead as long
|
||||
as updates are rare.
|
||||
|
||||
Quick Quiz: Why is it so important that updates be rare when
|
||||
using seqlock?
|
||||
Quick Quiz:
|
||||
Why is it so important that updates be rare when using seqlock?
|
||||
|
||||
:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>`
|
||||
|
||||
Situation 3: Resizeable Arrays
|
||||
.. _resizable_arrays:
|
||||
|
||||
Use of RCU for resizeable arrays is demonstrated by the grow_ary()
|
||||
Situation 3: Resizable Arrays
|
||||
------------------------------
|
||||
|
||||
Use of RCU for resizable arrays is demonstrated by the grow_ary()
|
||||
function formerly used by the System V IPC code. The array is used
|
||||
to map from semaphore, message-queue, and shared-memory IDs to the data
|
||||
structure that represents the corresponding IPC construct. The grow_ary()
|
||||
@ -60,7 +70,7 @@ the remainder of the new, updates the ids->entries pointer to point to
|
||||
the new array, and invokes ipc_rcu_putref() to free up the old array.
|
||||
Note that rcu_assign_pointer() is used to update the ids->entries pointer,
|
||||
which includes any memory barriers required on whatever architecture
|
||||
you are running on.
|
||||
you are running on::
|
||||
|
||||
static int grow_ary(struct ipc_ids* ids, int newsize)
|
||||
{
|
||||
@ -112,7 +122,7 @@ a simple check suffices. The pointer to the structure corresponding
|
||||
to the desired IPC object is placed in "out", with NULL indicating
|
||||
a non-existent entry. After acquiring "out->lock", the "out->deleted"
|
||||
flag indicates whether the IPC object is in the process of being
|
||||
deleted, and, if not, the pointer is returned.
|
||||
deleted, and, if not, the pointer is returned::
|
||||
|
||||
struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
|
||||
{
|
||||
@ -144,8 +154,10 @@ deleted, and, if not, the pointer is returned.
|
||||
return out;
|
||||
}
|
||||
|
||||
.. _answer_quick_quiz_seqlock:
|
||||
|
||||
Answer to Quick Quiz:
|
||||
Why is it so important that updates be rare when using seqlock?
|
||||
|
||||
The reason that it is important that updates be rare when
|
||||
using seqlock is that frequent updates can livelock readers.
|
@ -7,8 +7,13 @@ RCU concepts
|
||||
.. toctree::
|
||||
:maxdepth: 3
|
||||
|
||||
arrayRCU
|
||||
rcubarrier
|
||||
rcu_dereference
|
||||
whatisRCU
|
||||
rcu
|
||||
listRCU
|
||||
NMI-RCU
|
||||
UP
|
||||
|
||||
Design/Memory-Ordering/Tree-RCU-Memory-Ordering
|
||||
|
@ -99,7 +99,7 @@ With this change, the rcu_dereference() is always within an RCU
|
||||
read-side critical section, which again would have suppressed the
|
||||
above lockdep-RCU splat.
|
||||
|
||||
But in this particular case, we don't actually deference the pointer
|
||||
But in this particular case, we don't actually dereference the pointer
|
||||
returned from rcu_dereference(). Instead, that pointer is just compared
|
||||
to the cic pointer, which means that the rcu_dereference() can be replaced
|
||||
by rcu_access_pointer() as follows:
|
||||
|
@ -1,4 +1,7 @@
|
||||
.. _rcu_dereference_doc:
|
||||
|
||||
PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
|
||||
===============================================================
|
||||
|
||||
Most of the time, you can use values from rcu_dereference() or one of
|
||||
the similar primitives without worries. Dereferencing (prefix "*"),
|
||||
@ -8,7 +11,7 @@ subtraction of constants, and casts all work quite naturally and safely.
|
||||
It is nevertheless possible to get into trouble with other operations.
|
||||
Follow these rules to keep your RCU code working properly:
|
||||
|
||||
o You must use one of the rcu_dereference() family of primitives
|
||||
- You must use one of the rcu_dereference() family of primitives
|
||||
to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
|
||||
will complain. Worse yet, your code can see random memory-corruption
|
||||
bugs due to games that compilers and DEC Alpha can play.
|
||||
@ -25,24 +28,24 @@ o You must use one of the rcu_dereference() family of primitives
|
||||
for an example where the compiler can in fact deduce the exact
|
||||
value of the pointer, and thus cause misordering.
|
||||
|
||||
o You are only permitted to use rcu_dereference on pointer values.
|
||||
- You are only permitted to use rcu_dereference on pointer values.
|
||||
The compiler simply knows too much about integral values to
|
||||
trust it to carry dependencies through integer operations.
|
||||
There are a very few exceptions, namely that you can temporarily
|
||||
cast the pointer to uintptr_t in order to:
|
||||
|
||||
o Set bits and clear bits down in the must-be-zero low-order
|
||||
- Set bits and clear bits down in the must-be-zero low-order
|
||||
bits of that pointer. This clearly means that the pointer
|
||||
must have alignment constraints, for example, this does
|
||||
-not- work in general for char* pointers.
|
||||
|
||||
o XOR bits to translate pointers, as is done in some
|
||||
- XOR bits to translate pointers, as is done in some
|
||||
classic buddy-allocator algorithms.
|
||||
|
||||
It is important to cast the value back to pointer before
|
||||
doing much of anything else with it.
|
||||
|
||||
o Avoid cancellation when using the "+" and "-" infix arithmetic
|
||||
- Avoid cancellation when using the "+" and "-" infix arithmetic
|
||||
operators. For example, for a given variable "x", avoid
|
||||
"(x-(uintptr_t)x)" for char* pointers. The compiler is within its
|
||||
rights to substitute zero for this sort of expression, so that
|
||||
@ -54,16 +57,16 @@ o Avoid cancellation when using the "+" and "-" infix arithmetic
|
||||
"p+a-b" is safe because its value still necessarily depends on
|
||||
the rcu_dereference(), thus maintaining proper ordering.
|
||||
|
||||
o If you are using RCU to protect JITed functions, so that the
|
||||
- If you are using RCU to protect JITed functions, so that the
|
||||
"()" function-invocation operator is applied to a value obtained
|
||||
(directly or indirectly) from rcu_dereference(), you may need to
|
||||
interact directly with the hardware to flush instruction caches.
|
||||
This issue arises on some systems when a newly JITed function is
|
||||
using the same memory that was used by an earlier JITed function.
|
||||
|
||||
o Do not use the results from relational operators ("==", "!=",
|
||||
- Do not use the results from relational operators ("==", "!=",
|
||||
">", ">=", "<", or "<=") when dereferencing. For example,
|
||||
the following (quite strange) code is buggy:
|
||||
the following (quite strange) code is buggy::
|
||||
|
||||
int *p;
|
||||
int *q;
|
||||
@ -81,11 +84,11 @@ o Do not use the results from relational operators ("==", "!=",
|
||||
after such branches, but can speculate loads, which can again
|
||||
result in misordering bugs.
|
||||
|
||||
o Be very careful about comparing pointers obtained from
|
||||
- Be very careful about comparing pointers obtained from
|
||||
rcu_dereference() against non-NULL values. As Linus Torvalds
|
||||
explained, if the two pointers are equal, the compiler could
|
||||
substitute the pointer you are comparing against for the pointer
|
||||
obtained from rcu_dereference(). For example:
|
||||
obtained from rcu_dereference(). For example::
|
||||
|
||||
p = rcu_dereference(gp);
|
||||
if (p == &default_struct)
|
||||
@ -93,7 +96,7 @@ o Be very careful about comparing pointers obtained from
|
||||
|
||||
Because the compiler now knows that the value of "p" is exactly
|
||||
the address of the variable "default_struct", it is free to
|
||||
transform this code into the following:
|
||||
transform this code into the following::
|
||||
|
||||
p = rcu_dereference(gp);
|
||||
if (p == &default_struct)
|
||||
@ -105,14 +108,14 @@ o Be very careful about comparing pointers obtained from
|
||||
|
||||
However, comparisons are OK in the following cases:
|
||||
|
||||
o The comparison was against the NULL pointer. If the
|
||||
- The comparison was against the NULL pointer. If the
|
||||
compiler knows that the pointer is NULL, you had better
|
||||
not be dereferencing it anyway. If the comparison is
|
||||
non-equal, the compiler is none the wiser. Therefore,
|
||||
it is safe to compare pointers from rcu_dereference()
|
||||
against NULL pointers.
|
||||
|
||||
o The pointer is never dereferenced after being compared.
|
||||
- The pointer is never dereferenced after being compared.
|
||||
Since there are no subsequent dereferences, the compiler
|
||||
cannot use anything it learned from the comparison
|
||||
to reorder the non-existent subsequent dereferences.
|
||||
@ -124,31 +127,31 @@ o Be very careful about comparing pointers obtained from
|
||||
dereferenced, rcu_access_pointer() should be used in place
|
||||
of rcu_dereference().
|
||||
|
||||
o The comparison is against a pointer that references memory
|
||||
- The comparison is against a pointer that references memory
|
||||
that was initialized "a long time ago." The reason
|
||||
this is safe is that even if misordering occurs, the
|
||||
misordering will not affect the accesses that follow
|
||||
the comparison. So exactly how long ago is "a long
|
||||
time ago"? Here are some possibilities:
|
||||
|
||||
o Compile time.
|
||||
- Compile time.
|
||||
|
||||
o Boot time.
|
||||
- Boot time.
|
||||
|
||||
o Module-init time for module code.
|
||||
- Module-init time for module code.
|
||||
|
||||
o Prior to kthread creation for kthread code.
|
||||
- Prior to kthread creation for kthread code.
|
||||
|
||||
o During some prior acquisition of the lock that
|
||||
- During some prior acquisition of the lock that
|
||||
we now hold.
|
||||
|
||||
o Before mod_timer() time for a timer handler.
|
||||
- Before mod_timer() time for a timer handler.
|
||||
|
||||
There are many other possibilities involving the Linux
|
||||
kernel's wide array of primitives that cause code to
|
||||
be invoked at a later time.
|
||||
|
||||
o The pointer being compared against also came from
|
||||
- The pointer being compared against also came from
|
||||
rcu_dereference(). In this case, both pointers depend
|
||||
on one rcu_dereference() or another, so you get proper
|
||||
ordering either way.
|
||||
@ -159,13 +162,13 @@ o Be very careful about comparing pointers obtained from
|
||||
of such an RCU usage bug is shown in the section titled
|
||||
"EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
|
||||
|
||||
o All of the accesses following the comparison are stores,
|
||||
- All of the accesses following the comparison are stores,
|
||||
so that a control dependency preserves the needed ordering.
|
||||
That said, it is easy to get control dependencies wrong.
|
||||
Please see the "CONTROL DEPENDENCIES" section of
|
||||
Documentation/memory-barriers.txt for more details.
|
||||
|
||||
o The pointers are not equal -and- the compiler does
|
||||
- The pointers are not equal -and- the compiler does
|
||||
not have enough information to deduce the value of the
|
||||
pointer. Note that the volatile cast in rcu_dereference()
|
||||
will normally prevent the compiler from knowing too much.
|
||||
@ -175,7 +178,7 @@ o Be very careful about comparing pointers obtained from
|
||||
comparison will provide exactly the information that the
|
||||
compiler needs to deduce the value of the pointer.
|
||||
|
||||
o Disable any value-speculation optimizations that your compiler
|
||||
- Disable any value-speculation optimizations that your compiler
|
||||
might provide, especially if you are making use of feedback-based
|
||||
optimizations that take data collected from prior runs. Such
|
||||
value-speculation optimizations reorder operations by design.
|
||||
@ -188,11 +191,12 @@ o Disable any value-speculation optimizations that your compiler
|
||||
|
||||
|
||||
EXAMPLE OF AMPLIFIED RCU-USAGE BUG
|
||||
----------------------------------
|
||||
|
||||
Because updaters can run concurrently with RCU readers, RCU readers can
|
||||
see stale and/or inconsistent values. If RCU readers need fresh or
|
||||
consistent values, which they sometimes do, they need to take proper
|
||||
precautions. To see this, consider the following code fragment:
|
||||
precautions. To see this, consider the following code fragment::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@ -244,7 +248,7 @@ to some reordering from the compiler and CPUs is beside the point.
|
||||
|
||||
But suppose that the reader needs a consistent view?
|
||||
|
||||
Then one approach is to use locking, for example, as follows:
|
||||
Then one approach is to use locking, for example, as follows::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@ -299,6 +303,7 @@ As always, use the right tool for the job!
|
||||
|
||||
|
||||
EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
|
||||
-----------------------------------------
|
||||
|
||||
If a pointer obtained from rcu_dereference() compares not-equal to some
|
||||
other pointer, the compiler normally has no clue what the value of the
|
||||
@ -308,7 +313,7 @@ guarantees that RCU depends on. And the volatile cast in rcu_dereference()
|
||||
should prevent the compiler from guessing the value.
|
||||
|
||||
But without rcu_dereference(), the compiler knows more than you might
|
||||
expect. Consider the following code fragment:
|
||||
expect. Consider the following code fragment::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@ -354,6 +359,7 @@ dereference the resulting pointer.
|
||||
|
||||
|
||||
WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
|
||||
------------------------------------------------------------
|
||||
|
||||
First, please avoid using rcu_dereference_raw() and also please avoid
|
||||
using rcu_dereference_check() and rcu_dereference_protected() with a
|
||||
@ -370,7 +376,7 @@ member of the rcu_dereference() to use in various situations:
|
||||
|
||||
2. If the access might be within an RCU read-side critical section
|
||||
on the one hand, or protected by (say) my_lock on the other,
|
||||
use rcu_dereference_check(), for example:
|
||||
use rcu_dereference_check(), for example::
|
||||
|
||||
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock));
|
||||
@ -378,14 +384,14 @@ member of the rcu_dereference() to use in various situations:
|
||||
|
||||
3. If the access might be within an RCU read-side critical section
|
||||
on the one hand, or protected by either my_lock or your_lock on
|
||||
the other, again use rcu_dereference_check(), for example:
|
||||
the other, again use rcu_dereference_check(), for example::
|
||||
|
||||
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock) ||
|
||||
lockdep_is_held(&your_lock));
|
||||
|
||||
4. If the access is on the update side, so that it is always protected
|
||||
by my_lock, use rcu_dereference_protected():
|
||||
by my_lock, use rcu_dereference_protected()::
|
||||
|
||||
p1 = rcu_dereference_protected(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock));
|
||||
@ -410,18 +416,19 @@ member of the rcu_dereference() to use in various situations:
|
||||
|
||||
|
||||
SPARSE CHECKING OF RCU-PROTECTED POINTERS
|
||||
-----------------------------------------
|
||||
|
||||
The sparse static-analysis tool checks for direct access to RCU-protected
|
||||
pointers, which can result in "interesting" bugs due to compiler
|
||||
optimizations involving invented loads and perhaps also load tearing.
|
||||
For example, suppose someone mistakenly does something like this:
|
||||
For example, suppose someone mistakenly does something like this::
|
||||
|
||||
p = q->rcu_protected_pointer;
|
||||
do_something_with(p->a);
|
||||
do_something_else_with(p->b);
|
||||
|
||||
If register pressure is high, the compiler might optimize "p" out
|
||||
of existence, transforming the code to something like this:
|
||||
of existence, transforming the code to something like this::
|
||||
|
||||
do_something_with(q->rcu_protected_pointer->a);
|
||||
do_something_else_with(q->rcu_protected_pointer->b);
|
||||
@ -435,7 +442,7 @@ Load tearing could of course result in dereferencing a mashup of a pair
|
||||
of pointers, which also might fatally disappoint your code.
|
||||
|
||||
These problems could have been avoided simply by making the code instead
|
||||
read as follows:
|
||||
read as follows::
|
||||
|
||||
p = rcu_dereference(q->rcu_protected_pointer);
|
||||
do_something_with(p->a);
|
||||
@ -448,7 +455,7 @@ or as a formal parameter, with "__rcu", which tells sparse to complain if
|
||||
this pointer is accessed directly. It will also cause sparse to complain
|
||||
if a pointer not marked with "__rcu" is accessed using rcu_dereference()
|
||||
and friends. For example, ->rcu_protected_pointer might be declared as
|
||||
follows:
|
||||
follows::
|
||||
|
||||
struct foo __rcu *rcu_protected_pointer;
|
||||
|
@ -1,4 +1,7 @@
|
||||
.. _rcu_barrier:
|
||||
|
||||
RCU and Unloadable Modules
|
||||
==========================
|
||||
|
||||
[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
|
||||
|
||||
@ -21,7 +24,7 @@ given that readers might well leave absolutely no trace of their
|
||||
presence? There is a synchronize_rcu() primitive that blocks until all
|
||||
pre-existing readers have completed. An updater wishing to delete an
|
||||
element p from a linked list might do the following, while holding an
|
||||
appropriate lock, of course:
|
||||
appropriate lock, of course::
|
||||
|
||||
list_del_rcu(p);
|
||||
synchronize_rcu();
|
||||
@ -32,13 +35,13 @@ primitive must be used instead. This primitive takes a pointer to an
|
||||
rcu_head struct placed within the RCU-protected data structure and
|
||||
another pointer to a function that may be invoked later to free that
|
||||
structure. Code to delete an element p from the linked list from IRQ
|
||||
context might then be as follows:
|
||||
context might then be as follows::
|
||||
|
||||
list_del_rcu(p);
|
||||
call_rcu(&p->rcu, p_callback);
|
||||
|
||||
Since call_rcu() never blocks, this code can safely be used from within
|
||||
IRQ context. The function p_callback() might be defined as follows:
|
||||
IRQ context. The function p_callback() might be defined as follows::
|
||||
|
||||
static void p_callback(struct rcu_head *rp)
|
||||
{
|
||||
@ -49,6 +52,7 @@ IRQ context. The function p_callback() might be defined as follows:
|
||||
|
||||
|
||||
Unloading Modules That Use call_rcu()
|
||||
-------------------------------------
|
||||
|
||||
But what if p_callback is defined in an unloadable module?
|
||||
|
||||
@ -69,10 +73,11 @@ in realtime kernels in order to avoid excessive scheduling latencies.
|
||||
|
||||
|
||||
rcu_barrier()
|
||||
-------------
|
||||
|
||||
We instead need the rcu_barrier() primitive. Rather than waiting for
|
||||
a grace period to elapse, rcu_barrier() waits for all outstanding RCU
|
||||
callbacks to complete. Please note that rcu_barrier() does -not- imply
|
||||
callbacks to complete. Please note that rcu_barrier() does **not** imply
|
||||
synchronize_rcu(), in particular, if there are no RCU callbacks queued
|
||||
anywhere, rcu_barrier() is within its rights to return immediately,
|
||||
without waiting for a grace period to elapse.
|
||||
@ -88,79 +93,79 @@ must match the flavor of rcu_barrier() with that of call_rcu(). If your
|
||||
module uses multiple flavors of call_rcu(), then it must also use multiple
|
||||
flavors of rcu_barrier() when unloading that module. For example, if
|
||||
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
|
||||
srcu_struct_2(), then the following three lines of code will be required
|
||||
when unloading:
|
||||
srcu_struct_2, then the following three lines of code will be required
|
||||
when unloading::
|
||||
|
||||
1 rcu_barrier();
|
||||
2 srcu_barrier(&srcu_struct_1);
|
||||
3 srcu_barrier(&srcu_struct_2);
|
||||
|
||||
The rcutorture module makes use of rcu_barrier() in its exit function
|
||||
as follows:
|
||||
as follows::
|
||||
|
||||
1 static void
|
||||
2 rcu_torture_cleanup(void)
|
||||
3 {
|
||||
4 int i;
|
||||
1 static void
|
||||
2 rcu_torture_cleanup(void)
|
||||
3 {
|
||||
4 int i;
|
||||
5
|
||||
6 fullstop = 1;
|
||||
7 if (shuffler_task != NULL) {
|
||||
6 fullstop = 1;
|
||||
7 if (shuffler_task != NULL) {
|
||||
8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
|
||||
9 kthread_stop(shuffler_task);
|
||||
10 }
|
||||
11 shuffler_task = NULL;
|
||||
12
|
||||
13 if (writer_task != NULL) {
|
||||
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
|
||||
15 kthread_stop(writer_task);
|
||||
16 }
|
||||
17 writer_task = NULL;
|
||||
18
|
||||
19 if (reader_tasks != NULL) {
|
||||
20 for (i = 0; i < nrealreaders; i++) {
|
||||
21 if (reader_tasks[i] != NULL) {
|
||||
22 VERBOSE_PRINTK_STRING(
|
||||
23 "Stopping rcu_torture_reader task");
|
||||
24 kthread_stop(reader_tasks[i]);
|
||||
25 }
|
||||
26 reader_tasks[i] = NULL;
|
||||
27 }
|
||||
28 kfree(reader_tasks);
|
||||
29 reader_tasks = NULL;
|
||||
30 }
|
||||
31 rcu_torture_current = NULL;
|
||||
32
|
||||
33 if (fakewriter_tasks != NULL) {
|
||||
34 for (i = 0; i < nfakewriters; i++) {
|
||||
35 if (fakewriter_tasks[i] != NULL) {
|
||||
36 VERBOSE_PRINTK_STRING(
|
||||
37 "Stopping rcu_torture_fakewriter task");
|
||||
38 kthread_stop(fakewriter_tasks[i]);
|
||||
39 }
|
||||
40 fakewriter_tasks[i] = NULL;
|
||||
41 }
|
||||
42 kfree(fakewriter_tasks);
|
||||
43 fakewriter_tasks = NULL;
|
||||
44 }
|
||||
45
|
||||
46 if (stats_task != NULL) {
|
||||
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
|
||||
48 kthread_stop(stats_task);
|
||||
49 }
|
||||
50 stats_task = NULL;
|
||||
51
|
||||
52 /* Wait for all RCU callbacks to fire. */
|
||||
53 rcu_barrier();
|
||||
54
|
||||
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
|
||||
56
|
||||
57 if (cur_ops->cleanup != NULL)
|
||||
58 cur_ops->cleanup();
|
||||
59 if (atomic_read(&n_rcu_torture_error))
|
||||
60 rcu_torture_print_module_parms("End of test: FAILURE");
|
||||
61 else
|
||||
62 rcu_torture_print_module_parms("End of test: SUCCESS");
|
||||
63 }
|
||||
10 }
|
||||
11 shuffler_task = NULL;
|
||||
12
|
||||
13 if (writer_task != NULL) {
|
||||
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
|
||||
15 kthread_stop(writer_task);
|
||||
16 }
|
||||
17 writer_task = NULL;
|
||||
18
|
||||
19 if (reader_tasks != NULL) {
|
||||
20 for (i = 0; i < nrealreaders; i++) {
|
||||
21 if (reader_tasks[i] != NULL) {
|
||||
22 VERBOSE_PRINTK_STRING(
|
||||
23 "Stopping rcu_torture_reader task");
|
||||
24 kthread_stop(reader_tasks[i]);
|
||||
25 }
|
||||
26 reader_tasks[i] = NULL;
|
||||
27 }
|
||||
28 kfree(reader_tasks);
|
||||
29 reader_tasks = NULL;
|
||||
30 }
|
||||
31 rcu_torture_current = NULL;
|
||||
32
|
||||
33 if (fakewriter_tasks != NULL) {
|
||||
34 for (i = 0; i < nfakewriters; i++) {
|
||||
35 if (fakewriter_tasks[i] != NULL) {
|
||||
36 VERBOSE_PRINTK_STRING(
|
||||
37 "Stopping rcu_torture_fakewriter task");
|
||||
38 kthread_stop(fakewriter_tasks[i]);
|
||||
39 }
|
||||
40 fakewriter_tasks[i] = NULL;
|
||||
41 }
|
||||
42 kfree(fakewriter_tasks);
|
||||
43 fakewriter_tasks = NULL;
|
||||
44 }
|
||||
45
|
||||
46 if (stats_task != NULL) {
|
||||
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
|
||||
48 kthread_stop(stats_task);
|
||||
49 }
|
||||
50 stats_task = NULL;
|
||||
51
|
||||
52 /* Wait for all RCU callbacks to fire. */
|
||||
53 rcu_barrier();
|
||||
54
|
||||
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
|
||||
56
|
||||
57 if (cur_ops->cleanup != NULL)
|
||||
58 cur_ops->cleanup();
|
||||
59 if (atomic_read(&n_rcu_torture_error))
|
||||
60 rcu_torture_print_module_parms("End of test: FAILURE");
|
||||
61 else
|
||||
62 rcu_torture_print_module_parms("End of test: SUCCESS");
|
||||
63 }
|
||||
|
||||
Line 6 sets a global variable that prevents any RCU callbacks from
|
||||
re-posting themselves. This will not be necessary in most cases, since
|
||||
@ -176,9 +181,14 @@ for any pre-existing callbacks to complete.
|
||||
Then lines 55-62 print status and do operation-specific cleanup, and
|
||||
then return, permitting the module-unload operation to be completed.
|
||||
|
||||
Quick Quiz #1: Is there any other situation where rcu_barrier() might
|
||||
.. _rcubarrier_quiz_1:
|
||||
|
||||
Quick Quiz #1:
|
||||
Is there any other situation where rcu_barrier() might
|
||||
be required?
|
||||
|
||||
:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>`
|
||||
|
||||
Your module might have additional complications. For example, if your
|
||||
module invokes call_rcu() from timers, you will need to first cancel all
|
||||
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
||||
@ -188,11 +198,12 @@ Of course, if your module uses call_rcu(), you will need to invoke
|
||||
rcu_barrier() before unloading. Similarly, if your module uses
|
||||
call_srcu(), you will need to invoke srcu_barrier() before unloading,
|
||||
and on the same srcu_struct structure. If your module uses call_rcu()
|
||||
-and- call_srcu(), then you will need to invoke rcu_barrier() -and-
|
||||
**and** call_srcu(), then you will need to invoke rcu_barrier() **and**
|
||||
srcu_barrier().
|
||||
|
||||
|
||||
Implementing rcu_barrier()
|
||||
--------------------------
|
||||
|
||||
Dipankar Sarma's implementation of rcu_barrier() makes use of the fact
|
||||
that RCU callbacks are never reordered once queued on one of the per-CPU
|
||||
@ -200,19 +211,19 @@ queues. His implementation queues an RCU callback on each of the per-CPU
|
||||
callback queues, and then waits until they have all started executing, at
|
||||
which point, all earlier RCU callbacks are guaranteed to have completed.
|
||||
|
||||
The original code for rcu_barrier() was as follows:
|
||||
The original code for rcu_barrier() was as follows::
|
||||
|
||||
1 void rcu_barrier(void)
|
||||
2 {
|
||||
3 BUG_ON(in_interrupt());
|
||||
4 /* Take cpucontrol mutex to protect against CPU hotplug */
|
||||
5 mutex_lock(&rcu_barrier_mutex);
|
||||
6 init_completion(&rcu_barrier_completion);
|
||||
7 atomic_set(&rcu_barrier_cpu_count, 0);
|
||||
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
||||
9 wait_for_completion(&rcu_barrier_completion);
|
||||
10 mutex_unlock(&rcu_barrier_mutex);
|
||||
11 }
|
||||
1 void rcu_barrier(void)
|
||||
2 {
|
||||
3 BUG_ON(in_interrupt());
|
||||
4 /* Take cpucontrol mutex to protect against CPU hotplug */
|
||||
5 mutex_lock(&rcu_barrier_mutex);
|
||||
6 init_completion(&rcu_barrier_completion);
|
||||
7 atomic_set(&rcu_barrier_cpu_count, 0);
|
||||
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
||||
9 wait_for_completion(&rcu_barrier_completion);
|
||||
10 mutex_unlock(&rcu_barrier_mutex);
|
||||
11 }
|
||||
|
||||
Line 3 verifies that the caller is in process context, and lines 5 and 10
|
||||
use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
|
||||
@ -226,18 +237,18 @@ This code was rewritten in 2008 and several times thereafter, but this
|
||||
still gives the general idea.
|
||||
|
||||
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
||||
to post an RCU callback, as follows:
|
||||
to post an RCU callback, as follows::
|
||||
|
||||
1 static void rcu_barrier_func(void *notused)
|
||||
2 {
|
||||
3 int cpu = smp_processor_id();
|
||||
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||
5 struct rcu_head *head;
|
||||
1 static void rcu_barrier_func(void *notused)
|
||||
2 {
|
||||
3 int cpu = smp_processor_id();
|
||||
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||
5 struct rcu_head *head;
|
||||
6
|
||||
7 head = &rdp->barrier;
|
||||
8 atomic_inc(&rcu_barrier_cpu_count);
|
||||
9 call_rcu(head, rcu_barrier_callback);
|
||||
10 }
|
||||
7 head = &rdp->barrier;
|
||||
8 atomic_inc(&rcu_barrier_cpu_count);
|
||||
9 call_rcu(head, rcu_barrier_callback);
|
||||
10 }
|
||||
|
||||
Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
|
||||
which contains the struct rcu_head that is needed for the later call to
|
||||
@ -248,20 +259,25 @@ the current CPU's queue.
|
||||
|
||||
The rcu_barrier_callback() function simply atomically decrements the
|
||||
rcu_barrier_cpu_count variable and finalizes the completion when it
|
||||
reaches zero, as follows:
|
||||
reaches zero, as follows::
|
||||
|
||||
1 static void rcu_barrier_callback(struct rcu_head *notused)
|
||||
2 {
|
||||
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||
4 complete(&rcu_barrier_completion);
|
||||
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||
4 complete(&rcu_barrier_completion);
|
||||
5 }
|
||||
|
||||
Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
|
||||
.. _rcubarrier_quiz_2:
|
||||
|
||||
Quick Quiz #2:
|
||||
What happens if CPU 0's rcu_barrier_func() executes
|
||||
immediately (thus incrementing rcu_barrier_cpu_count to the
|
||||
value one), but the other CPU's rcu_barrier_func() invocations
|
||||
are delayed for a full grace period? Couldn't this result in
|
||||
rcu_barrier() returning prematurely?
|
||||
|
||||
:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
|
||||
|
||||
The current rcu_barrier() implementation is more complex, due to the need
|
||||
to avoid disturbing idle CPUs (especially on battery-powered systems)
|
||||
and the need to minimally disturb non-idle CPUs in real-time systems.
|
||||
@ -269,6 +285,7 @@ However, the code above illustrates the concepts.
|
||||
|
||||
|
||||
rcu_barrier() Summary
|
||||
---------------------
|
||||
|
||||
The rcu_barrier() primitive has seen relatively little use, since most
|
||||
code using RCU is in the core kernel rather than in modules. However, if
|
||||
@ -277,8 +294,12 @@ so that your module may be safely unloaded.
|
||||
|
||||
|
||||
Answers to Quick Quizzes
|
||||
------------------------
|
||||
|
||||
Quick Quiz #1: Is there any other situation where rcu_barrier() might
|
||||
.. _answer_rcubarrier_quiz_1:
|
||||
|
||||
Quick Quiz #1:
|
||||
Is there any other situation where rcu_barrier() might
|
||||
be required?
|
||||
|
||||
Answer: Interestingly enough, rcu_barrier() was not originally
|
||||
@ -292,7 +313,12 @@ Answer: Interestingly enough, rcu_barrier() was not originally
|
||||
implementing rcutorture, and found that rcu_barrier() solves
|
||||
this problem as well.
|
||||
|
||||
Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
|
||||
:ref:`Back to Quick Quiz #1 <rcubarrier_quiz_1>`
|
||||
|
||||
.. _answer_rcubarrier_quiz_2:
|
||||
|
||||
Quick Quiz #2:
|
||||
What happens if CPU 0's rcu_barrier_func() executes
|
||||
immediately (thus incrementing rcu_barrier_cpu_count to the
|
||||
value one), but the other CPU's rcu_barrier_func() invocations
|
||||
are delayed for a full grace period? Couldn't this result in
|
||||
@ -323,3 +349,5 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last
|
||||
is to add an rcu_read_lock() before line 8 of rcu_barrier()
|
||||
and an rcu_read_unlock() after line 8 of this same function. If
|
||||
you can think of a better change, please let me know!
|
||||
|
||||
:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`
|
@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs
|
||||
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
|
||||
for each CPU:
|
||||
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
|
||||
|
||||
The "last_accelerate:" field prints the low-order 16 bits (in hex) of the
|
||||
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
|
||||
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
|
||||
rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback
|
||||
status, so that an "l" indicates that all callbacks were lazy at the start
|
||||
of the last idle period and an "L" indicates that there are currently
|
||||
no non-lazy callbacks (in both cases, "." is printed otherwise, as
|
||||
shown above) and "D" indicates that dyntick-idle processing is enabled
|
||||
("." is printed otherwise, for example, if disabled via the "nohz="
|
||||
kernel boot parameter).
|
||||
rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
|
||||
processing is enabled.
|
||||
|
||||
If the grace period ends just as the stall warning starts printing,
|
||||
there will be a spurious stall-warning message, which will include
|
||||
|
@ -1,15 +1,18 @@
|
||||
.. _whatisrcu_doc:
|
||||
|
||||
What is RCU? -- "Read, Copy, Update"
|
||||
======================================
|
||||
|
||||
Please note that the "What is RCU?" LWN series is an excellent place
|
||||
to start learning about RCU:
|
||||
|
||||
1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
|
||||
2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
|
||||
3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
|
||||
4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
|
||||
2010 Big API Table http://lwn.net/Articles/419086/
|
||||
5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
|
||||
2014 Big API Table http://lwn.net/Articles/609973/
|
||||
| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
|
||||
| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
|
||||
| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
|
||||
| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
|
||||
| 2010 Big API Table http://lwn.net/Articles/419086/
|
||||
| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
|
||||
| 2014 Big API Table http://lwn.net/Articles/609973/
|
||||
|
||||
|
||||
What is RCU?
|
||||
@ -24,14 +27,21 @@ the experience has been that different people must take different paths
|
||||
to arrive at an understanding of RCU. This document provides several
|
||||
different paths, as follows:
|
||||
|
||||
1. RCU OVERVIEW
|
||||
2. WHAT IS RCU'S CORE API?
|
||||
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
|
||||
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
|
||||
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
|
||||
6. ANALOGY WITH READER-WRITER LOCKING
|
||||
7. FULL LIST OF RCU APIs
|
||||
8. ANSWERS TO QUICK QUIZZES
|
||||
:ref:`1. RCU OVERVIEW <1_whatisRCU>`
|
||||
|
||||
:ref:`2. WHAT IS RCU'S CORE API? <2_whatisRCU>`
|
||||
|
||||
:ref:`3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>`
|
||||
|
||||
:ref:`4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>`
|
||||
|
||||
:ref:`5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>`
|
||||
|
||||
:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>`
|
||||
|
||||
:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>`
|
||||
|
||||
:ref:`8. ANSWERS TO QUICK QUIZZES <8_whatisRCU>`
|
||||
|
||||
People who prefer starting with a conceptual overview should focus on
|
||||
Section 1, though most readers will profit by reading this section at
|
||||
@ -49,8 +59,10 @@ everything, feel free to read the whole thing -- but if you are really
|
||||
that type of person, you have perused the source code and will therefore
|
||||
never need this document anyway. ;-)
|
||||
|
||||
.. _1_whatisRCU:
|
||||
|
||||
1. RCU OVERVIEW
|
||||
----------------
|
||||
|
||||
The basic idea behind RCU is to split updates into "removal" and
|
||||
"reclamation" phases. The removal phase removes references to data items
|
||||
@ -116,8 +128,10 @@ So how the heck can a reclaimer tell when a reader is done, given
|
||||
that readers are not doing any sort of synchronization operations???
|
||||
Read on to learn about how RCU's API makes this easy.
|
||||
|
||||
.. _2_whatisRCU:
|
||||
|
||||
2. WHAT IS RCU'S CORE API?
|
||||
---------------------------
|
||||
|
||||
The core RCU API is quite small:
|
||||
|
||||
@ -136,7 +150,7 @@ later. See the kernel docbook documentation for more info, or look directly
|
||||
at the function header comments.
|
||||
|
||||
rcu_read_lock()
|
||||
|
||||
^^^^^^^^^^^^^^^
|
||||
void rcu_read_lock(void);
|
||||
|
||||
Used by a reader to inform the reclaimer that the reader is
|
||||
@ -150,7 +164,7 @@ rcu_read_lock()
|
||||
longer-term references to data structures.
|
||||
|
||||
rcu_read_unlock()
|
||||
|
||||
^^^^^^^^^^^^^^^^^
|
||||
void rcu_read_unlock(void);
|
||||
|
||||
Used by a reader to inform the reclaimer that the reader is
|
||||
@ -158,15 +172,15 @@ rcu_read_unlock()
|
||||
read-side critical sections may be nested and/or overlapping.
|
||||
|
||||
synchronize_rcu()
|
||||
|
||||
^^^^^^^^^^^^^^^^^
|
||||
void synchronize_rcu(void);
|
||||
|
||||
Marks the end of updater code and the beginning of reclaimer
|
||||
code. It does this by blocking until all pre-existing RCU
|
||||
read-side critical sections on all CPUs have completed.
|
||||
Note that synchronize_rcu() will -not- necessarily wait for
|
||||
Note that synchronize_rcu() will **not** necessarily wait for
|
||||
any subsequent RCU read-side critical sections to complete.
|
||||
For example, consider the following sequence of events:
|
||||
For example, consider the following sequence of events::
|
||||
|
||||
CPU 0 CPU 1 CPU 2
|
||||
----------------- ------------------------- ---------------
|
||||
@ -182,7 +196,7 @@ synchronize_rcu()
|
||||
any that begin after synchronize_rcu() is invoked.
|
||||
|
||||
Of course, synchronize_rcu() does not necessarily return
|
||||
-immediately- after the last pre-existing RCU read-side critical
|
||||
**immediately** after the last pre-existing RCU read-side critical
|
||||
section completes. For one thing, there might well be scheduling
|
||||
delays. For another thing, many RCU implementations process
|
||||
requests in batches in order to improve efficiencies, which can
|
||||
@ -211,10 +225,10 @@ synchronize_rcu()
|
||||
checklist.txt for some approaches to limiting the update rate.
|
||||
|
||||
rcu_assign_pointer()
|
||||
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
void rcu_assign_pointer(p, typeof(p) v);
|
||||
|
||||
Yes, rcu_assign_pointer() -is- implemented as a macro, though it
|
||||
Yes, rcu_assign_pointer() **is** implemented as a macro, though it
|
||||
would be cool to be able to declare a function in this manner.
|
||||
(Compiler experts will no doubt disagree.)
|
||||
|
||||
@ -231,7 +245,7 @@ rcu_assign_pointer()
|
||||
the _rcu list-manipulation primitives such as list_add_rcu().
|
||||
|
||||
rcu_dereference()
|
||||
|
||||
^^^^^^^^^^^^^^^^^
|
||||
typeof(p) rcu_dereference(p);
|
||||
|
||||
Like rcu_assign_pointer(), rcu_dereference() must be implemented
|
||||
@ -248,13 +262,13 @@ rcu_dereference()
|
||||
|
||||
Common coding practice uses rcu_dereference() to copy an
|
||||
RCU-protected pointer to a local variable, then dereferences
|
||||
this local variable, for example as follows:
|
||||
this local variable, for example as follows::
|
||||
|
||||
p = rcu_dereference(head.next);
|
||||
return p->data;
|
||||
|
||||
However, in this case, one could just as easily combine these
|
||||
into one statement:
|
||||
into one statement::
|
||||
|
||||
return rcu_dereference(head.next)->data;
|
||||
|
||||
@ -266,8 +280,8 @@ rcu_dereference()
|
||||
unnecessary overhead on Alpha CPUs.
|
||||
|
||||
Note that the value returned by rcu_dereference() is valid
|
||||
only within the enclosing RCU read-side critical section [1].
|
||||
For example, the following is -not- legal:
|
||||
only within the enclosing RCU read-side critical section [1]_.
|
||||
For example, the following is **not** legal::
|
||||
|
||||
rcu_read_lock();
|
||||
p = rcu_dereference(head.next);
|
||||
@ -290,9 +304,9 @@ rcu_dereference()
|
||||
at any time, including immediately after the rcu_dereference().
|
||||
And, again like rcu_assign_pointer(), rcu_dereference() is
|
||||
typically used indirectly, via the _rcu list-manipulation
|
||||
primitives, such as list_for_each_entry_rcu() [2].
|
||||
primitives, such as list_for_each_entry_rcu() [2]_.
|
||||
|
||||
[1] The variant rcu_dereference_protected() can be used outside
|
||||
.. [1] The variant rcu_dereference_protected() can be used outside
|
||||
of an RCU read-side critical section as long as the usage is
|
||||
protected by locks acquired by the update-side code. This variant
|
||||
avoids the lockdep warning that would happen when using (for
|
||||
@ -305,7 +319,7 @@ rcu_dereference()
|
||||
a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
|
||||
and the API's code comments for more details and example usage.
|
||||
|
||||
[2] If the list_for_each_entry_rcu() instance might be used by
|
||||
.. [2] If the list_for_each_entry_rcu() instance might be used by
|
||||
update-side code as well as by RCU readers, then an additional
|
||||
lockdep expression can be added to its list of arguments.
|
||||
For example, given an additional "lock_is_held(&mylock)" argument,
|
||||
@ -315,6 +329,7 @@ rcu_dereference()
|
||||
|
||||
The following diagram shows how each API communicates among the
|
||||
reader, updater, and reclaimer.
|
||||
::
|
||||
|
||||
|
||||
rcu_assign_pointer()
|
||||
@ -375,12 +390,16 @@ c. RCU applied to scheduler and interrupt/NMI-handler tasks.
|
||||
Again, most uses will be of (a). The (b) and (c) cases are important
|
||||
for specialized uses, but are relatively uncommon.
|
||||
|
||||
.. _3_whatisRCU:
|
||||
|
||||
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
|
||||
-----------------------------------------------
|
||||
|
||||
This section shows a simple use of the core RCU API to protect a
|
||||
global pointer to a dynamically allocated structure. More-typical
|
||||
uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
|
||||
uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`,
|
||||
:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`.
|
||||
::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@ -440,40 +459,43 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
|
||||
|
||||
So, to sum up:
|
||||
|
||||
o Use rcu_read_lock() and rcu_read_unlock() to guard RCU
|
||||
- Use rcu_read_lock() and rcu_read_unlock() to guard RCU
|
||||
read-side critical sections.
|
||||
|
||||
o Within an RCU read-side critical section, use rcu_dereference()
|
||||
- Within an RCU read-side critical section, use rcu_dereference()
|
||||
to dereference RCU-protected pointers.
|
||||
|
||||
o Use some solid scheme (such as locks or semaphores) to
|
||||
- Use some solid scheme (such as locks or semaphores) to
|
||||
keep concurrent updates from interfering with each other.
|
||||
|
||||
o Use rcu_assign_pointer() to update an RCU-protected pointer.
|
||||
- Use rcu_assign_pointer() to update an RCU-protected pointer.
|
||||
This primitive protects concurrent readers from the updater,
|
||||
-not- concurrent updates from each other! You therefore still
|
||||
**not** concurrent updates from each other! You therefore still
|
||||
need to use locking (or something similar) to keep concurrent
|
||||
rcu_assign_pointer() primitives from interfering with each other.
|
||||
|
||||
o Use synchronize_rcu() -after- removing a data element from an
|
||||
RCU-protected data structure, but -before- reclaiming/freeing
|
||||
- Use synchronize_rcu() **after** removing a data element from an
|
||||
RCU-protected data structure, but **before** reclaiming/freeing
|
||||
the data element, in order to wait for the completion of all
|
||||
RCU read-side critical sections that might be referencing that
|
||||
data item.
|
||||
|
||||
See checklist.txt for additional rules to follow when using RCU.
|
||||
And again, more-typical uses of RCU may be found in listRCU.txt,
|
||||
arrayRCU.txt, and NMI-RCU.txt.
|
||||
And again, more-typical uses of RCU may be found in :ref:`listRCU.rst
|
||||
<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst
|
||||
<NMI_rcu_doc>`.
|
||||
|
||||
.. _4_whatisRCU:
|
||||
|
||||
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
|
||||
--------------------------------------------
|
||||
|
||||
In the example above, foo_update_a() blocks until a grace period elapses.
|
||||
This is quite simple, but in some cases one cannot afford to wait so
|
||||
long -- there might be other high-priority work to be done.
|
||||
|
||||
In such cases, one uses call_rcu() rather than synchronize_rcu().
|
||||
The call_rcu() API is as follows:
|
||||
The call_rcu() API is as follows::
|
||||
|
||||
void call_rcu(struct rcu_head * head,
|
||||
void (*func)(struct rcu_head *head));
|
||||
@ -481,7 +503,7 @@ The call_rcu() API is as follows:
|
||||
This function invokes func(head) after a grace period has elapsed.
|
||||
This invocation might happen from either softirq or process context,
|
||||
so the function is not permitted to block. The foo struct needs to
|
||||
have an rcu_head structure added, perhaps as follows:
|
||||
have an rcu_head structure added, perhaps as follows::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@ -490,7 +512,7 @@ have an rcu_head structure added, perhaps as follows:
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
The foo_update_a() function might then be written as follows:
|
||||
The foo_update_a() function might then be written as follows::
|
||||
|
||||
/*
|
||||
* Create a new struct foo that is the same as the one currently
|
||||
@ -520,7 +542,7 @@ The foo_update_a() function might then be written as follows:
|
||||
call_rcu(&old_fp->rcu, foo_reclaim);
|
||||
}
|
||||
|
||||
The foo_reclaim() function might appear as follows:
|
||||
The foo_reclaim() function might appear as follows::
|
||||
|
||||
void foo_reclaim(struct rcu_head *rp)
|
||||
{
|
||||
@ -544,7 +566,7 @@ namely foo_reclaim().
|
||||
The summary of advice is the same as for the previous section, except
|
||||
that we are now using call_rcu() rather than synchronize_rcu():
|
||||
|
||||
o Use call_rcu() -after- removing a data element from an
|
||||
- Use call_rcu() **after** removing a data element from an
|
||||
RCU-protected data structure in order to register a callback
|
||||
function that will be invoked after the completion of all RCU
|
||||
read-side critical sections that might be referencing that
|
||||
@ -552,14 +574,16 @@ o Use call_rcu() -after- removing a data element from an
|
||||
|
||||
If the callback for call_rcu() is not doing anything more than calling
|
||||
kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
|
||||
to avoid having to write your own callback:
|
||||
to avoid having to write your own callback::
|
||||
|
||||
kfree_rcu(old_fp, rcu);
|
||||
|
||||
Again, see checklist.txt for additional rules governing the use of RCU.
|
||||
|
||||
.. _5_whatisRCU:
|
||||
|
||||
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
|
||||
------------------------------------------------
|
||||
|
||||
One of the nice things about RCU is that it has extremely simple "toy"
|
||||
implementations that are a good first step towards understanding the
|
||||
@ -579,7 +603,7 @@ more details on the current implementation as of early 2004.
|
||||
|
||||
|
||||
5A. "TOY" IMPLEMENTATION #1: LOCKING
|
||||
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
This section presents a "toy" RCU implementation that is based on
|
||||
familiar locking primitives. Its overhead makes it a non-starter for
|
||||
real-life use, as does its lack of scalability. It is also unsuitable
|
||||
@ -591,7 +615,7 @@ you allow nested rcu_read_lock() calls, you can deadlock.
|
||||
However, it is probably the easiest implementation to relate to, so is
|
||||
a good starting point.
|
||||
|
||||
It is extremely simple:
|
||||
It is extremely simple::
|
||||
|
||||
static DEFINE_RWLOCK(rcu_gp_mutex);
|
||||
|
||||
@ -614,7 +638,7 @@ It is extremely simple:
|
||||
|
||||
[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
|
||||
much. But here are simplified versions anyway. And whatever you do,
|
||||
don't forget about them when submitting patches making use of RCU!]
|
||||
don't forget about them when submitting patches making use of RCU!]::
|
||||
|
||||
#define rcu_assign_pointer(p, v) \
|
||||
({ \
|
||||
@ -647,18 +671,23 @@ that the only thing that can block rcu_read_lock() is a synchronize_rcu().
|
||||
But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
|
||||
so there can be no deadlock cycle.
|
||||
|
||||
Quick Quiz #1: Why is this argument naive? How could a deadlock
|
||||
.. _quiz_1:
|
||||
|
||||
Quick Quiz #1:
|
||||
Why is this argument naive? How could a deadlock
|
||||
occur when using this algorithm in a real-world Linux
|
||||
kernel? How could this deadlock be avoided?
|
||||
|
||||
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||
|
||||
5B. "TOY" EXAMPLE #2: CLASSIC RCU
|
||||
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
This section presents a "toy" RCU implementation that is based on
|
||||
"classic RCU". It is also short on performance (but only for updates) and
|
||||
on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
|
||||
kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
|
||||
are the same as those shown in the preceding section, so they are omitted.
|
||||
::
|
||||
|
||||
void rcu_read_lock(void) { }
|
||||
|
||||
@ -683,14 +712,14 @@ CPU in turn. The run_on() primitive can be implemented straightforwardly
|
||||
in terms of the sched_setaffinity() primitive. Of course, a somewhat less
|
||||
"toy" implementation would restore the affinity upon completion rather
|
||||
than just leaving all tasks running on the last CPU, but when I said
|
||||
"toy", I meant -toy-!
|
||||
"toy", I meant **toy**!
|
||||
|
||||
So how the heck is this supposed to work???
|
||||
|
||||
Remember that it is illegal to block while in an RCU read-side critical
|
||||
section. Therefore, if a given CPU executes a context switch, we know
|
||||
that it must have completed all preceding RCU read-side critical sections.
|
||||
Once -all- CPUs have executed a context switch, then -all- preceding
|
||||
Once **all** CPUs have executed a context switch, then **all** preceding
|
||||
RCU read-side critical sections will have completed.
|
||||
|
||||
So, suppose that we remove a data item from its structure and then invoke
|
||||
@ -698,19 +727,32 @@ synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed
|
||||
that there are no RCU read-side critical sections holding a reference
|
||||
to that data item, so we can safely reclaim it.
|
||||
|
||||
Quick Quiz #2: Give an example where Classic RCU's read-side
|
||||
overhead is -negative-.
|
||||
.. _quiz_2:
|
||||
|
||||
Quick Quiz #3: If it is illegal to block in an RCU read-side
|
||||
Quick Quiz #2:
|
||||
Give an example where Classic RCU's read-side
|
||||
overhead is **negative**.
|
||||
|
||||
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||
|
||||
.. _quiz_3:
|
||||
|
||||
Quick Quiz #3:
|
||||
If it is illegal to block in an RCU read-side
|
||||
critical section, what the heck do you do in
|
||||
PREEMPT_RT, where normal spinlocks can block???
|
||||
|
||||
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||
|
||||
.. _6_whatisRCU:
|
||||
|
||||
6. ANALOGY WITH READER-WRITER LOCKING
|
||||
--------------------------------------
|
||||
|
||||
Although RCU can be used in many different ways, a very common use of
|
||||
RCU is analogous to reader-writer locking. The following unified
|
||||
diff shows how closely related RCU and reader-writer locking can be.
|
||||
::
|
||||
|
||||
@@ -5,5 +5,5 @@ struct el {
|
||||
int data;
|
||||
@ -762,7 +804,7 @@ diff shows how closely related RCU and reader-writer locking can be.
|
||||
return 0;
|
||||
}
|
||||
|
||||
Or, for those who prefer a side-by-side listing:
|
||||
Or, for those who prefer a side-by-side listing::
|
||||
|
||||
1 struct el { 1 struct el {
|
||||
2 struct list_head list; 2 struct list_head list;
|
||||
@ -774,40 +816,44 @@ Or, for those who prefer a side-by-side listing:
|
||||
8 rwlock_t listmutex; 8 spinlock_t listmutex;
|
||||
9 struct el head; 9 struct el head;
|
||||
|
||||
1 int search(long key, int *result) 1 int search(long key, int *result)
|
||||
2 { 2 {
|
||||
3 struct list_head *lp; 3 struct list_head *lp;
|
||||
4 struct el *p; 4 struct el *p;
|
||||
5 5
|
||||
6 read_lock(&listmutex); 6 rcu_read_lock();
|
||||
7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
|
||||
8 if (p->key == key) { 8 if (p->key == key) {
|
||||
9 *result = p->data; 9 *result = p->data;
|
||||
10 read_unlock(&listmutex); 10 rcu_read_unlock();
|
||||
11 return 1; 11 return 1;
|
||||
12 } 12 }
|
||||
13 } 13 }
|
||||
14 read_unlock(&listmutex); 14 rcu_read_unlock();
|
||||
15 return 0; 15 return 0;
|
||||
16 } 16 }
|
||||
::
|
||||
|
||||
1 int delete(long key) 1 int delete(long key)
|
||||
2 { 2 {
|
||||
3 struct el *p; 3 struct el *p;
|
||||
4 4
|
||||
5 write_lock(&listmutex); 5 spin_lock(&listmutex);
|
||||
6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
|
||||
7 if (p->key == key) { 7 if (p->key == key) {
|
||||
8 list_del(&p->list); 8 list_del_rcu(&p->list);
|
||||
9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
|
||||
10 synchronize_rcu();
|
||||
10 kfree(p); 11 kfree(p);
|
||||
11 return 1; 12 return 1;
|
||||
12 } 13 }
|
||||
13 } 14 }
|
||||
14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
|
||||
15 return 0; 16 return 0;
|
||||
16 } 17 }
|
||||
1 int search(long key, int *result) 1 int search(long key, int *result)
|
||||
2 { 2 {
|
||||
3 struct list_head *lp; 3 struct list_head *lp;
|
||||
4 struct el *p; 4 struct el *p;
|
||||
5 5
|
||||
6 read_lock(&listmutex); 6 rcu_read_lock();
|
||||
7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
|
||||
8 if (p->key == key) { 8 if (p->key == key) {
|
||||
9 *result = p->data; 9 *result = p->data;
|
||||
10 read_unlock(&listmutex); 10 rcu_read_unlock();
|
||||
11 return 1; 11 return 1;
|
||||
12 } 12 }
|
||||
13 } 13 }
|
||||
14 read_unlock(&listmutex); 14 rcu_read_unlock();
|
||||
15 return 0; 15 return 0;
|
||||
16 } 16 }
|
||||
|
||||
::
|
||||
|
||||
1 int delete(long key) 1 int delete(long key)
|
||||
2 { 2 {
|
||||
3 struct el *p; 3 struct el *p;
|
||||
4 4
|
||||
5 write_lock(&listmutex); 5 spin_lock(&listmutex);
|
||||
6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
|
||||
7 if (p->key == key) { 7 if (p->key == key) {
|
||||
8 list_del(&p->list); 8 list_del_rcu(&p->list);
|
||||
9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
|
||||
10 synchronize_rcu();
|
||||
10 kfree(p); 11 kfree(p);
|
||||
11 return 1; 12 return 1;
|
||||
12 } 13 }
|
||||
13 } 14 }
|
||||
14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
|
||||
15 return 0; 16 return 0;
|
||||
16 } 17 }
|
||||
|
||||
Either way, the differences are quite small. Read-side locking moves
|
||||
to rcu_read_lock() and rcu_read_unlock(), update-side locking moves from
|
||||
@ -825,22 +871,27 @@ delete() can now block. If this is a problem, there is a callback-based
|
||||
mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
|
||||
be used in place of synchronize_rcu().
|
||||
|
||||
.. _7_whatisRCU:
|
||||
|
||||
7. FULL LIST OF RCU APIs
|
||||
-------------------------
|
||||
|
||||
The RCU APIs are documented in docbook-format header comments in the
|
||||
Linux-kernel source code, but it helps to have a full list of the
|
||||
APIs, since there does not appear to be a way to categorize them
|
||||
in docbook. Here is the list, by category.
|
||||
|
||||
RCU list traversal:
|
||||
RCU list traversal::
|
||||
|
||||
list_entry_rcu
|
||||
list_entry_lockless
|
||||
list_first_entry_rcu
|
||||
list_next_rcu
|
||||
list_for_each_entry_rcu
|
||||
list_for_each_entry_continue_rcu
|
||||
list_for_each_entry_from_rcu
|
||||
list_first_or_null_rcu
|
||||
list_next_or_null_rcu
|
||||
hlist_first_rcu
|
||||
hlist_next_rcu
|
||||
hlist_pprev_rcu
|
||||
@ -854,7 +905,7 @@ RCU list traversal:
|
||||
hlist_bl_first_rcu
|
||||
hlist_bl_for_each_entry_rcu
|
||||
|
||||
RCU pointer/list update:
|
||||
RCU pointer/list update::
|
||||
|
||||
rcu_assign_pointer
|
||||
list_add_rcu
|
||||
@ -864,10 +915,12 @@ RCU pointer/list update:
|
||||
hlist_add_behind_rcu
|
||||
hlist_add_before_rcu
|
||||
hlist_add_head_rcu
|
||||
hlist_add_tail_rcu
|
||||
hlist_del_rcu
|
||||
hlist_del_init_rcu
|
||||
hlist_replace_rcu
|
||||
list_splice_init_rcu()
|
||||
list_splice_init_rcu
|
||||
list_splice_tail_init_rcu
|
||||
hlist_nulls_del_init_rcu
|
||||
hlist_nulls_del_rcu
|
||||
hlist_nulls_add_head_rcu
|
||||
@ -876,7 +929,9 @@ RCU pointer/list update:
|
||||
hlist_bl_del_rcu
|
||||
hlist_bl_set_first_rcu
|
||||
|
||||
RCU: Critical sections Grace period Barrier
|
||||
RCU::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock synchronize_net rcu_barrier
|
||||
rcu_read_unlock synchronize_rcu
|
||||
@ -885,7 +940,9 @@ RCU: Critical sections Grace period Barrier
|
||||
rcu_dereference_check kfree_rcu
|
||||
rcu_dereference_protected
|
||||
|
||||
bh: Critical sections Grace period Barrier
|
||||
bh::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_bh call_rcu rcu_barrier
|
||||
rcu_read_unlock_bh synchronize_rcu
|
||||
@ -896,7 +953,9 @@ bh: Critical sections Grace period Barrier
|
||||
rcu_dereference_bh_protected
|
||||
rcu_read_lock_bh_held
|
||||
|
||||
sched: Critical sections Grace period Barrier
|
||||
sched::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_sched call_rcu rcu_barrier
|
||||
rcu_read_unlock_sched synchronize_rcu
|
||||
@ -910,7 +969,9 @@ sched: Critical sections Grace period Barrier
|
||||
rcu_read_lock_sched_held
|
||||
|
||||
|
||||
SRCU: Critical sections Grace period Barrier
|
||||
SRCU::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
srcu_read_lock call_srcu srcu_barrier
|
||||
srcu_read_unlock synchronize_srcu
|
||||
@ -918,13 +979,14 @@ SRCU: Critical sections Grace period Barrier
|
||||
srcu_dereference_check
|
||||
srcu_read_lock_held
|
||||
|
||||
SRCU: Initialization/cleanup
|
||||
SRCU: Initialization/cleanup::
|
||||
|
||||
DEFINE_SRCU
|
||||
DEFINE_STATIC_SRCU
|
||||
init_srcu_struct
|
||||
cleanup_srcu_struct
|
||||
|
||||
All: lockdep-checked RCU-protected pointer access
|
||||
All: lockdep-checked RCU-protected pointer access::
|
||||
|
||||
rcu_access_pointer
|
||||
rcu_dereference_raw
|
||||
@ -974,15 +1036,19 @@ g. Otherwise, use RCU.
|
||||
Of course, this all assumes that you have determined that RCU is in fact
|
||||
the right tool for your job.
|
||||
|
||||
.. _8_whatisRCU:
|
||||
|
||||
8. ANSWERS TO QUICK QUIZZES
|
||||
----------------------------
|
||||
|
||||
Quick Quiz #1: Why is this argument naive? How could a deadlock
|
||||
Quick Quiz #1:
|
||||
Why is this argument naive? How could a deadlock
|
||||
occur when using this algorithm in a real-world Linux
|
||||
kernel? [Referring to the lock-based "toy" RCU
|
||||
algorithm.]
|
||||
|
||||
Answer: Consider the following sequence of events:
|
||||
Answer:
|
||||
Consider the following sequence of events:
|
||||
|
||||
1. CPU 0 acquires some unrelated lock, call it
|
||||
"problematic_lock", disabling irq via
|
||||
@ -1021,10 +1087,14 @@ Answer: Consider the following sequence of events:
|
||||
approach where tasks in RCU read-side critical sections
|
||||
cannot be blocked by tasks executing synchronize_rcu().
|
||||
|
||||
Quick Quiz #2: Give an example where Classic RCU's read-side
|
||||
overhead is -negative-.
|
||||
:ref:`Back to Quick Quiz #1 <quiz_1>`
|
||||
|
||||
Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
||||
Quick Quiz #2:
|
||||
Give an example where Classic RCU's read-side
|
||||
overhead is **negative**.
|
||||
|
||||
Answer:
|
||||
Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
||||
kernel where a routing table is used by process-context
|
||||
code, but can be updated by irq-context code (for example,
|
||||
by an "ICMP REDIRECT" packet). The usual way of handling
|
||||
@ -1046,11 +1116,15 @@ Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
||||
even the theoretical possibility of negative overhead for
|
||||
a synchronization primitive is a bit unexpected. ;-)
|
||||
|
||||
Quick Quiz #3: If it is illegal to block in an RCU read-side
|
||||
:ref:`Back to Quick Quiz #2 <quiz_2>`
|
||||
|
||||
Quick Quiz #3:
|
||||
If it is illegal to block in an RCU read-side
|
||||
critical section, what the heck do you do in
|
||||
PREEMPT_RT, where normal spinlocks can block???
|
||||
|
||||
Answer: Just as PREEMPT_RT permits preemption of spinlock
|
||||
Answer:
|
||||
Just as PREEMPT_RT permits preemption of spinlock
|
||||
critical sections, it permits preemption of RCU
|
||||
read-side critical sections. It also permits
|
||||
spinlocks blocking while in RCU read-side critical
|
||||
@ -1069,6 +1143,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock
|
||||
Besides, how does the computer know what pizza parlor
|
||||
the human being went to???
|
||||
|
||||
:ref:`Back to Quick Quiz #3 <quiz_3>`
|
||||
|
||||
ACKNOWLEDGEMENTS
|
||||
|
@ -1165,10 +1165,10 @@
|
||||
|
||||
efi= [EFI]
|
||||
Format: { "old_map", "nochunk", "noruntime", "debug",
|
||||
"nosoftreserve" }
|
||||
"nosoftreserve", "disable_early_pci_dma",
|
||||
"no_disable_early_pci_dma" }
|
||||
old_map [X86-64]: switch to the old ioremap-based EFI
|
||||
runtime services mapping. 32-bit still uses this one by
|
||||
default.
|
||||
runtime services mapping. [Needs CONFIG_X86_UV=y]
|
||||
nochunk: disable reading files in "chunks" in the EFI
|
||||
boot stub, as chunking can cause problems with some
|
||||
firmware implementations.
|
||||
@ -1180,6 +1180,10 @@
|
||||
claim. Specify efi=nosoftreserve to disable this
|
||||
reservation and treat the memory by its base type
|
||||
(i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
|
||||
disable_early_pci_dma: Disable the busmaster bit on all
|
||||
PCI bridges while in the EFI boot stub
|
||||
no_disable_early_pci_dma: Leave the busmaster bit set
|
||||
on all PCI bridges while in the EFI boot stub
|
||||
|
||||
efi_no_storage_paranoia [EFI; X86]
|
||||
Using this parameter you can use more than 50% of
|
||||
@ -4001,6 +4005,19 @@
|
||||
test until boot completes in order to avoid
|
||||
interference.
|
||||
|
||||
rcuperf.kfree_rcu_test= [KNL]
|
||||
Set to measure performance of kfree_rcu() flooding.
|
||||
|
||||
rcuperf.kfree_nthreads= [KNL]
|
||||
The number of threads running loops of kfree_rcu().
|
||||
|
||||
rcuperf.kfree_alloc_num= [KNL]
|
||||
Number of allocations and frees done in an iteration.
|
||||
|
||||
rcuperf.kfree_loops= [KNL]
|
||||
Number of loops doing rcuperf.kfree_alloc_num number
|
||||
of allocations and frees.
|
||||
|
||||
rcuperf.nreaders= [KNL]
|
||||
Set number of RCU readers. The value -1 selects
|
||||
N, where N is the number of CPUs. A value
|
||||
|
@ -39,6 +39,7 @@ Core utilities
|
||||
../RCU/index
|
||||
gcc-plugins
|
||||
symbol-namespaces
|
||||
padata
|
||||
|
||||
|
||||
Interfaces for kernel debugging
|
||||
|
169
Documentation/core-api/padata.rst
Normal file
169
Documentation/core-api/padata.rst
Normal file
@ -0,0 +1,169 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=======================================
|
||||
The padata parallel execution mechanism
|
||||
=======================================
|
||||
|
||||
:Date: December 2019
|
||||
|
||||
Padata is a mechanism by which the kernel can farm jobs out to be done in
|
||||
parallel on multiple CPUs while retaining their ordering. It was developed for
|
||||
use with the IPsec code, which needs to be able to perform encryption and
|
||||
decryption on large numbers of packets without reordering those packets. The
|
||||
crypto developers made a point of writing padata in a sufficiently general
|
||||
fashion that it could be put to other uses as well.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Initializing
|
||||
------------
|
||||
|
||||
The first step in using padata is to set up a padata_instance structure for
|
||||
overall control of how jobs are to be run::
|
||||
|
||||
#include <linux/padata.h>
|
||||
|
||||
struct padata_instance *padata_alloc_possible(const char *name);
|
||||
|
||||
'name' simply identifies the instance.
|
||||
|
||||
There are functions for enabling and disabling the instance::
|
||||
|
||||
int padata_start(struct padata_instance *pinst);
|
||||
void padata_stop(struct padata_instance *pinst);
|
||||
|
||||
These functions set or clear the "PADATA_INIT" flag; if that flag is
|
||||
not set, other functions will refuse to work. padata_start() returns zero on
|
||||
success (flag set) or -EINVAL if the padata cpumask contains no active CPU
|
||||
(flag not set). padata_stop() clears the flag and blocks until the padata
|
||||
instance is unused.
|
||||
|
||||
Finally, complete padata initialization by allocating a padata_shell::
|
||||
|
||||
struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
|
||||
|
||||
A padata_shell is used to submit a job to padata and allows a series of such
|
||||
jobs to be serialized independently. A padata_instance may have one or more
|
||||
padata_shells associated with it, each allowing a separate series of jobs.
|
||||
|
||||
Modifying cpumasks
|
||||
------------------
|
||||
|
||||
The CPUs used to run jobs can be changed in two ways, programmatically with
|
||||
padata_set_cpumask() or via sysfs. The former is defined::
|
||||
|
||||
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
|
||||
cpumask_var_t cpumask);
|
||||
|
||||
Here cpumask_type is one of PADATA_CPU_PARALLEL or PADATA_CPU_SERIAL, where a
|
||||
parallel cpumask describes which processors will be used to execute jobs
|
||||
submitted to this instance in parallel and a serial cpumask defines which
|
||||
processors are allowed to be used as the serialization callback processor.
|
||||
cpumask specifies the new cpumask to use.
|
||||
|
||||
There may be sysfs files for an instance's cpumasks. For example, pcrypt's
|
||||
live in /sys/kernel/pcrypt/<instance-name>. Within an instance's directory
|
||||
there are two files, parallel_cpumask and serial_cpumask, and either cpumask
|
||||
may be changed by echoing a bitmask into the file, for example::
|
||||
|
||||
echo f > /sys/kernel/pcrypt/pencrypt/parallel_cpumask
|
||||
|
||||
Reading one of these files shows the user-supplied cpumask, which may be
|
||||
different from the 'usable' cpumask.
|
||||
|
||||
Padata maintains two pairs of cpumasks internally, the user-supplied cpumasks
|
||||
and the 'usable' cpumasks. (Each pair consists of a parallel and a serial
|
||||
cpumask.) The user-supplied cpumasks default to all possible CPUs on instance
|
||||
allocation and may be changed as above. The usable cpumasks are always a
|
||||
subset of the user-supplied cpumasks and contain only the online CPUs in the
|
||||
user-supplied masks; these are the cpumasks padata actually uses. So it is
|
||||
legal to supply a cpumask to padata that contains offline CPUs. Once an
|
||||
offline CPU in the user-supplied cpumask comes online, padata is going to use
|
||||
it.
|
||||
|
||||
Changing the CPU masks is an expensive operation, so it should not be done with
|
||||
great frequency.
|
||||
|
||||
Running A Job
|
||||
-------------
|
||||
|
||||
Actually submitting work to the padata instance requires the creation of a
|
||||
padata_priv structure, which represents one job::
|
||||
|
||||
struct padata_priv {
|
||||
/* Other stuff here... */
|
||||
void (*parallel)(struct padata_priv *padata);
|
||||
void (*serial)(struct padata_priv *padata);
|
||||
};
|
||||
|
||||
This structure will almost certainly be embedded within some larger
|
||||
structure specific to the work to be done. Most of its fields are private to
|
||||
padata, but the structure should be zeroed at initialisation time, and the
|
||||
parallel() and serial() functions should be provided. Those functions will
|
||||
be called in the process of getting the work done as we will see
|
||||
momentarily.
|
||||
|
||||
The submission of the job is done with::
|
||||
|
||||
int padata_do_parallel(struct padata_shell *ps,
|
||||
struct padata_priv *padata, int *cb_cpu);
|
||||
|
||||
The ps and padata structures must be set up as described above; cb_cpu
|
||||
points to the preferred CPU to be used for the final callback when the job is
|
||||
done; it must be in the current instance's CPU mask (if not the cb_cpu pointer
|
||||
is updated to point to the CPU actually chosen). The return value from
|
||||
padata_do_parallel() is zero on success, indicating that the job is in
|
||||
progress. -EBUSY means that somebody, somewhere else is messing with the
|
||||
instance's CPU mask, while -EINVAL is a complaint about cb_cpu not being in the
|
||||
serial cpumask, no online CPUs in the parallel or serial cpumasks, or a stopped
|
||||
instance.
|
||||
|
||||
Each job submitted to padata_do_parallel() will, in turn, be passed to
|
||||
exactly one call to the above-mentioned parallel() function, on one CPU, so
|
||||
true parallelism is achieved by submitting multiple jobs. parallel() runs with
|
||||
software interrupts disabled and thus cannot sleep. The parallel()
|
||||
function gets the padata_priv structure pointer as its lone parameter;
|
||||
information about the actual work to be done is probably obtained by using
|
||||
container_of() to find the enclosing structure.
|
||||
|
||||
Note that parallel() has no return value; the padata subsystem assumes that
|
||||
parallel() will take responsibility for the job from this point. The job
|
||||
need not be completed during this call, but, if parallel() leaves work
|
||||
outstanding, it should be prepared to be called again with a new job before
|
||||
the previous one completes.
|
||||
|
||||
Serializing Jobs
|
||||
----------------
|
||||
|
||||
When a job does complete, parallel() (or whatever function actually finishes
|
||||
the work) should inform padata of the fact with a call to::
|
||||
|
||||
void padata_do_serial(struct padata_priv *padata);
|
||||
|
||||
At some point in the future, padata_do_serial() will trigger a call to the
|
||||
serial() function in the padata_priv structure. That call will happen on
|
||||
the CPU requested in the initial call to padata_do_parallel(); it, too, is
|
||||
run with local software interrupts disabled.
|
||||
Note that this call may be deferred for a while since the padata code takes
|
||||
pains to ensure that jobs are completed in the order in which they were
|
||||
submitted.
|
||||
|
||||
Destroying
|
||||
----------
|
||||
|
||||
Cleaning up a padata instance predictably involves calling the three free
|
||||
functions that correspond to the allocation in reverse::
|
||||
|
||||
void padata_free_shell(struct padata_shell *ps);
|
||||
void padata_stop(struct padata_instance *pinst);
|
||||
void padata_free(struct padata_instance *pinst);
|
||||
|
||||
It is the user's responsibility to ensure all outstanding jobs are complete
|
||||
before any of the above are called.
|
||||
|
||||
Interface
|
||||
=========
|
||||
|
||||
.. kernel-doc:: include/linux/padata.h
|
||||
.. kernel-doc:: kernel/padata.c
|
@ -31,33 +31,23 @@ The counterparts to those functions are listed below.
|
||||
|
||||
::
|
||||
|
||||
int crypto_unregister_alg(struct crypto_alg *alg);
|
||||
int crypto_unregister_algs(struct crypto_alg *algs, int count);
|
||||
void crypto_unregister_alg(struct crypto_alg *alg);
|
||||
void crypto_unregister_algs(struct crypto_alg *algs, int count);
|
||||
|
||||
|
||||
Notice that both registration and unregistration functions do return a
|
||||
value, so make sure to handle errors. A return code of zero implies
|
||||
success. Any return code < 0 implies an error.
|
||||
The registration functions return 0 on success, or a negative errno
|
||||
value on failure. crypto_register_algs() succeeds only if it
|
||||
successfully registered all the given algorithms; if it fails partway
|
||||
through, then any changes are rolled back.
|
||||
|
||||
The bulk registration/unregistration functions register/unregister each
|
||||
transformation in the given array of length count. They handle errors as
|
||||
follows:
|
||||
|
||||
- crypto_register_algs() succeeds if and only if it successfully
|
||||
registers all the given transformations. If an error occurs partway
|
||||
through, then it rolls back successful registrations before returning
|
||||
the error code. Note that if a driver needs to handle registration
|
||||
errors for individual transformations, then it will need to use the
|
||||
non-bulk function crypto_register_alg() instead.
|
||||
|
||||
- crypto_unregister_algs() tries to unregister all the given
|
||||
transformations, continuing on error. It logs errors and always
|
||||
returns zero.
|
||||
The unregistration functions always succeed, so they don't have a
|
||||
return value. Don't try to unregister algorithms that aren't
|
||||
currently registered.
|
||||
|
||||
Single-Block Symmetric Ciphers [CIPHER]
|
||||
---------------------------------------
|
||||
|
||||
Example of transformations: aes, arc4, ...
|
||||
Example of transformations: aes, serpent, ...
|
||||
|
||||
This section describes the simplest of all transformation
|
||||
implementations, that being the CIPHER type used for symmetric ciphers.
|
||||
@ -108,7 +98,7 @@ is also valid:
|
||||
Multi-Block Ciphers
|
||||
-------------------
|
||||
|
||||
Example of transformations: cbc(aes), ecb(arc4), ...
|
||||
Example of transformations: cbc(aes), chacha20, ...
|
||||
|
||||
This section describes the multi-block cipher transformation
|
||||
implementations. The multi-block ciphers are used for transformations
|
||||
@ -169,10 +159,10 @@ are as follows:
|
||||
|
||||
::
|
||||
|
||||
int crypto_unregister_ahash(struct ahash_alg *alg);
|
||||
void crypto_unregister_ahash(struct ahash_alg *alg);
|
||||
|
||||
int crypto_unregister_shash(struct shash_alg *alg);
|
||||
int crypto_unregister_shashes(struct shash_alg *algs, int count);
|
||||
void crypto_unregister_shash(struct shash_alg *alg);
|
||||
void crypto_unregister_shashes(struct shash_alg *algs, int count);
|
||||
|
||||
|
||||
Cipher Definition With struct shash_alg and ahash_alg
|
||||
|
@ -11,6 +11,7 @@ Required properties:
|
||||
|
||||
- compatible: should contain one of the following:
|
||||
* "brcm,bcm20702a1"
|
||||
* "brcm,bcm4329-bt"
|
||||
* "brcm,bcm4330-bt"
|
||||
* "brcm,bcm43438-bt"
|
||||
* "brcm,bcm4345c5"
|
||||
@ -22,7 +23,9 @@ Optional properties:
|
||||
- max-speed: see Documentation/devicetree/bindings/serial/slave-device.txt
|
||||
- shutdown-gpios: GPIO specifier, used to enable the BT module
|
||||
- device-wakeup-gpios: GPIO specifier, used to wakeup the controller
|
||||
- host-wakeup-gpios: GPIO specifier, used to wakeup the host processor
|
||||
- host-wakeup-gpios: GPIO specifier, used to wakeup the host processor.
|
||||
deprecated, replaced by interrupts and
|
||||
"host-wakeup" interrupt-names
|
||||
- clocks: 1 or 2 clocks as defined in clock-names below, in that order
|
||||
- clock-names: names for clock inputs, matching the clocks given
|
||||
- "extclk": deprecated, replaced by "txco"
|
||||
@ -30,7 +33,14 @@ Optional properties:
|
||||
- "lpo": external low power 32.768 kHz clock
|
||||
- vbat-supply: phandle to regulator supply for VBAT
|
||||
- vddio-supply: phandle to regulator supply for VDDIO
|
||||
|
||||
- brcm,bt-pcm-int-params: configure PCM parameters via a 5-byte array
|
||||
- sco-routing: 0 = PCM, 1 = Transport, 2 = Codec, 3 = I2S
|
||||
- pcm-interface-rate: 128KBps, 256KBps, 512KBps, 1024KBps, 2048KBps
|
||||
- pcm-frame-type: short, long
|
||||
- pcm-sync-mode: slave, master
|
||||
- pcm-clock-mode: slave, master
|
||||
- interrupts: must be one, used to wakeup the host processor
|
||||
- interrupt-names: must be "host-wakeup"
|
||||
|
||||
Example:
|
||||
|
||||
@ -41,5 +51,6 @@ Example:
|
||||
bluetooth {
|
||||
compatible = "brcm,bcm43438-bt";
|
||||
max-speed = <921600>;
|
||||
brcm,bt-pcm-int-params = [01 02 00 01 01];
|
||||
};
|
||||
};
|
||||
|
148
Documentation/devicetree/bindings/net/dsa/ar9331.txt
Normal file
148
Documentation/devicetree/bindings/net/dsa/ar9331.txt
Normal file
@ -0,0 +1,148 @@
|
||||
Atheros AR9331 built-in switch
|
||||
=============================
|
||||
|
||||
It is a switch built-in to Atheros AR9331 WiSoC and addressable over internal
|
||||
MDIO bus. All PHYs are built-in as well.
|
||||
|
||||
Required properties:
|
||||
|
||||
- compatible: should be: "qca,ar9331-switch"
|
||||
- reg: Address on the MII bus for the switch.
|
||||
- resets : Must contain an entry for each entry in reset-names.
|
||||
- reset-names : Must include the following entries: "switch"
|
||||
- interrupt-parent: Phandle to the parent interrupt controller
|
||||
- interrupts: IRQ line for the switch
|
||||
- interrupt-controller: Indicates the switch is itself an interrupt
|
||||
controller. This is used for the PHY interrupts.
|
||||
- #interrupt-cells: must be 1
|
||||
- mdio: Container of PHY and devices on the switch's MDIO bus.
|
||||
|
||||
See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of additional
|
||||
required and optional properties.
|
||||
Examples:
|
||||
|
||||
eth0: ethernet@19000000 {
|
||||
compatible = "qca,ar9330-eth";
|
||||
reg = <0x19000000 0x200>;
|
||||
interrupts = <4>;
|
||||
|
||||
resets = <&rst 9>, <&rst 22>;
|
||||
reset-names = "mac", "mdio";
|
||||
clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_AHB>;
|
||||
clock-names = "eth", "mdio";
|
||||
|
||||
phy-mode = "mii";
|
||||
phy-handle = <&phy_port4>;
|
||||
};
|
||||
|
||||
eth1: ethernet@1a000000 {
|
||||
compatible = "qca,ar9330-eth";
|
||||
reg = <0x1a000000 0x200>;
|
||||
interrupts = <5>;
|
||||
resets = <&rst 13>, <&rst 23>;
|
||||
reset-names = "mac", "mdio";
|
||||
clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_AHB>;
|
||||
clock-names = "eth", "mdio";
|
||||
|
||||
phy-mode = "gmii";
|
||||
|
||||
fixed-link {
|
||||
speed = <1000>;
|
||||
full-duplex;
|
||||
};
|
||||
|
||||
mdio {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
|
||||
switch10: switch@10 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
|
||||
compatible = "qca,ar9331-switch";
|
||||
reg = <0x10>;
|
||||
resets = <&rst 8>;
|
||||
reset-names = "switch";
|
||||
|
||||
interrupt-parent = <&miscintc>;
|
||||
interrupts = <12>;
|
||||
|
||||
interrupt-controller;
|
||||
#interrupt-cells = <1>;
|
||||
|
||||
ports {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
|
||||
switch_port0: port@0 {
|
||||
reg = <0x0>;
|
||||
label = "cpu";
|
||||
ethernet = <&eth1>;
|
||||
|
||||
phy-mode = "gmii";
|
||||
|
||||
fixed-link {
|
||||
speed = <1000>;
|
||||
full-duplex;
|
||||
};
|
||||
};
|
||||
|
||||
switch_port1: port@1 {
|
||||
reg = <0x1>;
|
||||
phy-handle = <&phy_port0>;
|
||||
phy-mode = "internal";
|
||||
};
|
||||
|
||||
switch_port2: port@2 {
|
||||
reg = <0x2>;
|
||||
phy-handle = <&phy_port1>;
|
||||
phy-mode = "internal";
|
||||
};
|
||||
|
||||
switch_port3: port@3 {
|
||||
reg = <0x3>;
|
||||
phy-handle = <&phy_port2>;
|
||||
phy-mode = "internal";
|
||||
};
|
||||
|
||||
switch_port4: port@4 {
|
||||
reg = <0x4>;
|
||||
phy-handle = <&phy_port3>;
|
||||
phy-mode = "internal";
|
||||
};
|
||||
};
|
||||
|
||||
mdio {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
|
||||
interrupt-parent = <&switch10>;
|
||||
|
||||
phy_port0: phy@0 {
|
||||
reg = <0x0>;
|
||||
interrupts = <0>;
|
||||
};
|
||||
|
||||
phy_port1: phy@1 {
|
||||
reg = <0x1>;
|
||||
interrupts = <0>;
|
||||
};
|
||||
|
||||
phy_port2: phy@2 {
|
||||
reg = <0x2>;
|
||||
interrupts = <0>;
|
||||
};
|
||||
|
||||
phy_port3: phy@3 {
|
||||
reg = <0x3>;
|
||||
interrupts = <0>;
|
||||
};
|
||||
|
||||
phy_port4: phy@4 {
|
||||
reg = <0x4>;
|
||||
interrupts = <0>;
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
@ -14,7 +14,7 @@ Required properties:
|
||||
Should be "macirq" for the main MAC IRQ
|
||||
- clocks: Must contain a phandle for each entry in clock-names.
|
||||
- clock-names: The name of the clock listed in the clocks property. These are
|
||||
"axi", "apb", "mac_main", "ptp_ref" for MT2712 SoC
|
||||
"axi", "apb", "mac_main", "ptp_ref", "rmii_internal" for MT2712 SoC.
|
||||
- mac-address: See ethernet.txt in the same directory
|
||||
- phy-mode: See ethernet.txt in the same directory
|
||||
- mediatek,pericfg: A phandle to the syscon node that controls ethernet
|
||||
@ -23,8 +23,10 @@ Required properties:
|
||||
Optional properties:
|
||||
- mediatek,tx-delay-ps: TX clock delay macro value. Default is 0.
|
||||
It should be defined for RGMII/MII interface.
|
||||
It should be defined for RMII interface when the reference clock is from MT2712 SoC.
|
||||
- mediatek,rx-delay-ps: RX clock delay macro value. Default is 0.
|
||||
It should be defined for RGMII/MII/RMII interface.
|
||||
It should be defined for RGMII/MII interface.
|
||||
It should be defined for RMII interface.
|
||||
Both delay properties need to be a multiple of 170 for RGMII interface,
|
||||
or will round down. Range 0~31*170.
|
||||
Both delay properties need to be a multiple of 550 for MII/RMII interface,
|
||||
@ -34,13 +36,20 @@ or will round down. Range 0~31*550.
|
||||
reference clock, which is from external PHYs, is connected to RXC pin
|
||||
on MT2712 SoC.
|
||||
Otherwise, is connected to TXC pin.
|
||||
- mediatek,rmii-clk-from-mac: boolean property, if present indicates that
|
||||
MT2712 SoC provides the RMII reference clock, which outputs to TXC pin only.
|
||||
- mediatek,txc-inverse: boolean property, if present indicates that
|
||||
1. tx clock will be inversed in MII/RGMII case,
|
||||
2. tx clock inside MAC will be inversed relative to reference clock
|
||||
which is from external PHYs in RMII case, and it rarely happens.
|
||||
3. the reference clock, which outputs to TXC pin will be inversed in RMII case
|
||||
when the reference clock is from MT2712 SoC.
|
||||
- mediatek,rxc-inverse: boolean property, if present indicates that
|
||||
1. rx clock will be inversed in MII/RGMII case.
|
||||
2. reference clock will be inversed when arrived at MAC in RMII case.
|
||||
2. reference clock will be inversed when arrived at MAC in RMII case, when
|
||||
the reference clock is from external PHYs.
|
||||
3. the inside clock, which is sent to MAC, will be inversed in RMII case when
|
||||
the reference clock is from MT2712 SoC.
|
||||
- assigned-clocks: mac_main and ptp_ref clocks
|
||||
- assigned-clock-parents: parent clocks of the assigned clocks
|
||||
|
||||
@ -50,29 +59,33 @@ Example:
|
||||
reg = <0 0x1101c000 0 0x1300>;
|
||||
interrupts = <GIC_SPI 237 IRQ_TYPE_LEVEL_LOW>;
|
||||
interrupt-names = "macirq";
|
||||
phy-mode ="rgmii";
|
||||
phy-mode ="rgmii-rxid";
|
||||
mac-address = [00 55 7b b5 7d f7];
|
||||
clock-names = "axi",
|
||||
"apb",
|
||||
"mac_main",
|
||||
"ptp_ref",
|
||||
"ptp_top";
|
||||
"rmii_internal";
|
||||
clocks = <&pericfg CLK_PERI_GMAC>,
|
||||
<&pericfg CLK_PERI_GMAC_PCLK>,
|
||||
<&topckgen CLK_TOP_ETHER_125M_SEL>,
|
||||
<&topckgen CLK_TOP_ETHER_50M_SEL>;
|
||||
<&topckgen CLK_TOP_ETHER_50M_SEL>,
|
||||
<&topckgen CLK_TOP_ETHER_50M_RMII_SEL>;
|
||||
assigned-clocks = <&topckgen CLK_TOP_ETHER_125M_SEL>,
|
||||
<&topckgen CLK_TOP_ETHER_50M_SEL>;
|
||||
<&topckgen CLK_TOP_ETHER_50M_SEL>,
|
||||
<&topckgen CLK_TOP_ETHER_50M_RMII_SEL>;
|
||||
assigned-clock-parents = <&topckgen CLK_TOP_ETHERPLL_125M>,
|
||||
<&topckgen CLK_TOP_APLL1_D3>;
|
||||
<&topckgen CLK_TOP_APLL1_D3>,
|
||||
<&topckgen CLK_TOP_ETHERPLL_50M>;
|
||||
power-domains = <&scpsys MT2712_POWER_DOMAIN_AUDIO>;
|
||||
mediatek,pericfg = <&pericfg>;
|
||||
mediatek,tx-delay-ps = <1530>;
|
||||
mediatek,rx-delay-ps = <1530>;
|
||||
mediatek,rmii-rxc;
|
||||
mediatek,txc-inverse;
|
||||
mediatek,rxc-inverse;
|
||||
snps,txpbl = <32>;
|
||||
snps,rxpbl = <32>;
|
||||
snps,txpbl = <1>;
|
||||
snps,rxpbl = <1>;
|
||||
snps,reset-gpio = <&pio 87 GPIO_ACTIVE_LOW>;
|
||||
snps,reset-active-low;
|
||||
};
|
||||
|
@ -8,8 +8,6 @@ Required properties:
|
||||
- ti,tx-internal-delay - RGMII Transmit Clock Delay - see dt-bindings/net/ti-dp83867.h
|
||||
for applicable values. Required only if interface type is
|
||||
PHY_INTERFACE_MODE_RGMII_ID or PHY_INTERFACE_MODE_RGMII_TXID
|
||||
- ti,fifo-depth - Transmit FIFO depth - see dt-bindings/net/ti-dp83867.h
|
||||
for applicable values
|
||||
|
||||
Note: If the interface type is PHY_INTERFACE_MODE_RGMII the TX/RX clock delays
|
||||
will be left at their default values, as set by the PHY's pin strapping.
|
||||
@ -42,6 +40,14 @@ Optional property:
|
||||
Some MACs work with differential SGMII clock.
|
||||
See data manual for details.
|
||||
|
||||
- ti,fifo-depth - Transmit FIFO depth - see dt-bindings/net/ti-dp83867.h
|
||||
for applicable values (deprecated)
|
||||
|
||||
- tx-fifo-depth - As defined in the ethernet-controller.yaml. Values for
|
||||
the depth can be found in dt-bindings/net/ti-dp83867.h
|
||||
- rx-fifo-depth - As defined in the ethernet-controller.yaml. Values for
|
||||
the depth can be found in dt-bindings/net/ti-dp83867.h
|
||||
|
||||
Note: ti,min-output-impedance and ti,max-output-impedance are mutually
|
||||
exclusive. When both properties are present ti,max-output-impedance
|
||||
takes precedence.
|
||||
@ -55,7 +61,7 @@ Example:
|
||||
reg = <0>;
|
||||
ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_25_NS>;
|
||||
ti,tx-internal-delay = <DP83867_RGMIIDCTL_2_75_NS>;
|
||||
ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_4_B_NIB>;
|
||||
tx-fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_4_B_NIB>;
|
||||
};
|
||||
|
||||
Datasheet can be found:
|
||||
|
@ -50,7 +50,7 @@ Optional properties:
|
||||
entry in clock-names.
|
||||
- clock-names: Should contain the clock names "wifi_wcss_cmd", "wifi_wcss_ref",
|
||||
"wifi_wcss_rtc" for "qcom,ipq4019-wifi" compatible target and
|
||||
"cxo_ref_clk_pin" for "qcom,wcn3990-wifi"
|
||||
"cxo_ref_clk_pin" and optionally "qdss" for "qcom,wcn3990-wifi"
|
||||
compatible target.
|
||||
- qcom,msi_addr: MSI interrupt address.
|
||||
- qcom,msi_base: Base value to add before writing MSI data into
|
||||
@ -88,6 +88,9 @@ Optional properties:
|
||||
of the host capability QMI request
|
||||
- qcom,xo-cal-data: xo cal offset to be configured in xo trim register.
|
||||
|
||||
- qcom,msa-fixed-perm: Boolean context flag to disable SCM call for statically
|
||||
mapped msa region.
|
||||
|
||||
Example (to supply PCI based wifi block details):
|
||||
|
||||
In this example, the node is defined as child node of the PCI controller.
|
||||
@ -185,4 +188,5 @@ wifi@18000000 {
|
||||
vdd-3.3-ch0-supply = <&vreg_l25a_3p3>;
|
||||
memory-region = <&wifi_msa_mem>;
|
||||
iommus = <&apps_smmu 0x0040 0x1>;
|
||||
qcom,msa-fixed-perm;
|
||||
};
|
||||
|
273
Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml
Normal file
273
Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml
Normal file
@ -0,0 +1,273 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
# Copyright (c) 2018-2019 The Linux Foundation. All rights reserved.
|
||||
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/net/wireless/qcom,ath11k.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Qualcomm Technologies ath11k wireless devices Generic Binding
|
||||
|
||||
maintainers:
|
||||
- Kalle Valo <kvalo@codeaurora.org>
|
||||
|
||||
description: |
|
||||
These are dt entries for Qualcomm Technologies, Inc. IEEE 802.11ax
|
||||
devices, for example like AHB based IPQ8074.
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
const: qcom,ipq8074-wifi
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
interrupts:
|
||||
items:
|
||||
- description: misc-pulse1 interrupt events
|
||||
- description: misc-latch interrupt events
|
||||
- description: sw exception interrupt events
|
||||
- description: watchdog interrupt events
|
||||
- description: interrupt event for ring CE0
|
||||
- description: interrupt event for ring CE1
|
||||
- description: interrupt event for ring CE2
|
||||
- description: interrupt event for ring CE3
|
||||
- description: interrupt event for ring CE4
|
||||
- description: interrupt event for ring CE5
|
||||
- description: interrupt event for ring CE6
|
||||
- description: interrupt event for ring CE7
|
||||
- description: interrupt event for ring CE8
|
||||
- description: interrupt event for ring CE9
|
||||
- description: interrupt event for ring CE10
|
||||
- description: interrupt event for ring CE11
|
||||
- description: interrupt event for ring host2wbm-desc-feed
|
||||
- description: interrupt event for ring host2reo-re-injection
|
||||
- description: interrupt event for ring host2reo-command
|
||||
- description: interrupt event for ring host2rxdma-monitor-ring3
|
||||
- description: interrupt event for ring host2rxdma-monitor-ring2
|
||||
- description: interrupt event for ring host2rxdma-monitor-ring1
|
||||
- description: interrupt event for ring reo2ost-exception
|
||||
- description: interrupt event for ring wbm2host-rx-release
|
||||
- description: interrupt event for ring reo2host-status
|
||||
- description: interrupt event for ring reo2host-destination-ring4
|
||||
- description: interrupt event for ring reo2host-destination-ring3
|
||||
- description: interrupt event for ring reo2host-destination-ring2
|
||||
- description: interrupt event for ring reo2host-destination-ring1
|
||||
- description: interrupt event for ring rxdma2host-monitor-destination-mac3
|
||||
- description: interrupt event for ring rxdma2host-monitor-destination-mac2
|
||||
- description: interrupt event for ring rxdma2host-monitor-destination-mac1
|
||||
- description: interrupt event for ring ppdu-end-interrupts-mac3
|
||||
- description: interrupt event for ring ppdu-end-interrupts-mac2
|
||||
- description: interrupt event for ring ppdu-end-interrupts-mac1
|
||||
- description: interrupt event for ring rxdma2host-monitor-status-ring-mac3
|
||||
- description: interrupt event for ring rxdma2host-monitor-status-ring-mac2
|
||||
- description: interrupt event for ring rxdma2host-monitor-status-ring-mac1
|
||||
- description: interrupt event for ring host2rxdma-host-buf-ring-mac3
|
||||
- description: interrupt event for ring host2rxdma-host-buf-ring-mac2
|
||||
- description: interrupt event for ring host2rxdma-host-buf-ring-mac1
|
||||
- description: interrupt event for ring rxdma2host-destination-ring-mac3
|
||||
- description: interrupt event for ring rxdma2host-destination-ring-mac2
|
||||
- description: interrupt event for ring rxdma2host-destination-ring-mac1
|
||||
- description: interrupt event for ring host2tcl-input-ring4
|
||||
- description: interrupt event for ring host2tcl-input-ring3
|
||||
- description: interrupt event for ring host2tcl-input-ring2
|
||||
- description: interrupt event for ring host2tcl-input-ring1
|
||||
- description: interrupt event for ring wbm2host-tx-completions-ring3
|
||||
- description: interrupt event for ring wbm2host-tx-completions-ring2
|
||||
- description: interrupt event for ring wbm2host-tx-completions-ring1
|
||||
- description: interrupt event for ring tcl2host-status-ring
|
||||
|
||||
|
||||
interrupt-names:
|
||||
items:
|
||||
- const: misc-pulse1
|
||||
- const: misc-latch
|
||||
- const: sw-exception
|
||||
- const: watchdog
|
||||
- const: ce0
|
||||
- const: ce1
|
||||
- const: ce2
|
||||
- const: ce3
|
||||
- const: ce4
|
||||
- const: ce5
|
||||
- const: ce6
|
||||
- const: ce7
|
||||
- const: ce8
|
||||
- const: ce9
|
||||
- const: ce10
|
||||
- const: ce11
|
||||
- const: host2wbm-desc-feed
|
||||
- const: host2reo-re-injection
|
||||
- const: host2reo-command
|
||||
- const: host2rxdma-monitor-ring3
|
||||
- const: host2rxdma-monitor-ring2
|
||||
- const: host2rxdma-monitor-ring1
|
||||
- const: reo2ost-exception
|
||||
- const: wbm2host-rx-release
|
||||
- const: reo2host-status
|
||||
- const: reo2host-destination-ring4
|
||||
- const: reo2host-destination-ring3
|
||||
- const: reo2host-destination-ring2
|
||||
- const: reo2host-destination-ring1
|
||||
- const: rxdma2host-monitor-destination-mac3
|
||||
- const: rxdma2host-monitor-destination-mac2
|
||||
- const: rxdma2host-monitor-destination-mac1
|
||||
- const: ppdu-end-interrupts-mac3
|
||||
- const: ppdu-end-interrupts-mac2
|
||||
- const: ppdu-end-interrupts-mac1
|
||||
- const: rxdma2host-monitor-status-ring-mac3
|
||||
- const: rxdma2host-monitor-status-ring-mac2
|
||||
- const: rxdma2host-monitor-status-ring-mac1
|
||||
- const: host2rxdma-host-buf-ring-mac3
|
||||
- const: host2rxdma-host-buf-ring-mac2
|
||||
- const: host2rxdma-host-buf-ring-mac1
|
||||
- const: rxdma2host-destination-ring-mac3
|
||||
- const: rxdma2host-destination-ring-mac2
|
||||
- const: rxdma2host-destination-ring-mac1
|
||||
- const: host2tcl-input-ring4
|
||||
- const: host2tcl-input-ring3
|
||||
- const: host2tcl-input-ring2
|
||||
- const: host2tcl-input-ring1
|
||||
- const: wbm2host-tx-completions-ring3
|
||||
- const: wbm2host-tx-completions-ring2
|
||||
- const: wbm2host-tx-completions-ring1
|
||||
- const: tcl2host-status-ring
|
||||
|
||||
qcom,rproc:
|
||||
$ref: /schemas/types.yaml#/definitions/phandle
|
||||
description:
|
||||
DT entry of q6v5-wcss remoteproc driver.
|
||||
Phandle to a node that can contain the following properties
|
||||
* compatible
|
||||
* reg
|
||||
* reg-names
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- interrupts
|
||||
- interrupt-names
|
||||
- qcom,rproc
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
|
||||
q6v5_wcss: q6v5_wcss@CD00000 {
|
||||
compatible = "qcom,ipq8074-wcss-pil";
|
||||
reg = <0xCD00000 0x4040>,
|
||||
<0x4AB000 0x20>;
|
||||
reg-names = "qdsp6",
|
||||
"rmb";
|
||||
};
|
||||
|
||||
wifi0: wifi@c000000 {
|
||||
compatible = "qcom,ipq8074-wifi";
|
||||
reg = <0xc000000 0x2000000>;
|
||||
interrupts = <0 320 1>,
|
||||
<0 319 1>,
|
||||
<0 318 1>,
|
||||
<0 317 1>,
|
||||
<0 316 1>,
|
||||
<0 315 1>,
|
||||
<0 314 1>,
|
||||
<0 311 1>,
|
||||
<0 310 1>,
|
||||
<0 411 1>,
|
||||
<0 410 1>,
|
||||
<0 40 1>,
|
||||
<0 39 1>,
|
||||
<0 302 1>,
|
||||
<0 301 1>,
|
||||
<0 37 1>,
|
||||
<0 36 1>,
|
||||
<0 296 1>,
|
||||
<0 295 1>,
|
||||
<0 294 1>,
|
||||
<0 293 1>,
|
||||
<0 292 1>,
|
||||
<0 291 1>,
|
||||
<0 290 1>,
|
||||
<0 289 1>,
|
||||
<0 288 1>,
|
||||
<0 239 1>,
|
||||
<0 236 1>,
|
||||
<0 235 1>,
|
||||
<0 234 1>,
|
||||
<0 233 1>,
|
||||
<0 232 1>,
|
||||
<0 231 1>,
|
||||
<0 230 1>,
|
||||
<0 229 1>,
|
||||
<0 228 1>,
|
||||
<0 224 1>,
|
||||
<0 223 1>,
|
||||
<0 203 1>,
|
||||
<0 183 1>,
|
||||
<0 180 1>,
|
||||
<0 179 1>,
|
||||
<0 178 1>,
|
||||
<0 177 1>,
|
||||
<0 176 1>,
|
||||
<0 163 1>,
|
||||
<0 162 1>,
|
||||
<0 160 1>,
|
||||
<0 159 1>,
|
||||
<0 158 1>,
|
||||
<0 157 1>,
|
||||
<0 156 1>;
|
||||
interrupt-names = "misc-pulse1",
|
||||
"misc-latch",
|
||||
"sw-exception",
|
||||
"watchdog",
|
||||
"ce0",
|
||||
"ce1",
|
||||
"ce2",
|
||||
"ce3",
|
||||
"ce4",
|
||||
"ce5",
|
||||
"ce6",
|
||||
"ce7",
|
||||
"ce8",
|
||||
"ce9",
|
||||
"ce10",
|
||||
"ce11",
|
||||
"host2wbm-desc-feed",
|
||||
"host2reo-re-injection",
|
||||
"host2reo-command",
|
||||
"host2rxdma-monitor-ring3",
|
||||
"host2rxdma-monitor-ring2",
|
||||
"host2rxdma-monitor-ring1",
|
||||
"reo2ost-exception",
|
||||
"wbm2host-rx-release",
|
||||
"reo2host-status",
|
||||
"reo2host-destination-ring4",
|
||||
"reo2host-destination-ring3",
|
||||
"reo2host-destination-ring2",
|
||||
"reo2host-destination-ring1",
|
||||
"rxdma2host-monitor-destination-mac3",
|
||||
"rxdma2host-monitor-destination-mac2",
|
||||
"rxdma2host-monitor-destination-mac1",
|
||||
"ppdu-end-interrupts-mac3",
|
||||
"ppdu-end-interrupts-mac2",
|
||||
"ppdu-end-interrupts-mac1",
|
||||
"rxdma2host-monitor-status-ring-mac3",
|
||||
"rxdma2host-monitor-status-ring-mac2",
|
||||
"rxdma2host-monitor-status-ring-mac1",
|
||||
"host2rxdma-host-buf-ring-mac3",
|
||||
"host2rxdma-host-buf-ring-mac2",
|
||||
"host2rxdma-host-buf-ring-mac1",
|
||||
"rxdma2host-destination-ring-mac3",
|
||||
"rxdma2host-destination-ring-mac2",
|
||||
"rxdma2host-destination-ring-mac1",
|
||||
"host2tcl-input-ring4",
|
||||
"host2tcl-input-ring3",
|
||||
"host2tcl-input-ring2",
|
||||
"host2tcl-input-ring1",
|
||||
"wbm2host-tx-completions-ring3",
|
||||
"wbm2host-tx-completions-ring2",
|
||||
"wbm2host-tx-completions-ring1",
|
||||
"tcl2host-status-ring";
|
||||
qcom,rproc = <&q6v5_wcss>;
|
||||
};
|
35
Documentation/devicetree/bindings/ptp/ptp-ines.txt
Normal file
35
Documentation/devicetree/bindings/ptp/ptp-ines.txt
Normal file
@ -0,0 +1,35 @@
|
||||
ZHAW InES PTP time stamping IP core
|
||||
|
||||
The IP core needs two different kinds of nodes. The control node
|
||||
lives somewhere in the memory map and specifies the address of the
|
||||
control registers. There can be up to three port handles placed as
|
||||
attributes of PHY nodes. These associate a particular MII bus with a
|
||||
port index within the IP core.
|
||||
|
||||
Required properties of the control node:
|
||||
|
||||
- compatible: "ines,ptp-ctrl"
|
||||
- reg: physical address and size of the register bank
|
||||
|
||||
Required format of the port handle within the PHY node:
|
||||
|
||||
- timestamper: provides control node reference and
|
||||
the port channel within the IP core
|
||||
|
||||
Example:
|
||||
|
||||
tstamper: timestamper@60000000 {
|
||||
compatible = "ines,ptp-ctrl";
|
||||
reg = <0x60000000 0x80>;
|
||||
};
|
||||
|
||||
ethernet@80000000 {
|
||||
...
|
||||
mdio {
|
||||
...
|
||||
ethernet-phy@3 {
|
||||
...
|
||||
timestamper = <&tstamper 0>;
|
||||
};
|
||||
};
|
||||
};
|
42
Documentation/devicetree/bindings/ptp/timestamper.txt
Normal file
42
Documentation/devicetree/bindings/ptp/timestamper.txt
Normal file
@ -0,0 +1,42 @@
|
||||
Time stamps from MII bus snooping devices
|
||||
|
||||
This binding supports non-PHY devices that snoop the MII bus and
|
||||
provide time stamps. In contrast to PHY time stamping drivers (which
|
||||
can simply attach their interface directly to the PHY instance), stand
|
||||
alone MII time stamping drivers use this binding to specify the
|
||||
connection between the snooping device and a given network interface.
|
||||
|
||||
Non-PHY MII time stamping drivers typically talk to the control
|
||||
interface over another bus like I2C, SPI, UART, or via a memory mapped
|
||||
peripheral. This controller device is associated with one or more
|
||||
time stamping channels, each of which snoops on a MII bus.
|
||||
|
||||
The "timestamper" property lives in a phy node and links a time
|
||||
stamping channel from the controller device to that phy's MII bus.
|
||||
|
||||
Example:
|
||||
|
||||
tstamper: timestamper@10000000 {
|
||||
compatible = "ines,ptp-ctrl";
|
||||
reg = <0x10000000 0x80>;
|
||||
};
|
||||
|
||||
ethernet@20000000 {
|
||||
mdio {
|
||||
ethernet-phy@1 {
|
||||
timestamper = <&tstamper 0>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
ethernet@30000000 {
|
||||
mdio {
|
||||
ethernet-phy@2 {
|
||||
timestamper = <&tstamper 1>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
In this example, time stamps from the MII bus attached to phy@1 will
|
||||
appear on time stamp channel 0 (zero), and those from phy@2 appear on
|
||||
channel 1.
|
@ -2,6 +2,7 @@ HWRNG support for the iproc-rng200 driver
|
||||
|
||||
Required properties:
|
||||
- compatible : Must be one of:
|
||||
"brcm,bcm2711-rng200"
|
||||
"brcm,bcm7211-rng200"
|
||||
"brcm,bcm7278-rng200"
|
||||
"brcm,iproc-rng200"
|
||||
|
@ -17,6 +17,9 @@ Required properties:
|
||||
* "arb" : memory ARB line (required)
|
||||
* "rst" : dedicated device reset line (optional)
|
||||
- #sound-dai-cells: must be 0.
|
||||
- amlogic,fifo-depth: The size of the controller's fifo in bytes. This
|
||||
is useful for determining certain configuration such
|
||||
as the flush threshold of the fifo
|
||||
|
||||
Example of FRDDR A on the A113 SoC:
|
||||
|
||||
@ -27,4 +30,5 @@ frddr_a: audio-controller@1c0 {
|
||||
interrupts = <GIC_SPI 88 IRQ_TYPE_EDGE_RISING>;
|
||||
clocks = <&clkc_audio AUD_CLKID_FRDDR_A>;
|
||||
resets = <&arb AXG_ARB_FRDDR_A>;
|
||||
amlogic,fifo-depth = <512>;
|
||||
};
|
||||
|
@ -8,7 +8,12 @@ three substreams within totally 10 channels.
|
||||
|
||||
Required properties:
|
||||
|
||||
- compatible : Contains "fsl,imx35-asrc" or "fsl,imx53-asrc".
|
||||
- compatible : Compatible list, should contain one of the following
|
||||
compatibles:
|
||||
"fsl,imx35-asrc",
|
||||
"fsl,imx53-asrc",
|
||||
"fsl,imx8qm-asrc",
|
||||
"fsl,imx8qxp-asrc",
|
||||
|
||||
- reg : Offset and length of the register set for the device.
|
||||
|
||||
@ -35,6 +40,11 @@ Required properties:
|
||||
|
||||
- fsl,asrc-width : Defines a mutual sample width used by DPCM Back Ends.
|
||||
|
||||
- fsl,asrc-clk-map : Defines clock map used in driver, which is required
|
||||
by imx8qm/imx8qxp platform
|
||||
<0> - select the map for asrc0 in imx8qm/imx8qxp
|
||||
<1> - select the map for asrc1 in imx8qm/imx8qxp
|
||||
|
||||
Optional properties:
|
||||
|
||||
- big-endian : If this property is absent, the little endian mode
|
||||
|
@ -1,10 +1,16 @@
|
||||
GTM601 UMTS modem audio interface CODEC
|
||||
|
||||
This device has no configuration interface. Sample rate is fixed - 8kHz.
|
||||
This device has no configuration interface. The sample rate and channels are
|
||||
based on the compatible string
|
||||
"option,gtm601" = 8kHz mono
|
||||
"broadmobi,bm818" = 48kHz stereo
|
||||
|
||||
Required properties:
|
||||
|
||||
- compatible : "option,gtm601"
|
||||
- compatible : one of
|
||||
"option,gtm601"
|
||||
"broadmobi,bm818"
|
||||
|
||||
|
||||
Example:
|
||||
|
||||
|
55
Documentation/devicetree/bindings/sound/ingenic,codec.yaml
Normal file
55
Documentation/devicetree/bindings/sound/ingenic,codec.yaml
Normal file
@ -0,0 +1,55 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/sound/ingenic,codec.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Ingenic JZ47xx internal codec DT bindings
|
||||
|
||||
maintainers:
|
||||
- Paul Cercueil <paul@crapouillou.net>
|
||||
|
||||
properties:
|
||||
$nodename:
|
||||
pattern: '^audio-codec@.*'
|
||||
|
||||
compatible:
|
||||
oneOf:
|
||||
- const: ingenic,jz4770-codec
|
||||
- const: ingenic,jz4725b-codec
|
||||
- const: ingenic,jz4740-codec
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
clock-names:
|
||||
items:
|
||||
- const: aic
|
||||
|
||||
'#sound-dai-cells':
|
||||
const: 0
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- clock-names
|
||||
- '#sound-dai-cells'
|
||||
|
||||
examples:
|
||||
- |
|
||||
#include <dt-bindings/clock/jz4740-cgu.h>
|
||||
codec: audio-codec@10020080 {
|
||||
compatible = "ingenic,jz4740-codec";
|
||||
reg = <0x10020080 0x8>;
|
||||
#sound-dai-cells = <0>;
|
||||
clocks = <&cgu JZ4740_CLK_AIC>;
|
||||
clock-names = "aic";
|
||||
};
|
||||
|
||||
...
|
@ -1,20 +0,0 @@
|
||||
Ingenic JZ4725B codec controller
|
||||
|
||||
Required properties:
|
||||
- compatible : "ingenic,jz4725b-codec"
|
||||
- reg : codec registers location and length
|
||||
- clocks : phandle to the AIC clock.
|
||||
- clock-names: must be set to "aic".
|
||||
- #sound-dai-cells: Must be set to 0.
|
||||
|
||||
Example:
|
||||
|
||||
codec: audio-codec@100200a4 {
|
||||
compatible = "ingenic,jz4725b-codec";
|
||||
reg = <0x100200a4 0x8>;
|
||||
|
||||
#sound-dai-cells = <0>;
|
||||
|
||||
clocks = <&cgu JZ4725B_CLK_AIC>;
|
||||
clock-names = "aic";
|
||||
};
|
@ -1,20 +0,0 @@
|
||||
Ingenic JZ4740 codec controller
|
||||
|
||||
Required properties:
|
||||
- compatible : "ingenic,jz4740-codec"
|
||||
- reg : codec registers location and length
|
||||
- clocks : phandle to the AIC clock.
|
||||
- clock-names: must be set to "aic".
|
||||
- #sound-dai-cells: Must be set to 0.
|
||||
|
||||
Example:
|
||||
|
||||
codec: audio-codec@10020080 {
|
||||
compatible = "ingenic,jz4740-codec";
|
||||
reg = <0x10020080 0x8>;
|
||||
|
||||
#sound-dai-cells = <0>;
|
||||
|
||||
clocks = <&cgu JZ4740_CLK_AIC>;
|
||||
clock-names = "aic";
|
||||
};
|
@ -5,7 +5,10 @@ This binding describes the SDM845 sound card, which uses qdsp for audio.
|
||||
- compatible:
|
||||
Usage: required
|
||||
Value type: <stringlist>
|
||||
Definition: must be "qcom,sdm845-sndcard"
|
||||
Definition: must be one of these
|
||||
"qcom,sdm845-sndcard"
|
||||
"qcom,db845c-sndcard"
|
||||
"lenovo,yoga-c630-sndcard"
|
||||
|
||||
- audio-routing:
|
||||
Usage: Optional
|
||||
|
175
Documentation/devicetree/bindings/sound/qcom,wcd934x.yaml
Normal file
175
Documentation/devicetree/bindings/sound/qcom,wcd934x.yaml
Normal file
@ -0,0 +1,175 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/sound/qcom,wcd934x.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Bindings for Qualcomm WCD9340/WCD9341 Audio Codec
|
||||
|
||||
maintainers:
|
||||
- Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
|
||||
|
||||
description: |
|
||||
Qualcomm WCD9340/WCD9341 Codec is a standalone Hi-Fi audio codec IC.
|
||||
It has in-built Soundwire controller, pin controller, interrupt mux and
|
||||
supports both I2S/I2C and SLIMbus audio interfaces.
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
const: slim217,250
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
interrupts:
|
||||
maxItems: 1
|
||||
|
||||
reset-gpios:
|
||||
description: GPIO spec for reset line to use
|
||||
maxItems: 1
|
||||
|
||||
slim-ifc-dev: true
|
||||
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
clock-names:
|
||||
const: extclk
|
||||
|
||||
vdd-buck-supply:
|
||||
description: A reference to the 1.8V buck supply
|
||||
|
||||
vdd-buck-sido-supply:
|
||||
description: A reference to the 1.8V SIDO buck supply
|
||||
|
||||
vdd-rx-supply:
|
||||
description: A reference to the 1.8V rx supply
|
||||
|
||||
vdd-tx-supply:
|
||||
description: A reference to the 1.8V tx supply
|
||||
|
||||
vdd-vbat-supply:
|
||||
description: A reference to the vbat supply
|
||||
|
||||
vdd-io-supply:
|
||||
description: A reference to the 1.8V I/O supply
|
||||
|
||||
vdd-micbias-supply:
|
||||
description: A reference to the micbias supply
|
||||
|
||||
qcom,micbias1-microvolt:
|
||||
description: micbias1 voltage
|
||||
minimum: 1800000
|
||||
maximum: 2850000
|
||||
|
||||
qcom,micbias2-microvolt:
|
||||
description: micbias2 voltage
|
||||
minimum: 1800000
|
||||
maximum: 2850000
|
||||
|
||||
qcom,micbias3-microvolt:
|
||||
description: micbias3 voltage
|
||||
minimum: 1800000
|
||||
maximum: 2850000
|
||||
|
||||
qcom,micbias4-microvolt:
|
||||
description: micbias4 voltage
|
||||
minimum: 1800000
|
||||
maximum: 2850000
|
||||
|
||||
clock-output-names:
|
||||
const: mclk
|
||||
|
||||
clock-frequency:
|
||||
description: Clock frequency of output clk in Hz
|
||||
|
||||
interrupt-controller: true
|
||||
|
||||
'#interrupt-cells':
|
||||
const: 1
|
||||
|
||||
'#clock-cells':
|
||||
const: 0
|
||||
|
||||
'#sound-dai-cells':
|
||||
const: 1
|
||||
|
||||
"#address-cells":
|
||||
const: 1
|
||||
|
||||
"#size-cells":
|
||||
const: 1
|
||||
|
||||
gpio@42:
|
||||
type: object
|
||||
allOf:
|
||||
- $ref: ../gpio/qcom,wcd934x-gpio.yaml#
|
||||
|
||||
patternProperties:
|
||||
"^.*@[0-9a-f]+$":
|
||||
type: object
|
||||
description: |
|
||||
WCD934x subnode for each slave device. Bindings of each subnode
|
||||
depend on the specific driver providing the functionality and are
|
||||
documented in their respective bindings.
|
||||
|
||||
properties:
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- reg
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- reset-gpios
|
||||
- slim-ifc-dev
|
||||
- interrupts
|
||||
- interrupt-controller
|
||||
- clock-frequency
|
||||
- clock-output-names
|
||||
- qcom,micbias1-microvolt
|
||||
- qcom,micbias2-microvolt
|
||||
- qcom,micbias3-microvolt
|
||||
- qcom,micbias4-microvolt
|
||||
- "#interrupt-cells"
|
||||
- "#clock-cells"
|
||||
- "#sound-dai-cells"
|
||||
- "#address-cells"
|
||||
- "#size-cells"
|
||||
|
||||
examples:
|
||||
- |
|
||||
codec@1,0{
|
||||
compatible = "slim217,250";
|
||||
reg = <1 0>;
|
||||
reset-gpios = <&tlmm 64 0>;
|
||||
slim-ifc-dev = <&wcd9340_ifd>;
|
||||
#sound-dai-cells = <1>;
|
||||
interrupt-parent = <&tlmm>;
|
||||
interrupts = <54 4>;
|
||||
interrupt-controller;
|
||||
#interrupt-cells = <1>;
|
||||
#clock-cells = <0>;
|
||||
clock-frequency = <9600000>;
|
||||
clock-output-names = "mclk";
|
||||
qcom,micbias1-microvolt = <1800000>;
|
||||
qcom,micbias2-microvolt = <1800000>;
|
||||
qcom,micbias3-microvolt = <1800000>;
|
||||
qcom,micbias4-microvolt = <1800000>;
|
||||
clock-names = "extclk";
|
||||
clocks = <&rpmhcc 2>;
|
||||
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
|
||||
gpio@42 {
|
||||
compatible = "qcom,wcd9340-gpio";
|
||||
reg = <0x42 0x2>;
|
||||
gpio-controller;
|
||||
#gpio-cells = <2>;
|
||||
};
|
||||
};
|
||||
|
||||
...
|
68
Documentation/devicetree/bindings/sound/qcom,wsa881x.yaml
Normal file
68
Documentation/devicetree/bindings/sound/qcom,wsa881x.yaml
Normal file
@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/sound/qcom,wsa881x.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Bindings for Qualcomm WSA8810/WSA8815 Class-D Smart Speaker Amplifier
|
||||
|
||||
maintainers:
|
||||
- Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
|
||||
|
||||
description: |
|
||||
WSA8810 is a class-D smart speaker amplifier and WSA8815
|
||||
is a high-output power class-D smart speaker amplifier.
|
||||
Their primary operating mode uses a SoundWire digital audio
|
||||
interface. This binding is for SoundWire interface.
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
const: sdw10217201000
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
powerdown-gpios:
|
||||
description: GPIO spec for Powerdown/Shutdown line to use
|
||||
maxItems: 1
|
||||
|
||||
'#thermal-sensor-cells':
|
||||
const: 0
|
||||
|
||||
'#sound-dai-cells':
|
||||
const: 0
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- powerdown-gpios
|
||||
- "#thermal-sensor-cells"
|
||||
- "#sound-dai-cells"
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
soundwire@c2d0000 {
|
||||
#address-cells = <2>;
|
||||
#size-cells = <0>;
|
||||
reg = <0x0c2d0000 0x2000>;
|
||||
|
||||
speaker@0,1 {
|
||||
compatible = "sdw10217201000";
|
||||
reg = <0 1>;
|
||||
powerdown-gpios = <&wcdpinctrl 2 0>;
|
||||
#thermal-sensor-cells = <0>;
|
||||
#sound-dai-cells = <0>;
|
||||
};
|
||||
|
||||
speaker@0,2 {
|
||||
compatible = "sdw10217201000";
|
||||
reg = <0 2>;
|
||||
powerdown-gpios = <&wcdpinctrl 2 0>;
|
||||
#thermal-sensor-cells = <0>;
|
||||
#sound-dai-cells = <0>;
|
||||
};
|
||||
};
|
||||
|
||||
...
|
17
Documentation/devicetree/bindings/sound/rt1015.txt
Normal file
17
Documentation/devicetree/bindings/sound/rt1015.txt
Normal file
@ -0,0 +1,17 @@
|
||||
RT1015 Mono Class D Audio Amplifier
|
||||
|
||||
This device supports I2C only.
|
||||
|
||||
Required properties:
|
||||
|
||||
- compatible : "realtek,rt1015".
|
||||
|
||||
- reg : The I2C address of the device.
|
||||
|
||||
|
||||
Example:
|
||||
|
||||
rt1015: codec@28 {
|
||||
compatible = "realtek,rt1015";
|
||||
reg = <0x28>;
|
||||
};
|
@ -10,6 +10,10 @@ Required properties:
|
||||
|
||||
- interrupts : The CODEC's interrupt output.
|
||||
|
||||
- avdd-supply: Power supply for AVDD, providing 1.8V.
|
||||
|
||||
- cpvdd-supply: Power supply for CPVDD, providing 3.5V.
|
||||
|
||||
Optional properties:
|
||||
|
||||
- hp-detect-gpios:
|
||||
|
@ -0,0 +1,160 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/thermal/allwinner,sun8i-a83t-ths.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner SUN8I Thermal Controller Device Tree Bindings
|
||||
|
||||
maintainers:
|
||||
- Vasily Khoruzhick <anarsoul@gmail.com>
|
||||
- Yangtao Li <tiny.windzz@gmail.com>
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
enum:
|
||||
- allwinner,sun8i-a83t-ths
|
||||
- allwinner,sun8i-h3-ths
|
||||
- allwinner,sun8i-r40-ths
|
||||
- allwinner,sun50i-a64-ths
|
||||
- allwinner,sun50i-h5-ths
|
||||
- allwinner,sun50i-h6-ths
|
||||
|
||||
clocks:
|
||||
minItems: 1
|
||||
maxItems: 2
|
||||
items:
|
||||
- description: Bus Clock
|
||||
- description: Module Clock
|
||||
|
||||
clock-names:
|
||||
minItems: 1
|
||||
maxItems: 2
|
||||
items:
|
||||
- const: bus
|
||||
- const: mod
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
interrupts:
|
||||
maxItems: 1
|
||||
|
||||
resets:
|
||||
maxItems: 1
|
||||
|
||||
nvmem-cells:
|
||||
maxItems: 1
|
||||
description: Calibration data for thermal sensors
|
||||
|
||||
nvmem-cell-names:
|
||||
const: calibration
|
||||
|
||||
# See ./thermal.txt for details
|
||||
"#thermal-sensor-cells":
|
||||
enum:
|
||||
- 0
|
||||
- 1
|
||||
|
||||
allOf:
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: allwinner,sun50i-h6-ths
|
||||
|
||||
then:
|
||||
properties:
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
clock-names:
|
||||
maxItems: 1
|
||||
|
||||
else:
|
||||
properties:
|
||||
clocks:
|
||||
minItems: 2
|
||||
|
||||
clock-names:
|
||||
minItems: 2
|
||||
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: allwinner,sun8i-h3-ths
|
||||
|
||||
then:
|
||||
properties:
|
||||
"#thermal-sensor-cells":
|
||||
const: 0
|
||||
|
||||
else:
|
||||
properties:
|
||||
"#thermal-sensor-cells":
|
||||
const: 1
|
||||
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
enum:
|
||||
- allwinner,sun8i-h3-ths
|
||||
- allwinner,sun8i-r40-ths
|
||||
- allwinner,sun50i-a64-ths
|
||||
- allwinner,sun50i-h5-ths
|
||||
- allwinner,sun50i-h6-ths
|
||||
|
||||
then:
|
||||
required:
|
||||
- clocks
|
||||
- clock-names
|
||||
- resets
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- interrupts
|
||||
- '#thermal-sensor-cells'
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
thermal-sensor@1f04000 {
|
||||
compatible = "allwinner,sun8i-a83t-ths";
|
||||
reg = <0x01f04000 0x100>;
|
||||
interrupts = <0 31 0>;
|
||||
nvmem-cells = <&ths_calibration>;
|
||||
nvmem-cell-names = "calibration";
|
||||
#thermal-sensor-cells = <1>;
|
||||
};
|
||||
|
||||
- |
|
||||
thermal-sensor@1c25000 {
|
||||
compatible = "allwinner,sun8i-h3-ths";
|
||||
reg = <0x01c25000 0x400>;
|
||||
clocks = <&ccu 0>, <&ccu 1>;
|
||||
clock-names = "bus", "mod";
|
||||
resets = <&ccu 2>;
|
||||
interrupts = <0 31 0>;
|
||||
nvmem-cells = <&ths_calibration>;
|
||||
nvmem-cell-names = "calibration";
|
||||
#thermal-sensor-cells = <0>;
|
||||
};
|
||||
|
||||
- |
|
||||
thermal-sensor@5070400 {
|
||||
compatible = "allwinner,sun50i-h6-ths";
|
||||
reg = <0x05070400 0x100>;
|
||||
clocks = <&ccu 0>;
|
||||
clock-names = "bus";
|
||||
resets = <&ccu 2>;
|
||||
interrupts = <0 15 0>;
|
||||
nvmem-cells = <&ths_calibration>;
|
||||
nvmem-cell-names = "calibration";
|
||||
#thermal-sensor-cells = <1>;
|
||||
};
|
||||
|
||||
...
|
@ -0,0 +1,48 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/thermal/brcm,avs-ro-thermal.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Broadcom AVS ring oscillator thermal
|
||||
|
||||
maintainers:
|
||||
- Stefan Wahren <wahrenst@gmx.net>
|
||||
|
||||
description: |+
|
||||
The thermal node should be the child of a syscon node with the
|
||||
required property:
|
||||
|
||||
- compatible: Should be one of the following:
|
||||
"brcm,bcm2711-avs-monitor", "syscon", "simple-mfd"
|
||||
|
||||
Refer to the bindings described in
|
||||
Documentation/devicetree/bindings/mfd/syscon.txt
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
const: brcm,bcm2711-thermal
|
||||
|
||||
# See ./thermal.txt for details
|
||||
"#thermal-sensor-cells":
|
||||
const: 0
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- '#thermal-sensor-cells'
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
avs-monitor@7d5d2000 {
|
||||
compatible = "brcm,bcm2711-avs-monitor",
|
||||
"syscon", "simple-mfd";
|
||||
reg = <0x7d5d2000 0xf00>;
|
||||
|
||||
thermal: thermal {
|
||||
compatible = "brcm,bcm2711-thermal";
|
||||
#thermal-sensor-cells = <0>;
|
||||
};
|
||||
};
|
||||
...
|
@ -3,9 +3,13 @@
|
||||
Thermal management core, provided by the AVS TMON hardware block.
|
||||
|
||||
Required properties:
|
||||
- compatible: must be "brcm,avs-tmon" and/or "brcm,avs-tmon-bcm7445"
|
||||
- compatible: must be one of:
|
||||
"brcm,avs-tmon-bcm7216"
|
||||
"brcm,avs-tmon-bcm7445"
|
||||
"brcm,avs-tmon"
|
||||
- reg: address range for the AVS TMON registers
|
||||
- interrupts: temperature monitor interrupt, for high/low threshold triggers
|
||||
- interrupts: temperature monitor interrupt, for high/low threshold triggers,
|
||||
required except for "brcm,avs-tmon-bcm7216"
|
||||
- interrupt-names: should be "tmon"
|
||||
|
||||
Example:
|
||||
|
189
Documentation/driver-api/thermal/cpu-idle-cooling.rst
Normal file
189
Documentation/driver-api/thermal/cpu-idle-cooling.rst
Normal file
@ -0,0 +1,189 @@
|
||||
|
||||
Situation:
|
||||
----------
|
||||
|
||||
Under certain circumstances a SoC can reach a critical temperature
|
||||
limit and is unable to stabilize the temperature around a temperature
|
||||
control. When the SoC has to stabilize the temperature, the kernel can
|
||||
act on a cooling device to mitigate the dissipated power. When the
|
||||
critical temperature is reached, a decision must be taken to reduce
|
||||
the temperature, which in turn impacts performance.
|
||||
|
||||
Another situation is when the silicon temperature continues to
|
||||
increase even after the dynamic leakage is reduced to its minimum by
|
||||
clock gating the component. This runaway phenomenon can continue due
|
||||
to the static leakage. The only solution is to power down the
|
||||
component, thus dropping the dynamic and static leakage that will
|
||||
allow the component to cool down.
|
||||
|
||||
Last but not least, the system can ask for a specific power budget but
|
||||
because of the OPP density, we can only choose an OPP with a power
|
||||
budget lower than the requested one and under-utilize the CPU, thus
|
||||
losing performance. In other words, one OPP under-utilizes the CPU
|
||||
with a power less than the requested power budget and the next OPP
|
||||
exceeds the power budget. An intermediate OPP could have been used if
|
||||
it were present.
|
||||
|
||||
Solutions:
|
||||
----------
|
||||
|
||||
If we can remove the static and the dynamic leakage for a specific
|
||||
duration in a controlled period, the SoC temperature will
|
||||
decrease. Acting on the idle state duration or the idle cycle
|
||||
injection period, we can mitigate the temperature by modulating the
|
||||
power budget.
|
||||
|
||||
The Operating Performance Point (OPP) density has a great influence on
|
||||
the control precision of cpufreq, however different vendors have a
|
||||
plethora of OPP density, and some have large power gap between OPPs,
|
||||
that will result in loss of performance during thermal control and
|
||||
loss of power in other scenarios.
|
||||
|
||||
At a specific OPP, we can assume that injecting idle cycle on all CPUs
|
||||
belonging to the same cluster, with a duration greater than the cluster
|
||||
idle state target residency, we lead to dropping the static and the
|
||||
dynamic leakage for this period (modulo the energy needed to enter
|
||||
this state). So the sustainable power with idle cycles has a linear
|
||||
relation with the OPP’s sustainable power and can be computed with a
|
||||
coefficient similar to:
|
||||
|
||||
Power(IdleCycle) = Coef x Power(OPP)
|
||||
|
||||
Idle Injection:
|
||||
---------------
|
||||
|
||||
The base concept of the idle injection is to force the CPU to go to an
|
||||
idle state for a specified time each control cycle, it provides
|
||||
another way to control CPU power and heat in addition to
|
||||
cpufreq. Ideally, if all CPUs belonging to the same cluster inject
|
||||
their idle cycles synchronously, the cluster can reach its power down
|
||||
state with a minimum power consumption and reduce the static leakage
|
||||
to almost zero. However, this idle cycle injection will add extra
|
||||
latencies as the CPUs will have to wakeup from a deep sleep state.
|
||||
|
||||
We use a fixed duration of idle injection that gives an acceptable
|
||||
performance penalty and a fixed latency. Mitigation can be increased
|
||||
or decreased by modulating the duty cycle of the idle injection.
|
||||
|
||||
^
|
||||
|
|
||||
|
|
||||
|------- -------
|
||||
|_______|_______________________|_______|___________
|
||||
|
||||
<------>
|
||||
idle <---------------------->
|
||||
running
|
||||
|
||||
<----------------------------->
|
||||
duty cycle 25%
|
||||
|
||||
|
||||
The implementation of the cooling device bases the number of states on
|
||||
the duty cycle percentage. When no mitigation is happening the cooling
|
||||
device state is zero, meaning the duty cycle is 0%.
|
||||
|
||||
When the mitigation begins, depending on the governor's policy, a
|
||||
starting state is selected. With a fixed idle duration and the duty
|
||||
cycle (aka the cooling device state), the running duration can be
|
||||
computed.
|
||||
|
||||
The governor will change the cooling device state thus the duty cycle
|
||||
and this variation will modulate the cooling effect.
|
||||
|
||||
^
|
||||
|
|
||||
|
|
||||
|------- -------
|
||||
|_______|_______________|_______|___________
|
||||
|
||||
<------>
|
||||
idle <-------------->
|
||||
running
|
||||
|
||||
<----------------------------->
|
||||
duty cycle 33%
|
||||
|
||||
|
||||
^
|
||||
|
|
||||
|
|
||||
|------- -------
|
||||
|_______|_______|_______|___________
|
||||
|
||||
<------>
|
||||
idle <------>
|
||||
running
|
||||
|
||||
<------------->
|
||||
duty cycle 50%
|
||||
|
||||
The idle injection duration value must comply with the constraints:
|
||||
|
||||
- It is less than or equal to the latency we tolerate when the
|
||||
mitigation begins. It is platform dependent and will depend on the
|
||||
user experience, reactivity vs performance trade off we want. This
|
||||
value should be specified.
|
||||
|
||||
- It is greater than the idle state’s target residency we want to go
|
||||
for thermal mitigation, otherwise we end up consuming more energy.
|
||||
|
||||
Power considerations
|
||||
--------------------
|
||||
|
||||
When we reach the thermal trip point, we have to sustain a specified
|
||||
power for a specific temperature but at this time we consume:
|
||||
|
||||
Power = Capacitance x Voltage^2 x Frequency x Utilisation
|
||||
|
||||
... which is more than the sustainable power (or there is something
|
||||
wrong in the system setup). The ‘Capacitance’ and ‘Utilisation’ are a
|
||||
fixed value, ‘Voltage’ and the ‘Frequency’ are fixed artificially
|
||||
because we don’t want to change the OPP. We can group the
|
||||
‘Capacitance’ and the ‘Utilisation’ into a single term which is the
|
||||
‘Dynamic Power Coefficient (Cdyn)’. Simplifying the above, we have:
|
||||
|
||||
Pdyn = Cdyn x Voltage^2 x Frequency
|
||||
|
||||
The power allocator governor will ask us somehow to reduce our power
|
||||
in order to target the sustainable power defined in the device
|
||||
tree. So with the idle injection mechanism, we want an average power
|
||||
(Ptarget) resulting in an amount of time running at full power on a
|
||||
specific OPP and idle another amount of time. That could be put in an
|
||||
equation:
|
||||
|
||||
P(opp)target = ((Trunning x P(opp)running) + (Tidle x P(opp)idle)) /
|
||||
(Trunning + Tidle)
|
||||
...
|
||||
|
||||
Tidle = Trunning x ((P(opp)running / P(opp)target) - 1)
|
||||
|
||||
At this point if we know the running period for the CPU, that gives us
|
||||
the idle injection we need. Alternatively if we have the idle
|
||||
injection duration, we can compute the running duration with:
|
||||
|
||||
Trunning = Tidle / ((P(opp)running / P(opp)target) - 1)
|
||||
|
||||
Practically, if the running power is less than the targeted power, we
|
||||
end up with a negative time value, so obviously the equation usage is
|
||||
bound to a power reduction, hence a higher OPP is needed to have the
|
||||
running power greater than the targeted power.
|
||||
|
||||
However, in this demonstration we ignore three aspects:
|
||||
|
||||
* The static leakage is not defined here, we can introduce it in the
|
||||
equation but assuming it will be zero most of the time as it is
|
||||
difficult to get the values from the SoC vendors
|
||||
|
||||
* The idle state wake up latency (or entry + exit latency) is not
|
||||
taken into account, it must be added in the equation in order to
|
||||
rigorously compute the idle injection
|
||||
|
||||
* The injected idle duration must be greater than the idle state
|
||||
target residency, otherwise we end up consuming more energy and
|
||||
potentially invert the mitigation effect
|
||||
|
||||
So the final equation is:
|
||||
|
||||
Trunning = (Tidle - Twakeup ) /
|
||||
(((P(opp)dyn + P(opp)static ) - P(opp)target) / P(opp)target )
|
@ -4,7 +4,7 @@ Kernel driver exynos_tmu
|
||||
|
||||
Supported chips:
|
||||
|
||||
* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
|
||||
* ARM Samsung Exynos4, Exynos5 series of SoC
|
||||
|
||||
Datasheet: Not publicly available
|
||||
|
||||
@ -14,7 +14,7 @@ Authors: Amit Daniel <amit.daniel@samsung.com>
|
||||
TMU controller Description:
|
||||
---------------------------
|
||||
|
||||
This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC.
|
||||
This driver allows to read temperature inside Samsung Exynos4/5 series of SoC.
|
||||
|
||||
The chip only exposes the measured 8-bit temperature code value
|
||||
through a register.
|
||||
@ -43,7 +43,7 @@ The three equations are:
|
||||
Trimming info for 85 degree Celsius (stored at TRIMINFO register)
|
||||
Temperature code measured at 85 degree Celsius which is unchanged
|
||||
|
||||
TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
|
||||
TMU(Thermal Management Unit) in Exynos4/5 generates interrupt
|
||||
when temperature exceeds pre-defined levels.
|
||||
The maximum number of configurable threshold is five.
|
||||
The threshold levels are defined as follows::
|
||||
@ -67,7 +67,7 @@ TMU driver description:
|
||||
The exynos thermal driver is structured as::
|
||||
|
||||
Kernel Core thermal framework
|
||||
(thermal_core.c, step_wise.c, cpu_cooling.c)
|
||||
(thermal_core.c, step_wise.c, cpufreq_cooling.c)
|
||||
^
|
||||
|
|
||||
|
|
||||
|
@ -234,8 +234,8 @@ HKDF is more flexible, is nonreversible, and evenly distributes
|
||||
entropy from the master key. HKDF is also standardized and widely
|
||||
used by other software, whereas the AES-128-ECB based KDF is ad-hoc.
|
||||
|
||||
Per-file keys
|
||||
-------------
|
||||
Per-file encryption keys
|
||||
------------------------
|
||||
|
||||
Since each master key can protect many files, it is necessary to
|
||||
"tweak" the encryption of each file so that the same plaintext in two
|
||||
@ -268,9 +268,9 @@ is greater than that of an AES-256-XTS key.
|
||||
Therefore, to improve performance and save memory, for Adiantum a
|
||||
"direct key" configuration is supported. When the user has enabled
|
||||
this by setting FSCRYPT_POLICY_FLAG_DIRECT_KEY in the fscrypt policy,
|
||||
per-file keys are not used. Instead, whenever any data (contents or
|
||||
filenames) is encrypted, the file's 16-byte nonce is included in the
|
||||
IV. Moreover:
|
||||
per-file encryption keys are not used. Instead, whenever any data
|
||||
(contents or filenames) is encrypted, the file's 16-byte nonce is
|
||||
included in the IV. Moreover:
|
||||
|
||||
- For v1 encryption policies, the encryption is done directly with the
|
||||
master key. Because of this, users **must not** use the same master
|
||||
@ -302,6 +302,16 @@ For master keys used for v2 encryption policies, a unique 16-byte "key
|
||||
identifier" is also derived using the KDF. This value is stored in
|
||||
the clear, since it is needed to reliably identify the key itself.
|
||||
|
||||
Dirhash keys
|
||||
------------
|
||||
|
||||
For directories that are indexed using a secret-keyed dirhash over the
|
||||
plaintext filenames, the KDF is also used to derive a 128-bit
|
||||
SipHash-2-4 key per directory in order to hash filenames. This works
|
||||
just like deriving a per-file encryption key, except that a different
|
||||
KDF context is used. Currently, only casefolded ("case-insensitive")
|
||||
encrypted directories use this style of hashing.
|
||||
|
||||
Encryption modes and usage
|
||||
==========================
|
||||
|
||||
@ -325,11 +335,11 @@ used.
|
||||
Adiantum is a (primarily) stream cipher-based mode that is fast even
|
||||
on CPUs without dedicated crypto instructions. It's also a true
|
||||
wide-block mode, unlike XTS. It can also eliminate the need to derive
|
||||
per-file keys. However, it depends on the security of two primitives,
|
||||
XChaCha12 and AES-256, rather than just one. See the paper
|
||||
"Adiantum: length-preserving encryption for entry-level processors"
|
||||
(https://eprint.iacr.org/2018/720.pdf) for more details. To use
|
||||
Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
|
||||
per-file encryption keys. However, it depends on the security of two
|
||||
primitives, XChaCha12 and AES-256, rather than just one. See the
|
||||
paper "Adiantum: length-preserving encryption for entry-level
|
||||
processors" (https://eprint.iacr.org/2018/720.pdf) for more details.
|
||||
To use Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
|
||||
implementations of ChaCha and NHPoly1305 should be enabled, e.g.
|
||||
CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
|
||||
|
||||
@ -513,7 +523,9 @@ FS_IOC_SET_ENCRYPTION_POLICY can fail with the following errors:
|
||||
- ``EEXIST``: the file is already encrypted with an encryption policy
|
||||
different from the one specified
|
||||
- ``EINVAL``: an invalid encryption policy was specified (invalid
|
||||
version, mode(s), or flags; or reserved bits were set)
|
||||
version, mode(s), or flags; or reserved bits were set); or a v1
|
||||
encryption policy was specified but the directory has the casefold
|
||||
flag enabled (casefolding is incompatible with v1 policies).
|
||||
- ``ENOKEY``: a v2 encryption policy was specified, but the key with
|
||||
the specified ``master_key_identifier`` has not been added, nor does
|
||||
the process have the CAP_FOWNER capability in the initial user
|
||||
@ -638,7 +650,8 @@ follows::
|
||||
struct fscrypt_add_key_arg {
|
||||
struct fscrypt_key_specifier key_spec;
|
||||
__u32 raw_size;
|
||||
__u32 __reserved[9];
|
||||
__u32 key_id;
|
||||
__u32 __reserved[8];
|
||||
__u8 raw[];
|
||||
};
|
||||
|
||||
@ -655,6 +668,12 @@ follows::
|
||||
} u;
|
||||
};
|
||||
|
||||
struct fscrypt_provisioning_key_payload {
|
||||
__u32 type;
|
||||
__u32 __reserved;
|
||||
__u8 raw[];
|
||||
};
|
||||
|
||||
:c:type:`struct fscrypt_add_key_arg` must be zeroed, then initialized
|
||||
as follows:
|
||||
|
||||
@ -677,9 +696,26 @@ as follows:
|
||||
``Documentation/security/keys/core.rst``).
|
||||
|
||||
- ``raw_size`` must be the size of the ``raw`` key provided, in bytes.
|
||||
Alternatively, if ``key_id`` is nonzero, this field must be 0, since
|
||||
in that case the size is implied by the specified Linux keyring key.
|
||||
|
||||
- ``key_id`` is 0 if the raw key is given directly in the ``raw``
|
||||
field. Otherwise ``key_id`` is the ID of a Linux keyring key of
|
||||
type "fscrypt-provisioning" whose payload is a :c:type:`struct
|
||||
fscrypt_provisioning_key_payload` whose ``raw`` field contains the
|
||||
raw key and whose ``type`` field matches ``key_spec.type``. Since
|
||||
``raw`` is variable-length, the total size of this key's payload
|
||||
must be ``sizeof(struct fscrypt_provisioning_key_payload)`` plus the
|
||||
raw key size. The process must have Search permission on this key.
|
||||
|
||||
Most users should leave this 0 and specify the raw key directly.
|
||||
The support for specifying a Linux keyring key is intended mainly to
|
||||
allow re-adding keys after a filesystem is unmounted and re-mounted,
|
||||
without having to store the raw keys in userspace memory.
|
||||
|
||||
- ``raw`` is a variable-length field which must contain the actual
|
||||
key, ``raw_size`` bytes long.
|
||||
key, ``raw_size`` bytes long. Alternatively, if ``key_id`` is
|
||||
nonzero, then this field is unused.
|
||||
|
||||
For v2 policy keys, the kernel keeps track of which user (identified
|
||||
by effective user ID) added the key, and only allows the key to be
|
||||
@ -701,11 +737,16 @@ FS_IOC_ADD_ENCRYPTION_KEY can fail with the following errors:
|
||||
|
||||
- ``EACCES``: FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR was specified, but the
|
||||
caller does not have the CAP_SYS_ADMIN capability in the initial
|
||||
user namespace
|
||||
user namespace; or the raw key was specified by Linux key ID but the
|
||||
process lacks Search permission on the key.
|
||||
- ``EDQUOT``: the key quota for this user would be exceeded by adding
|
||||
the key
|
||||
- ``EINVAL``: invalid key size or key specifier type, or reserved bits
|
||||
were set
|
||||
- ``EKEYREJECTED``: the raw key was specified by Linux key ID, but the
|
||||
key has the wrong type
|
||||
- ``ENOKEY``: the raw key was specified by Linux key ID, but no key
|
||||
exists with that ID
|
||||
- ``ENOTTY``: this type of filesystem does not implement encryption
|
||||
- ``EOPNOTSUPP``: the kernel was not configured with encryption
|
||||
support for this filesystem, or the filesystem superblock has not
|
||||
@ -1108,8 +1149,8 @@ The context structs contain the same information as the corresponding
|
||||
policy structs (see `Setting an encryption policy`_), except that the
|
||||
context structs also contain a nonce. The nonce is randomly generated
|
||||
by the kernel and is used as KDF input or as a tweak to cause
|
||||
different files to be encrypted differently; see `Per-file keys`_ and
|
||||
`DIRECT_KEY policies`_.
|
||||
different files to be encrypted differently; see `Per-file encryption
|
||||
keys`_ and `DIRECT_KEY policies`_.
|
||||
|
||||
Data path changes
|
||||
-----------------
|
||||
@ -1161,7 +1202,7 @@ filesystem-specific hash(es) needed for directory lookups. This
|
||||
allows the filesystem to still, with a high degree of confidence, map
|
||||
the filename given in ->lookup() back to a particular directory entry
|
||||
that was previously listed by readdir(). See :c:type:`struct
|
||||
fscrypt_digested_name` in the source for more details.
|
||||
fscrypt_nokey_name` in the source for more details.
|
||||
|
||||
Note that the precise way that filenames are presented to userspace
|
||||
without the key is subject to change in the future. It is only meant
|
||||
|
@ -22,9 +22,11 @@ Contents:
|
||||
intel/iavf
|
||||
intel/ice
|
||||
google/gve
|
||||
marvell/octeontx2
|
||||
mellanox/mlx5
|
||||
netronome/nfp
|
||||
pensando/ionic
|
||||
stmicro/stmmac
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
|
159
Documentation/networking/device_drivers/marvell/octeontx2.rst
Normal file
159
Documentation/networking/device_drivers/marvell/octeontx2.rst
Normal file
@ -0,0 +1,159 @@
|
||||
.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
|
||||
====================================
|
||||
Marvell OcteonTx2 RVU Kernel Drivers
|
||||
====================================
|
||||
|
||||
Copyright (c) 2020 Marvell International Ltd.
|
||||
|
||||
Contents
|
||||
========
|
||||
|
||||
- `Overview`_
|
||||
- `Drivers`_
|
||||
- `Basic packet flow`_
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
Resource virtualization unit (RVU) on Marvell's OcteonTX2 SOC maps HW
|
||||
resources from the network, crypto and other functional blocks into
|
||||
PCI-compatible physical and virtual functions. Each functional block
|
||||
again has multiple local functions (LFs) for provisioning to PCI devices.
|
||||
RVU supports multiple PCIe SRIOV physical functions (PFs) and virtual
|
||||
functions (VFs). PF0 is called the administrative / admin function (AF)
|
||||
and has privileges to provision RVU functional block's LFs to each of the
|
||||
PF/VF.
|
||||
|
||||
RVU managed networking functional blocks
|
||||
- Network pool or buffer allocator (NPA)
|
||||
- Network interface controller (NIX)
|
||||
- Network parser CAM (NPC)
|
||||
- Schedule/Synchronize/Order unit (SSO)
|
||||
- Loopback interface (LBK)
|
||||
|
||||
RVU managed non-networking functional blocks
|
||||
- Crypto accelerator (CPT)
|
||||
- Scheduled timers unit (TIM)
|
||||
- Schedule/Synchronize/Order unit (SSO)
|
||||
Used for both networking and non networking usecases
|
||||
|
||||
Resource provisioning examples
|
||||
- A PF/VF with NIX-LF & NPA-LF resources works as a pure network device
|
||||
- A PF/VF with CPT-LF resource works as a pure crypto offload device.
|
||||
|
||||
RVU functional blocks are highly configurable as per software requirements.
|
||||
|
||||
Firmware sets up the following before the kernel boots
|
||||
- Enables required number of RVU PFs based on number of physical links.
|
||||
- Number of VFs per PF are either static or configurable at compile time.
|
||||
Based on config, firmware assigns VFs to each of the PFs.
|
||||
- Also assigns MSIX vectors to each of PF and VFs.
|
||||
- These are not changed after kernel boot.
|
||||
|
||||
Drivers
|
||||
=======
|
||||
|
||||
Linux kernel will have multiple drivers registering to different PF and VFs
|
||||
of RVU. Wrt networking there will be 3 flavours of drivers.
|
||||
|
||||
Admin Function driver
|
||||
---------------------
|
||||
|
||||
As mentioned above RVU PF0 is called the admin function (AF), this driver
|
||||
supports resource provisioning and configuration of functional blocks.
|
||||
Doesn't handle any I/O. It sets up a few basic things but most of the
|
||||
functionality is achieved via configuration requests from PFs and VFs.
|
||||
|
||||
PF/VFs communicates with AF via a shared memory region (mailbox). Upon
|
||||
receiving requests AF does resource provisioning and other HW configuration.
|
||||
AF is always attached to host kernel, but PFs and their VFs may be used by host
|
||||
kernel itself, or attached to VMs or to userspace applications like
|
||||
DPDK etc. So AF has to handle provisioning/configuration requests sent
|
||||
by any device from any domain.
|
||||
|
||||
AF driver also interacts with underlying firmware to
|
||||
- Manage physical ethernet links ie CGX LMACs.
|
||||
- Retrieve information like speed, duplex, autoneg etc
|
||||
- Retrieve PHY EEPROM and stats.
|
||||
- Configure FEC, PAM modes
|
||||
- etc
|
||||
|
||||
From pure networking side AF driver supports following functionality.
|
||||
- Map a physical link to a RVU PF to which a netdev is registered.
|
||||
- Attach NIX and NPA block LFs to RVU PF/VF which provide buffer pools, RQs, SQs
|
||||
for regular networking functionality.
|
||||
- Flow control (pause frames) enable/disable/config.
|
||||
- HW PTP timestamping related config.
|
||||
- NPC parser profile config, basically how to parse pkt and what info to extract.
|
||||
- NPC extract profile config, what to extract from the pkt to match data in MCAM entries.
|
||||
- Manage NPC MCAM entries, upon request can frame and install requested packet forwarding rules.
|
||||
- Defines receive side scaling (RSS) algorithms.
|
||||
- Defines segmentation offload algorithms (eg TSO)
|
||||
- VLAN stripping, capture and insertion config.
|
||||
- SSO and TIM blocks config which provide packet scheduling support.
|
||||
- Debugfs support, to check current resource provisioning, current status of
|
||||
NPA pools, NIX RQ, SQ and CQs, various stats etc which helps in debugging issues.
|
||||
- And many more.
|
||||
|
||||
Physical Function driver
|
||||
------------------------
|
||||
|
||||
This RVU PF handles IO, is mapped to a physical ethernet link and this
|
||||
driver registers a netdev. This supports SR-IOV. As said above this driver
|
||||
communicates with AF with a mailbox. To retrieve information from physical
|
||||
links this driver talks to AF and AF gets that info from firmware and responds
|
||||
back ie cannot talk to firmware directly.
|
||||
|
||||
Supports ethtool for configuring links, RSS, queue count, queue size,
|
||||
flow control, ntuple filters, dump PHY EEPROM, config FEC etc.
|
||||
|
||||
Virtual Function driver
|
||||
-----------------------
|
||||
|
||||
There are two types of VFs, VFs that share the physical link with their parent
|
||||
SR-IOV PF and the VFs which work in pairs using internal HW loopback channels (LBK).
|
||||
|
||||
Type1:
|
||||
- These VFs and their parent PF share a physical link and are used for outside communication.
|
||||
- VFs cannot communicate with AF directly, they send mbox message to PF and PF
|
||||
forwards that to AF. AF after processing, responds back to PF and PF forwards
|
||||
the reply to VF.
|
||||
- From functionality point of view there is no difference between PF and VF as same type
|
||||
HW resources are attached to both. But user would be able to configure a few things only
|
||||
from PF as PF is treated as owner/admin of the link.
|
||||
|
||||
Type2:
|
||||
- RVU PF0 ie admin function creates these VFs and maps them to loopback block's channels.
|
||||
- A set of two VFs (VF0 & VF1, VF2 & VF3 .. so on) works as a pair ie pkts sent out of
|
||||
VF0 will be received by VF1 and vice versa.
|
||||
- These VFs can be used by applications or virtual machines to communicate between them
|
||||
without sending traffic outside. There is no switch present in HW, hence the support
|
||||
for loopback VFs.
|
||||
- These communicate directly with AF (PF0) via mbox.
|
||||
|
||||
Except for the IO channels or links used for packet reception and transmission there is
|
||||
no other difference between these VF types. AF driver takes care of IO channel mapping,
|
||||
hence same VF driver works for both types of devices.
|
||||
|
||||
Basic packet flow
|
||||
=================
|
||||
|
||||
Ingress
|
||||
-------
|
||||
|
||||
1. CGX LMAC receives packet.
|
||||
2. Forwards the packet to the NIX block.
|
||||
3. Then submitted to NPC block for parsing and then MCAM lookup to get the destination RVU device.
|
||||
4. NIX LF attached to the destination RVU device allocates a buffer from RQ mapped buffer pool of NPA block LF.
|
||||
5. RQ may be selected by RSS or by configuring MCAM rule with a RQ number.
|
||||
6. Packet is DMA'ed and driver is notified.
|
||||
|
||||
Egress
|
||||
------
|
||||
|
||||
1. Driver prepares a send descriptor and submits to SQ for transmission.
|
||||
2. The SQ is already configured (by AF) to transmit on a specific link/channel.
|
||||
3. The SQ descriptor ring is maintained in buffers allocated from SQ mapped pool of NPA block LF.
|
||||
4. NIX block transmits the pkt on the designated channel.
|
||||
5. NPC MCAM entries can be installed to divert pkt onto a different channel.
|
@ -82,3 +82,24 @@ Features
|
||||
contain one or more packets. The send buffer is an optimization, the driver
|
||||
will use slower method to handle very large packets or if the send buffer
|
||||
area is exhausted.
|
||||
|
||||
XDP support
|
||||
-----------
|
||||
XDP (eXpress Data Path) is a feature that runs eBPF bytecode at the early
|
||||
stage when packets arrive at a NIC card. The goal is to increase performance
|
||||
for packet processing, reducing the overhead of SKB allocation and other
|
||||
upper network layers.
|
||||
|
||||
hv_netvsc supports XDP in native mode, and transparently sets the XDP
|
||||
program on the associated VF NIC as well.
|
||||
|
||||
Setting / unsetting XDP program on synthetic NIC (netvsc) propagates to
|
||||
VF NIC automatically. Setting / unsetting XDP program on VF NIC directly
|
||||
is not recommended, also not propagated to synthetic NIC, and may be
|
||||
overwritten by setting of synthetic NIC.
|
||||
|
||||
XDP program cannot run with LRO (RSC) enabled, so you need to disable LRO
|
||||
before running XDP:
|
||||
ethtool -K eth0 lro off
|
||||
|
||||
XDP_REDIRECT action is not yet supported.
|
||||
|
@ -131,3 +131,119 @@ abi_drv_reset
|
||||
abi_drv_load_ifc
|
||||
Defines a list of PF devices allowed to load FW on the device.
|
||||
This variable is not currently user configurable.
|
||||
|
||||
Statistics
|
||||
==========
|
||||
|
||||
Following device statistics are available through the ``ethtool -S`` interface:
|
||||
|
||||
.. flat-table:: NFP device statistics
|
||||
:header-rows: 1
|
||||
:widths: 3 1 11
|
||||
|
||||
* - Name
|
||||
- ID
|
||||
- Meaning
|
||||
|
||||
* - dev_rx_discards
|
||||
- 1
|
||||
- Packet can be discarded on the RX path for one of the following reasons:
|
||||
|
||||
* The NIC is not in promisc mode, and the destination MAC address
|
||||
doesn't match the interface's MAC address.
|
||||
* The received packet is larger than the max buffer size on the host.
|
||||
I.e. it exceeds the Layer 3 MRU.
|
||||
* There is no freelist descriptor available on the host for the packet.
|
||||
It is likely that the NIC couldn't cache one in time.
|
||||
* A BPF program discarded the packet.
|
||||
* The datapath drop action was executed.
|
||||
* The MAC discarded the packet due to lack of ingress buffer space
|
||||
on the NIC.
|
||||
|
||||
* - dev_rx_errors
|
||||
- 2
|
||||
- A packet can be counted (and dropped) as RX error for the following
|
||||
reasons:
|
||||
|
||||
* A problem with the VEB lookup (only when SR-IOV is used).
|
||||
* A physical layer problem that causes Ethernet errors, like FCS or
|
||||
alignment errors. The cause is usually faulty cables or SFPs.
|
||||
|
||||
* - dev_rx_bytes
|
||||
- 3
|
||||
- Total number of bytes received.
|
||||
|
||||
* - dev_rx_uc_bytes
|
||||
- 4
|
||||
- Unicast bytes received.
|
||||
|
||||
* - dev_rx_mc_bytes
|
||||
- 5
|
||||
- Multicast bytes received.
|
||||
|
||||
* - dev_rx_bc_bytes
|
||||
- 6
|
||||
- Broadcast bytes received.
|
||||
|
||||
* - dev_rx_pkts
|
||||
- 7
|
||||
- Total number of packets received.
|
||||
|
||||
* - dev_rx_mc_pkts
|
||||
- 8
|
||||
- Multicast packets received.
|
||||
|
||||
* - dev_rx_bc_pkts
|
||||
- 9
|
||||
- Broadcast packets received.
|
||||
|
||||
* - dev_tx_discards
|
||||
- 10
|
||||
- A packet can be discarded in the TX direction if the MAC is
|
||||
being flow controlled and the NIC runs out of TX queue space.
|
||||
|
||||
* - dev_tx_errors
|
||||
- 11
|
||||
- A packet can be counted as TX error (and dropped) for one of the
|
||||
following reasons:
|
||||
|
||||
* The packet is an LSO segment, but the Layer 3 or Layer 4 offset
|
||||
could not be determined. Therefore LSO could not continue.
|
||||
* An invalid packet descriptor was received over PCIe.
|
||||
* The packet Layer 3 length exceeds the device MTU.
|
||||
* An error on the MAC/physical layer. Usually due to faulty cables or
|
||||
SFPs.
|
||||
* A CTM buffer could not be allocated.
|
||||
* The packet offset was incorrect and could not be fixed by the NIC.
|
||||
|
||||
* - dev_tx_bytes
|
||||
- 12
|
||||
- Total number of bytes transmitted.
|
||||
|
||||
* - dev_tx_uc_bytes
|
||||
- 13
|
||||
- Unicast bytes transmitted.
|
||||
|
||||
* - dev_tx_mc_bytes
|
||||
- 14
|
||||
- Multicast bytes transmitted.
|
||||
|
||||
* - dev_tx_bc_bytes
|
||||
- 15
|
||||
- Broadcast bytes transmitted.
|
||||
|
||||
* - dev_tx_pkts
|
||||
- 16
|
||||
- Total number of packets transmitted.
|
||||
|
||||
* - dev_tx_mc_pkts
|
||||
- 17
|
||||
- Multicast packets transmitted.
|
||||
|
||||
* - dev_tx_bc_pkts
|
||||
- 18
|
||||
- Broadcast packets transmitted.
|
||||
|
||||
Note that statistics unknown to the driver will be displayed as
|
||||
``dev_unknown_stat$ID``, where ``$ID`` refers to the second column
|
||||
above.
|
||||
|
697
Documentation/networking/device_drivers/stmicro/stmmac.rst
Normal file
697
Documentation/networking/device_drivers/stmicro/stmmac.rst
Normal file
@ -0,0 +1,697 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0+
|
||||
|
||||
==============================================================
|
||||
Linux Driver for the Synopsys(R) Ethernet Controllers "stmmac"
|
||||
==============================================================
|
||||
|
||||
Authors: Giuseppe Cavallaro <peppe.cavallaro@st.com>,
|
||||
Alexandre Torgue <alexandre.torgue@st.com>, Jose Abreu <joabreu@synopsys.com>
|
||||
|
||||
Contents
|
||||
========
|
||||
|
||||
- In This Release
|
||||
- Feature List
|
||||
- Kernel Configuration
|
||||
- Command Line Parameters
|
||||
- Driver Information and Notes
|
||||
- Debug Information
|
||||
- Support
|
||||
|
||||
In This Release
|
||||
===============
|
||||
|
||||
This file describes the stmmac Linux Driver for all the Synopsys(R) Ethernet
|
||||
Controllers.
|
||||
|
||||
Currently, this network device driver is for all STi embedded MAC/GMAC
|
||||
(i.e. 7xxx/5xxx SoCs), SPEAr (arm), Loongson1B (mips) and XILINX XC2V3000
|
||||
FF1152AMT0221 D1215994A VIRTEX FPGA board. The Synopsys Ethernet QoS 5.0 IPK
|
||||
is also supported.
|
||||
|
||||
DesignWare(R) Cores Ethernet MAC 10/100/1000 Universal version 3.70a
|
||||
(and older) and DesignWare(R) Cores Ethernet Quality-of-Service version 4.0
|
||||
(and upper) have been used for developing this driver as well as
|
||||
DesignWare(R) Cores XGMAC - 10G Ethernet MAC.
|
||||
|
||||
This driver supports both the platform bus and PCI.
|
||||
|
||||
This driver includes support for the following Synopsys(R) DesignWare(R)
|
||||
Cores Ethernet Controllers and corresponding minimum and maximum versions:
|
||||
|
||||
+-------------------------------+--------------+--------------+--------------+
|
||||
| Controller Name | Min. Version | Max. Version | Abbrev. Name |
|
||||
+===============================+==============+==============+==============+
|
||||
| Ethernet MAC Universal | N/A | 3.73a | GMAC |
|
||||
+-------------------------------+--------------+--------------+--------------+
|
||||
| Ethernet Quality-of-Service | 4.00a | N/A | GMAC4+ |
|
||||
+-------------------------------+--------------+--------------+--------------+
|
||||
| XGMAC - 10G Ethernet MAC | 2.10a | N/A | XGMAC2+ |
|
||||
+-------------------------------+--------------+--------------+--------------+
|
||||
|
||||
For questions related to hardware requirements, refer to the documentation
|
||||
supplied with your Ethernet adapter. All hardware requirements listed apply
|
||||
to use with Linux.
|
||||
|
||||
Feature List
|
||||
============
|
||||
|
||||
The following features are available in this driver:
|
||||
- GMII/MII/RGMII/SGMII/RMII/XGMII Interface
|
||||
- Half-Duplex / Full-Duplex Operation
|
||||
- Energy Efficient Ethernet (EEE)
|
||||
- IEEE 802.3x PAUSE Packets (Flow Control)
|
||||
- RMON/MIB Counters
|
||||
- IEEE 1588 Timestamping (PTP)
|
||||
- Pulse-Per-Second Output (PPS)
|
||||
- MDIO Clause 22 / Clause 45 Interface
|
||||
- MAC Loopback
|
||||
- ARP Offloading
|
||||
- Automatic CRC / PAD Insertion and Checking
|
||||
- Checksum Offload for Received and Transmitted Packets
|
||||
- Standard or Jumbo Ethernet Packets
|
||||
- Source Address Insertion / Replacement
|
||||
- VLAN TAG Insertion / Replacement / Deletion / Filtering (HASH and PERFECT)
|
||||
- Programmable TX and RX Watchdog and Coalesce Settings
|
||||
- Destination Address Filtering (PERFECT)
|
||||
- HASH Filtering (Multicast)
|
||||
- Layer 3 / Layer 4 Filtering
|
||||
- Remote Wake-Up Detection
|
||||
- Receive Side Scaling (RSS)
|
||||
- Frame Preemption for TX and RX
|
||||
- Programmable Burst Length, Threshold, Queue Size
|
||||
- Multiple Queues (up to 8)
|
||||
- Multiple Scheduling Algorithms (TX: WRR, DWRR, WFQ, SP, CBS, EST, TBS;
|
||||
RX: WRR, SP)
|
||||
- Flexible RX Parser
|
||||
- TCP / UDP Segmentation Offload (TSO, USO)
|
||||
- Split Header (SPH)
|
||||
- Safety Features (ECC Protection, Data Parity Protection)
|
||||
- Selftests using Ethtool
|
||||
|
||||
Kernel Configuration
|
||||
====================
|
||||
|
||||
The kernel configuration option is ``CONFIG_STMMAC_ETH``:
|
||||
- ``CONFIG_STMMAC_PLATFORM``: is to enable the platform driver.
|
||||
- ``CONFIG_STMMAC_PCI``: is to enable the pci driver.
|
||||
|
||||
Command Line Parameters
|
||||
=======================
|
||||
|
||||
If the driver is built as a module the following optional parameters are used
|
||||
by entering them on the command line with the modprobe command using this
|
||||
syntax (e.g. for PCI module)::
|
||||
|
||||
modprobe stmmac_pci [<option>=<VAL1>,<VAL2>,...]
|
||||
|
||||
Driver parameters can be also passed in command line by using::
|
||||
|
||||
stmmaceth=watchdog:100,chain_mode=1
|
||||
|
||||
The default value for each parameter is generally the recommended setting,
|
||||
unless otherwise noted.
|
||||
|
||||
watchdog
|
||||
--------
|
||||
:Valid Range: 5000-None
|
||||
:Default Value: 5000
|
||||
|
||||
This parameter overrides the transmit timeout in milliseconds.
|
||||
|
||||
debug
|
||||
-----
|
||||
:Valid Range: 0-16 (0=none,...,16=all)
|
||||
:Default Value: 0
|
||||
|
||||
This parameter adjusts the level of debug messages displayed in the system
|
||||
logs.
|
||||
|
||||
phyaddr
|
||||
-------
|
||||
:Valid Range: 0-31
|
||||
:Default Value: -1
|
||||
|
||||
This parameter overrides the physical address of the PHY device.
|
||||
|
||||
flow_ctrl
|
||||
---------
|
||||
:Valid Range: 0-3 (0=off,1=rx,2=tx,3=rx/tx)
|
||||
:Default Value: 3
|
||||
|
||||
This parameter changes the default Flow Control ability.
|
||||
|
||||
pause
|
||||
-----
|
||||
:Valid Range: 0-65535
|
||||
:Default Value: 65535
|
||||
|
||||
This parameter changes the default Flow Control Pause time.
|
||||
|
||||
tc
|
||||
--
|
||||
:Valid Range: 64-256
|
||||
:Default Value: 64
|
||||
|
||||
This parameter changes the default HW FIFO Threshold control value.
|
||||
|
||||
buf_sz
|
||||
------
|
||||
:Valid Range: 1536-16384
|
||||
:Default Value: 1536
|
||||
|
||||
This parameter changes the default RX DMA packet buffer size.
|
||||
|
||||
eee_timer
|
||||
---------
|
||||
:Valid Range: 0-None
|
||||
:Default Value: 1000
|
||||
|
||||
This parameter changes the default LPI TX Expiration time in milliseconds.
|
||||
|
||||
chain_mode
|
||||
----------
|
||||
:Valid Range: 0-1 (0=off,1=on)
|
||||
:Default Value: 0
|
||||
|
||||
This parameter changes the default mode of operation from Ring Mode to
|
||||
Chain Mode.
|
||||
|
||||
Driver Information and Notes
|
||||
============================
|
||||
|
||||
Transmit Process
|
||||
----------------
|
||||
|
||||
The xmit method is invoked when the kernel needs to transmit a packet; it sets
|
||||
the descriptors in the ring and informs the DMA engine that there is a packet
|
||||
ready to be transmitted.
|
||||
|
||||
By default, the driver sets the ``NETIF_F_SG`` bit in the features field of
|
||||
the ``net_device`` structure, enabling the scatter-gather feature. This is
|
||||
true on chips and configurations where the checksum can be done in hardware.
|
||||
|
||||
Once the controller has finished transmitting the packet, a timer will be
|
||||
scheduled to release the transmit resources.
|
||||
|
||||
Receive Process
|
||||
---------------
|
||||
|
||||
When one or more packets are received, an interrupt happens. The interrupts
|
||||
are not queued, so the driver has to scan all the descriptors in the ring
|
||||
during the receive process.
|
||||
|
||||
This is based on NAPI, so the interrupt handler signals only if there is work
|
||||
to be done, and it exits. Then the poll method will be scheduled at some
|
||||
future point.
|
||||
|
||||
The incoming packets are stored, by the DMA, in a list of pre-allocated socket
|
||||
buffers in order to avoid the memcpy (zero-copy).
|
||||
|
||||
Interrupt Mitigation
|
||||
--------------------
|
||||
|
||||
The driver is able to mitigate the number of its DMA interrupts using NAPI for
|
||||
the reception on chips older than the 3.50. New chips have an HW RX Watchdog
|
||||
used for this mitigation.
|
||||
|
||||
Mitigation parameters can be tuned by ethtool.
|
||||
|
||||
WoL
|
||||
---
|
||||
|
||||
Wake up on LAN feature through Magic and Unicast frames is supported for the
|
||||
GMAC, GMAC4/5 and XGMAC cores.
|
||||
|
||||
DMA Descriptors
|
||||
---------------
|
||||
|
||||
Driver handles both normal and alternate descriptors. The latter has been only
|
||||
tested on DesignWare(R) Cores Ethernet MAC Universal version 3.41a and later.
|
||||
|
||||
stmmac supports DMA descriptor to operate both in dual buffer (RING) and
|
||||
linked-list(CHAINED) mode. In RING each descriptor points to two data buffer
|
||||
pointers whereas in CHAINED mode they point to only one data buffer pointer.
|
||||
RING mode is the default.
|
||||
|
||||
In CHAINED mode each descriptor will have a pointer to the next descriptor in the
|
||||
list, hence creating the explicit chaining in the descriptor itself, whereas
|
||||
such explicit chaining is not possible in RING mode.
|
||||
|
||||
Extended Descriptors
|
||||
--------------------
|
||||
|
||||
The extended descriptors give us information about the Ethernet payload when
|
||||
it is carrying PTP packets or TCP/UDP/ICMP over IP. These are not available on
|
||||
GMAC Synopsys(R) chips older than the 3.50. At probe time the driver will
|
||||
decide if these can be actually used. This support also is mandatory for PTPv2
|
||||
because the extra descriptors are used for saving the hardware timestamps and
|
||||
Extended Status.
|
||||
|
||||
Ethtool Support
|
||||
---------------
|
||||
|
||||
Ethtool is supported. For example, driver statistics (including RMON),
|
||||
internal errors can be taken using::
|
||||
|
||||
ethtool -S ethX
|
||||
|
||||
Ethtool selftests are also supported. This allows to do some early sanity
|
||||
checks to the HW using MAC and PHY loopback mechanisms::
|
||||
|
||||
ethtool -t ethX
|
||||
|
||||
Jumbo and Segmentation Offloading
|
||||
---------------------------------
|
||||
|
||||
Jumbo frames are supported and tested for the GMAC. The GSO has been also
|
||||
added but it's performed in software. LRO is not supported.
|
||||
|
||||
TSO Support
|
||||
-----------
|
||||
|
||||
TSO (TCP Segmentation Offload) feature is supported by GMAC > 4.x and XGMAC
|
||||
chip family. When a packet is sent through TCP protocol, the TCP stack ensures
|
||||
that the SKB provided to the low level driver (stmmac in our case) matches
|
||||
with the maximum frame len (IP header + TCP header + payload <= 1500 bytes
|
||||
(for MTU set to 1500)). It means that if an application using TCP wants to send
|
||||
a packet which will have a length (after adding headers) > 1514 the packet
|
||||
will be split in several TCP packets: The data payload is split and headers
|
||||
(TCP/IP ..) are added. It is done by software.
|
||||
|
||||
When TSO is enabled, the TCP stack doesn't care about the maximum frame length
|
||||
and provides the SKB packet to stmmac as it is. The GMAC IP will have to perform
|
||||
the segmentation by itself to match the maximum frame length.
|
||||
|
||||
This feature can be enabled in device tree through ``snps,tso`` entry.
|
||||
|
||||
Energy Efficient Ethernet
|
||||
-------------------------
|
||||
|
||||
Energy Efficient Ethernet (EEE) enables IEEE 802.3 MAC sublayer along with a
|
||||
family of Physical layer to operate in the Low Power Idle (LPI) mode. The EEE
|
||||
mode supports the IEEE 802.3 MAC operation at 100Mbps, 1000Mbps and 10Gbps.
|
||||
|
||||
The LPI mode allows power saving by switching off parts of the communication
|
||||
device functionality when there is no data to be transmitted & received.
|
||||
The systems on both sides of the link can disable some functionalities and
|
||||
save power during the period of low-link utilization. The MAC controls whether
|
||||
the system should enter or exit the LPI mode and communicate this to PHY.
|
||||
|
||||
As soon as the interface is opened, the driver verifies if the EEE can be
|
||||
supported. This is done by looking at both the DMA HW capability register and
|
||||
the PHY devices MMD registers.
|
||||
|
||||
To enter TX LPI mode the driver needs to have a software timer that enables
|
||||
and disables the LPI mode when there is nothing to be transmitted.
|
||||
|
||||
Precision Time Protocol (PTP)
|
||||
-----------------------------
|
||||
|
||||
The driver supports the IEEE 1588-2002, Precision Time Protocol (PTP), which
|
||||
enables precise synchronization of clocks in measurement and control systems
|
||||
implemented with technologies such as network communication.
|
||||
|
||||
In addition to the basic timestamp features mentioned in IEEE 1588-2002
|
||||
Timestamps, new GMAC cores support the advanced timestamp features.
|
||||
IEEE 1588-2008 can be enabled when configuring the Kernel.
|
||||
|
||||
SGMII/RGMII Support
|
||||
-------------------
|
||||
|
||||
New GMAC devices provide their own way to manage RGMII/SGMII. This information is
|
||||
available at run-time by looking at the HW capability register. This means
|
||||
that the stmmac can manage auto-negotiation and link status w/o using the
|
||||
PHYLIB stuff. In fact, the HW provides a subset of extended registers to
|
||||
restart the ANE, verify Full/Half duplex mode and Speed. Thanks to these
|
||||
registers, it is possible to look at the Auto-negotiated Link Partner Ability.
|
||||
|
||||
Physical
|
||||
--------
|
||||
|
||||
The driver is compatible with Physical Abstraction Layer to be connected with
|
||||
PHY and GPHY devices.
|
||||
|
||||
Platform Information
|
||||
--------------------
|
||||
|
||||
Several pieces of information can be passed through the platform and device-tree.
|
||||
|
||||
::
|
||||
|
||||
struct plat_stmmacenet_data {
|
||||
|
||||
1) Bus identifier::
|
||||
|
||||
int bus_id;
|
||||
|
||||
2) PHY Physical Address. If set to -1 the driver will pick the first PHY it
|
||||
finds::
|
||||
|
||||
int phy_addr;
|
||||
|
||||
3) PHY Device Interface::
|
||||
|
||||
int interface;
|
||||
|
||||
4) Specific platform fields for the MDIO bus::
|
||||
|
||||
struct stmmac_mdio_bus_data *mdio_bus_data;
|
||||
|
||||
5) Internal DMA parameters::
|
||||
|
||||
struct stmmac_dma_cfg *dma_cfg;
|
||||
|
||||
6) Fixed CSR Clock Range selection::
|
||||
|
||||
int clk_csr;
|
||||
|
||||
7) HW uses the GMAC core::
|
||||
|
||||
int has_gmac;
|
||||
|
||||
8) If set the MAC will use Enhanced Descriptors::
|
||||
|
||||
int enh_desc;
|
||||
|
||||
9) Core is able to perform TX Checksum and/or RX Checksum in HW::
|
||||
|
||||
int tx_coe;
|
||||
int rx_coe;
|
||||
|
||||
11) Some HWs are not able to perform the csum in HW for over-sized frames due
|
||||
to limited buffer sizes. Setting this flag the csum will be done in SW on
|
||||
JUMBO frames::
|
||||
|
||||
int bugged_jumbo;
|
||||
|
||||
12) Core has the embedded power module::
|
||||
|
||||
int pmt;
|
||||
|
||||
13) Force DMA to use the Store and Forward mode or Threshold mode::
|
||||
|
||||
int force_sf_dma_mode;
|
||||
int force_thresh_dma_mode;
|
||||
|
||||
15) Force to disable the RX Watchdog feature and switch to NAPI mode::
|
||||
|
||||
int riwt_off;
|
||||
|
||||
16) Limit the maximum operating speed and MTU::
|
||||
|
||||
int max_speed;
|
||||
int maxmtu;
|
||||
|
||||
18) Number of Multicast/Unicast filters::
|
||||
|
||||
int multicast_filter_bins;
|
||||
int unicast_filter_entries;
|
||||
|
||||
20) Limit the maximum TX and RX FIFO size::
|
||||
|
||||
int tx_fifo_size;
|
||||
int rx_fifo_size;
|
||||
|
||||
21) Use the specified number of TX and RX Queues::
|
||||
|
||||
u32 rx_queues_to_use;
|
||||
u32 tx_queues_to_use;
|
||||
|
||||
22) Use the specified TX and RX scheduling algorithm::
|
||||
|
||||
u8 rx_sched_algorithm;
|
||||
u8 tx_sched_algorithm;
|
||||
|
||||
23) Internal TX and RX Queue parameters::
|
||||
|
||||
struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES];
|
||||
struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES];
|
||||
|
||||
24) This callback is used for modifying some syscfg registers (on ST SoCs)
|
||||
according to the link speed negotiated by the physical layer::
|
||||
|
||||
void (*fix_mac_speed)(void *priv, unsigned int speed);
|
||||
|
||||
25) Callbacks used for calling a custom initialization; This is sometimes
|
||||
necessary on some platforms (e.g. ST boxes) where the HW needs to have set
|
||||
some PIO lines or system cfg registers. init/exit callbacks should not use
|
||||
or modify platform data::
|
||||
|
||||
int (*init)(struct platform_device *pdev, void *priv);
|
||||
void (*exit)(struct platform_device *pdev, void *priv);
|
||||
|
||||
26) Perform HW setup of the bus. For example, on some ST platforms this field
|
||||
is used to configure the AMBA bridge to generate more efficient STBus traffic::
|
||||
|
||||
struct mac_device_info *(*setup)(void *priv);
|
||||
void *bsp_priv;
|
||||
|
||||
27) Internal clocks and rates::
|
||||
|
||||
struct clk *stmmac_clk;
|
||||
struct clk *pclk;
|
||||
struct clk *clk_ptp_ref;
|
||||
unsigned int clk_ptp_rate;
|
||||
unsigned int clk_ref_rate;
|
||||
s32 ptp_max_adj;
|
||||
|
||||
28) Main reset::
|
||||
|
||||
struct reset_control *stmmac_rst;
|
||||
|
||||
29) AXI Internal Parameters::
|
||||
|
||||
struct stmmac_axi *axi;
|
||||
|
||||
30) HW uses GMAC>4 cores::
|
||||
|
||||
int has_gmac4;
|
||||
|
||||
31) HW is sun8i based::
|
||||
|
||||
bool has_sun8i;
|
||||
|
||||
32) Enables TSO feature::
|
||||
|
||||
bool tso_en;
|
||||
|
||||
33) Enables Receive Side Scaling (RSS) feature::
|
||||
|
||||
int rss_en;
|
||||
|
||||
34) MAC Port selection::
|
||||
|
||||
int mac_port_sel_speed;
|
||||
|
||||
35) Enables TX LPI Clock Gating::
|
||||
|
||||
bool en_tx_lpi_clockgating;
|
||||
|
||||
36) HW uses XGMAC>2.10 cores::
|
||||
|
||||
int has_xgmac;
|
||||
|
||||
::
|
||||
|
||||
}
|
||||
|
||||
For MDIO bus data, we have:
|
||||
|
||||
::
|
||||
|
||||
struct stmmac_mdio_bus_data {
|
||||
|
||||
1) PHY mask passed when MDIO bus is registered::
|
||||
|
||||
unsigned int phy_mask;
|
||||
|
||||
2) List of IRQs, one per PHY::
|
||||
|
||||
int *irqs;
|
||||
|
||||
3) If IRQs is NULL, use this for probed PHY::
|
||||
|
||||
int probed_phy_irq;
|
||||
|
||||
4) Set to true if PHY needs reset::
|
||||
|
||||
bool needs_reset;
|
||||
|
||||
::
|
||||
|
||||
}
|
||||
|
||||
For DMA engine configuration, we have:
|
||||
|
||||
::
|
||||
|
||||
struct stmmac_dma_cfg {
|
||||
|
||||
1) Programmable Burst Length (TX and RX)::
|
||||
|
||||
int pbl;
|
||||
|
||||
2) If set, DMA TX / RX will use this value rather than pbl::
|
||||
|
||||
int txpbl;
|
||||
int rxpbl;
|
||||
|
||||
3) Enable 8xPBL::
|
||||
|
||||
bool pblx8;
|
||||
|
||||
4) Enable Fixed or Mixed burst::
|
||||
|
||||
int fixed_burst;
|
||||
int mixed_burst;
|
||||
|
||||
5) Enable Address Aligned Beats::
|
||||
|
||||
bool aal;
|
||||
|
||||
6) Enable Enhanced Addressing (> 32 bits)::
|
||||
|
||||
bool eame;
|
||||
|
||||
::
|
||||
|
||||
}
|
||||
|
||||
For DMA AXI parameters, we have:
|
||||
|
||||
::
|
||||
|
||||
struct stmmac_axi {
|
||||
|
||||
1) Enable AXI LPI::
|
||||
|
||||
bool axi_lpi_en;
|
||||
bool axi_xit_frm;
|
||||
|
||||
2) Set AXI Write / Read maximum outstanding requests::
|
||||
|
||||
u32 axi_wr_osr_lmt;
|
||||
u32 axi_rd_osr_lmt;
|
||||
|
||||
3) Set AXI 4KB bursts::
|
||||
|
||||
bool axi_kbbe;
|
||||
|
||||
4) Set AXI maximum burst length map::
|
||||
|
||||
u32 axi_blen[AXI_BLEN];
|
||||
|
||||
5) Set AXI Fixed burst / mixed burst::
|
||||
|
||||
bool axi_fb;
|
||||
bool axi_mb;
|
||||
|
||||
6) Set AXI rebuild incrx mode::
|
||||
|
||||
bool axi_rb;
|
||||
|
||||
::
|
||||
|
||||
}
|
||||
|
||||
For the RX Queues configuration, we have:
|
||||
|
||||
::
|
||||
|
||||
struct stmmac_rxq_cfg {
|
||||
|
||||
1) Mode to use (DCB or AVB)::
|
||||
|
||||
u8 mode_to_use;
|
||||
|
||||
2) DMA channel to use::
|
||||
|
||||
u32 chan;
|
||||
|
||||
3) Packet routing, if applicable::
|
||||
|
||||
u8 pkt_route;
|
||||
|
||||
4) Use priority routing, and priority to route::
|
||||
|
||||
bool use_prio;
|
||||
u32 prio;
|
||||
|
||||
::
|
||||
|
||||
}
|
||||
|
||||
For the TX Queues configuration, we have:
|
||||
|
||||
::
|
||||
|
||||
struct stmmac_txq_cfg {
|
||||
|
||||
1) Queue weight in scheduler::
|
||||
|
||||
u32 weight;
|
||||
|
||||
2) Mode to use (DCB or AVB)::
|
||||
|
||||
u8 mode_to_use;
|
||||
|
||||
3) Credit Base Shaper Parameters::
|
||||
|
||||
u32 send_slope;
|
||||
u32 idle_slope;
|
||||
u32 high_credit;
|
||||
u32 low_credit;
|
||||
|
||||
4) Use priority scheduling, and priority::
|
||||
|
||||
bool use_prio;
|
||||
u32 prio;
|
||||
|
||||
::
|
||||
|
||||
}
|
||||
|
||||
Device Tree Information
|
||||
-----------------------
|
||||
|
||||
Please refer to the following document:
|
||||
Documentation/devicetree/bindings/net/snps,dwmac.yaml
|
||||
|
||||
HW Capabilities
|
||||
---------------
|
||||
|
||||
Note that, starting from new chips where the HW capability register is
|
||||
available, many configurations are discovered at run-time, for example to
|
||||
understand if EEE, HW csum, PTP, enhanced descriptor etc are actually
|
||||
available. As a strategy adopted in this driver, the information from the HW
|
||||
capability register can replace what has been passed from the platform.
|
||||
|
||||
Debug Information
|
||||
=================
|
||||
|
||||
The driver exports a lot of information, e.g. internal statistics, debug
|
||||
information, MAC and DMA registers etc.
|
||||
|
||||
These can be read in several ways depending on the type of the information
|
||||
actually needed.
|
||||
|
||||
For example, a user can use the ethtool support to get statistics: e.g.
|
||||
using: ``ethtool -S ethX`` (that shows the Management counters (MMC) if
|
||||
supported) or see the MAC/DMA registers: e.g. using: ``ethtool -d ethX``
|
||||
|
||||
Compiling the Kernel with ``CONFIG_DEBUG_FS`` the driver will export the
|
||||
following debugfs entries:
|
||||
|
||||
- ``descriptors_status``: To show the DMA TX/RX descriptor rings
|
||||
- ``dma_cap``: To show the HW Capabilities
|
||||
|
||||
Developer can also use the ``debug`` module parameter to get further debug
|
||||
information (please see: NETIF Msg Level).
|
||||
|
||||
Support
|
||||
=======
|
||||
|
||||
If an issue is identified with the released source code on a supported kernel
|
||||
with a supported adapter, email the specific information related to the
|
||||
issue to netdev@vger.kernel.org
|
@ -1,401 +0,0 @@
|
||||
STMicroelectronics 10/100/1000 Synopsys Ethernet driver
|
||||
|
||||
Copyright (C) 2007-2015 STMicroelectronics Ltd
|
||||
Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
|
||||
|
||||
This is the driver for the MAC 10/100/1000 on-chip Ethernet controllers
|
||||
(Synopsys IP blocks).
|
||||
|
||||
Currently this network device driver is for all STi embedded MAC/GMAC
|
||||
(i.e. 7xxx/5xxx SoCs), SPEAr (arm), Loongson1B (mips) and XLINX XC2V3000
|
||||
FF1152AMT0221 D1215994A VIRTEX FPGA board.
|
||||
|
||||
DWC Ether MAC 10/100/1000 Universal version 3.70a (and older) and DWC Ether
|
||||
MAC 10/100 Universal version 4.0 have been used for developing this driver.
|
||||
|
||||
This driver supports both the platform bus and PCI.
|
||||
|
||||
Please, for more information also visit: www.stlinux.com
|
||||
|
||||
1) Kernel Configuration
|
||||
The kernel configuration option is STMMAC_ETH:
|
||||
Device Drivers ---> Network device support ---> Ethernet (1000 Mbit) --->
|
||||
STMicroelectronics 10/100/1000 Ethernet driver (STMMAC_ETH)
|
||||
|
||||
CONFIG_STMMAC_PLATFORM: is to enable the platform driver.
|
||||
CONFIG_STMMAC_PCI: is to enable the pci driver.
|
||||
|
||||
2) Driver parameters list:
|
||||
debug: message level (0: no output, 16: all);
|
||||
phyaddr: to manually provide the physical address to the PHY device;
|
||||
buf_sz: DMA buffer size;
|
||||
tc: control the HW FIFO threshold;
|
||||
watchdog: transmit timeout (in milliseconds);
|
||||
flow_ctrl: Flow control ability [on/off];
|
||||
pause: Flow Control Pause Time;
|
||||
eee_timer: tx EEE timer;
|
||||
chain_mode: select chain mode instead of ring.
|
||||
|
||||
3) Command line options
|
||||
Driver parameters can be also passed in command line by using:
|
||||
stmmaceth=watchdog:100,chain_mode=1
|
||||
|
||||
4) Driver information and notes
|
||||
|
||||
4.1) Transmit process
|
||||
The xmit method is invoked when the kernel needs to transmit a packet; it sets
|
||||
the descriptors in the ring and informs the DMA engine, that there is a packet
|
||||
ready to be transmitted.
|
||||
By default, the driver sets the NETIF_F_SG bit in the features field of the
|
||||
net_device structure, enabling the scatter-gather feature. This is true on
|
||||
chips and configurations where the checksum can be done in hardware.
|
||||
Once the controller has finished transmitting the packet, timer will be
|
||||
scheduled to release the transmit resources.
|
||||
|
||||
4.2) Receive process
|
||||
When one or more packets are received, an interrupt happens. The interrupts
|
||||
are not queued, so the driver has to scan all the descriptors in the ring during
|
||||
the receive process.
|
||||
This is based on NAPI, so the interrupt handler signals only if there is work
|
||||
to be done, and it exits.
|
||||
Then the poll method will be scheduled at some future point.
|
||||
The incoming packets are stored, by the DMA, in a list of pre-allocated socket
|
||||
buffers in order to avoid the memcpy (zero-copy).
|
||||
|
||||
4.3) Interrupt mitigation
|
||||
The driver is able to mitigate the number of its DMA interrupts
|
||||
using NAPI for the reception on chips older than the 3.50.
|
||||
New chips have an HW RX-Watchdog used for this mitigation.
|
||||
Mitigation parameters can be tuned by ethtool.
|
||||
|
||||
4.4) WOL
|
||||
Wake up on Lan feature through Magic and Unicast frames are supported for the
|
||||
GMAC core.
|
||||
|
||||
4.5) DMA descriptors
|
||||
Driver handles both normal and alternate descriptors. The latter has been only
|
||||
tested on DWC Ether MAC 10/100/1000 Universal version 3.41a and later.
|
||||
|
||||
STMMAC supports DMA descriptor to operate both in dual buffer (RING)
|
||||
and linked-list(CHAINED) mode. In RING each descriptor points to two
|
||||
data buffer pointers whereas in CHAINED mode they point to only one data
|
||||
buffer pointer. RING mode is the default.
|
||||
|
||||
In CHAINED mode each descriptor will have pointer to next descriptor in
|
||||
the list, hence creating the explicit chaining in the descriptor itself,
|
||||
whereas such explicit chaining is not possible in RING mode.
|
||||
|
||||
4.5.1) Extended descriptors
|
||||
The extended descriptors give us information about the Ethernet payload
|
||||
when it is carrying PTP packets or TCP/UDP/ICMP over IP.
|
||||
These are not available on GMAC Synopsys chips older than the 3.50.
|
||||
At probe time the driver will decide if these can be actually used.
|
||||
This support also is mandatory for PTPv2 because the extra descriptors
|
||||
are used for saving the hardware timestamps and Extended Status.
|
||||
|
||||
4.6) Ethtool support
|
||||
Ethtool is supported.
|
||||
|
||||
For example, driver statistics (including RMON), internal errors can be taken
|
||||
using:
|
||||
# ethtool -S ethX
|
||||
command
|
||||
|
||||
4.7) Jumbo and Segmentation Offloading
|
||||
Jumbo frames are supported and tested for the GMAC.
|
||||
The GSO has been also added but it's performed in software.
|
||||
LRO is not supported.
|
||||
|
||||
4.8) Physical
|
||||
The driver is compatible with Physical Abstraction Layer to be connected with
|
||||
PHY and GPHY devices.
|
||||
|
||||
4.9) Platform information
|
||||
Several information can be passed through the platform and device-tree.
|
||||
|
||||
struct plat_stmmacenet_data {
|
||||
char *phy_bus_name;
|
||||
int bus_id;
|
||||
int phy_addr;
|
||||
int interface;
|
||||
struct stmmac_mdio_bus_data *mdio_bus_data;
|
||||
struct stmmac_dma_cfg *dma_cfg;
|
||||
int clk_csr;
|
||||
int has_gmac;
|
||||
int enh_desc;
|
||||
int tx_coe;
|
||||
int rx_coe;
|
||||
int bugged_jumbo;
|
||||
int pmt;
|
||||
int force_sf_dma_mode;
|
||||
int force_thresh_dma_mode;
|
||||
int riwt_off;
|
||||
int max_speed;
|
||||
int maxmtu;
|
||||
void (*fix_mac_speed)(void *priv, unsigned int speed);
|
||||
void (*bus_setup)(void __iomem *ioaddr);
|
||||
int (*init)(struct platform_device *pdev, void *priv);
|
||||
void (*exit)(struct platform_device *pdev, void *priv);
|
||||
void *bsp_priv;
|
||||
int has_gmac4;
|
||||
bool tso_en;
|
||||
};
|
||||
|
||||
Where:
|
||||
o phy_bus_name: phy bus name to attach to the stmmac.
|
||||
o bus_id: bus identifier.
|
||||
o phy_addr: the physical address can be passed from the platform.
|
||||
If it is set to -1 the driver will automatically
|
||||
detect it at run-time by probing all the 32 addresses.
|
||||
o interface: PHY device's interface.
|
||||
o mdio_bus_data: specific platform fields for the MDIO bus.
|
||||
o dma_cfg: internal DMA parameters
|
||||
o pbl: the Programmable Burst Length is maximum number of beats to
|
||||
be transferred in one DMA transaction.
|
||||
GMAC also enables the 4xPBL by default. (8xPBL for GMAC 3.50 and newer)
|
||||
o txpbl/rxpbl: GMAC and newer supports independent DMA pbl for tx/rx.
|
||||
o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default.
|
||||
o fixed_burst/mixed_burst/aal
|
||||
o clk_csr: fixed CSR Clock range selection.
|
||||
o has_gmac: uses the GMAC core.
|
||||
o enh_desc: if set, the MAC will use the enhanced descriptor structure.
|
||||
o tx_coe: core is able to perform the tx csum in HW.
|
||||
o rx_coe: the core supports three checksum offloading engine types:
|
||||
type_1, type_2 (full csum) and no RX coe.
|
||||
o bugged_jumbo: some HWs are not able to perform the csum in HW for
|
||||
over-sized frames due to limited buffer sizes.
|
||||
Setting this flag the csum will be done in SW on
|
||||
JUMBO frames.
|
||||
o pmt: core has the embedded power module (optional).
|
||||
o force_sf_dma_mode: force DMA to use the Store and Forward mode
|
||||
instead of the Threshold.
|
||||
o force_thresh_dma_mode: force DMA to use the Threshold mode other than
|
||||
the Store and Forward mode.
|
||||
o riwt_off: force to disable the RX watchdog feature and switch to NAPI mode.
|
||||
o fix_mac_speed: this callback is used for modifying some syscfg registers
|
||||
(on ST SoCs) according to the link speed negotiated by the
|
||||
physical layer.
|
||||
o bus_setup: perform HW setup of the bus. For example, on some ST platforms
|
||||
this field is used to configure the AMBA bridge to generate more
|
||||
efficient STBus traffic.
|
||||
o init/exit: callbacks used for calling a custom initialization;
|
||||
this is sometimes necessary on some platforms (e.g. ST boxes)
|
||||
where the HW needs to have set some PIO lines or system cfg
|
||||
registers. init/exit callbacks should not use or modify
|
||||
platform data.
|
||||
o bsp_priv: another private pointer.
|
||||
o has_gmac4: uses GMAC4 core.
|
||||
o tso_en: Enables TSO (TCP Segmentation Offload) feature.
|
||||
|
||||
For the MDIO bus we have:
|
||||
|
||||
struct stmmac_mdio_bus_data {
|
||||
int (*phy_reset)(void *priv);
|
||||
unsigned int phy_mask;
|
||||
int *irqs;
|
||||
int probed_phy_irq;
|
||||
};
|
||||
|
||||
Where:
|
||||
o phy_reset: hook to reset the phy device attached to the bus.
|
||||
o phy_mask: phy mask passed when registering the MDIO bus within the driver.
|
||||
o irqs: list of IRQs, one per PHY.
|
||||
o probed_phy_irq: if irqs is NULL, use this for probed PHY.
|
||||
|
||||
For DMA engine we have the following internal fields that should be
|
||||
tuned according to the HW capabilities.
|
||||
|
||||
struct stmmac_dma_cfg {
|
||||
int pbl;
|
||||
int txpbl;
|
||||
int rxpbl;
|
||||
bool pblx8;
|
||||
int fixed_burst;
|
||||
int mixed_burst;
|
||||
bool aal;
|
||||
};
|
||||
|
||||
Where:
|
||||
o pbl: Programmable Burst Length (tx and rx)
|
||||
o txpbl: Transmit Programmable Burst Length. Only for GMAC and newer.
|
||||
If set, DMA tx will use this value rather than pbl.
|
||||
o rxpbl: Receive Programmable Burst Length. Only for GMAC and newer.
|
||||
If set, DMA rx will use this value rather than pbl.
|
||||
o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default.
|
||||
o fixed_burst: program the DMA to use the fixed burst mode
|
||||
o mixed_burst: program the DMA to use the mixed burst mode
|
||||
o aal: Address-Aligned Beats
|
||||
|
||||
---
|
||||
|
||||
Below is an example of how the structures above are used on ST platforms.
|
||||
|
||||
static struct plat_stmmacenet_data stxYYY_ethernet_platform_data = {
|
||||
.has_gmac = 0,
|
||||
.enh_desc = 0,
|
||||
.fix_mac_speed = stxYYY_ethernet_fix_mac_speed,
|
||||
|
|
||||
|-> to write an internal syscfg
|
||||
| on this platform when the
|
||||
| link speed changes from 10 to
|
||||
| 100 and vice versa
|
||||
.init = &stmmac_claim_resource,
|
||||
|
|
||||
|-> On ST SoC this calls own "PAD"
|
||||
| manager framework to claim
|
||||
| all the resources necessary
|
||||
| (GPIO ...). The .custom_cfg field
|
||||
| is used to pass a custom config.
|
||||
};
|
||||
|
||||
Below the usage of the stmmac_mdio_bus_data: on this SoC, in fact,
|
||||
there are two MAC cores: one MAC is for MDIO Bus/PHY emulation
|
||||
with fixed_link support.
|
||||
|
||||
static struct stmmac_mdio_bus_data stmmac1_mdio_bus = {
|
||||
.phy_reset = phy_reset;
|
||||
|
|
||||
|-> function to provide the phy_reset on this board
|
||||
.phy_mask = 0,
|
||||
};
|
||||
|
||||
static struct fixed_phy_status stmmac0_fixed_phy_status = {
|
||||
.link = 1,
|
||||
.speed = 100,
|
||||
.duplex = 1,
|
||||
};
|
||||
|
||||
During the board's device_init we can configure the first
|
||||
MAC for fixed_link by calling:
|
||||
fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status);
|
||||
and the second one, with a real PHY device attached to the bus,
|
||||
by using the stmmac_mdio_bus_data structure (to provide the id, the
|
||||
reset procedure etc).
|
||||
|
||||
Note that, starting from new chips, where it is available the HW capability
|
||||
register, many configurations are discovered at run-time for example to
|
||||
understand if EEE, HW csum, PTP, enhanced descriptor etc are actually
|
||||
available. As strategy adopted in this driver, the information from the HW
|
||||
capability register can replace what has been passed from the platform.
|
||||
|
||||
4.10) Device-tree support.
|
||||
|
||||
Please see the following document:
|
||||
Documentation/devicetree/bindings/net/stmmac.txt
|
||||
|
||||
4.11) This is a summary of the content of some relevant files:
|
||||
o stmmac_main.c: implements the main network device driver;
|
||||
o stmmac_mdio.c: provides MDIO functions;
|
||||
o stmmac_pci.c: this is the PCI driver;
|
||||
o stmmac_platform.c: this is the platform driver (OF supported);
|
||||
o stmmac_ethtool.c: implements the ethtool support;
|
||||
o stmmac.h: private driver structure;
|
||||
o common.h: common definitions and VFTs;
|
||||
o mmc_core.c/mmc.h: Management MAC Counters;
|
||||
o stmmac_hwtstamp.c: HW timestamp support for PTP;
|
||||
o stmmac_ptp.c: PTP 1588 clock;
|
||||
o stmmac_pcs.h: Physical Coding Sublayer common implementation;
|
||||
o dwmac-<XXX>.c: these are for the platform glue-logic file; e.g. dwmac-sti.c
|
||||
for STMicroelectronics SoCs.
|
||||
|
||||
- GMAC 3.x
|
||||
o descs.h: descriptor structure definitions;
|
||||
o dwmac1000_core.c: dwmac GiGa core functions;
|
||||
o dwmac1000_dma.c: dma functions for the GMAC chip;
|
||||
o dwmac1000.h: specific header file for the dwmac GiGa;
|
||||
o dwmac100_core.c: dwmac 100 core code;
|
||||
o dwmac100_dma.c: dma functions for the dwmac 100 chip;
|
||||
o dwmac100.h: specific header file for the MAC;
|
||||
o dwmac_lib.c: generic DMA functions;
|
||||
o enh_desc.c: functions for handling enhanced descriptors;
|
||||
o norm_desc.c: functions for handling normal descriptors;
|
||||
o chain_mode.c/ring_mode.c: functions to manage RING/CHAINED modes;
|
||||
|
||||
- GMAC4.x generation
|
||||
o dwmac4_core.c: dwmac GMAC4.x core functions;
|
||||
o dwmac4_desc.c: functions for handling GMAC4.x descriptors;
|
||||
o dwmac4_descs.h: descriptor definitions;
|
||||
o dwmac4_dma.c: dma functions for the GMAC4.x chip;
|
||||
o dwmac4_dma.h: dma definitions for the GMAC4.x chip;
|
||||
o dwmac4.h: core definitions for the GMAC4.x chip;
|
||||
o dwmac4_lib.c: generic GMAC4.x functions;
|
||||
|
||||
4.12) TSO support (GMAC4.x)
|
||||
|
||||
TSO (Tcp Segmentation Offload) feature is supported by GMAC 4.x chip family.
|
||||
When a packet is sent through TCP protocol, the TCP stack ensures that
|
||||
the SKB provided to the low level driver (stmmac in our case) matches with
|
||||
the maximum frame len (IP header + TCP header + payload <= 1500 bytes (for
|
||||
MTU set to 1500)). It means that if an application using TCP wants to send a
|
||||
packet which will have a length (after adding headers) > 1514 the packet
|
||||
will be split in several TCP packets: The data payload is split and headers
|
||||
(TCP/IP ..) are added. It is done by software.
|
||||
|
||||
When TSO is enabled, the TCP stack doesn't care about the maximum frame
|
||||
length and provides the SKB packet to stmmac as it is. The GMAC IP will have to
|
||||
perform the segmentation by itself to match the maximum frame length.
|
||||
|
||||
This feature can be enabled in device tree through "snps,tso" entry.
|
||||
|
||||
5) Debug Information
|
||||
|
||||
The driver exports a lot of information, i.e. internal statistics,
|
||||
debug information, MAC and DMA registers etc.
|
||||
|
||||
These can be read in several ways depending on the
|
||||
type of the information actually needed.
|
||||
|
||||
For example, a user can use the ethtool support
|
||||
to get statistics: e.g. using: ethtool -S ethX
|
||||
(that shows the Management counters (MMC) if supported)
|
||||
or see the MAC/DMA registers: e.g. using: ethtool -d ethX
|
||||
|
||||
Compiling the Kernel with CONFIG_DEBUG_FS the driver will export the following
|
||||
debugfs entries:
|
||||
|
||||
/sys/kernel/debug/stmmaceth/descriptors_status
|
||||
To show the DMA TX/RX descriptor rings
|
||||
|
||||
Developer can also use the "debug" module parameter to get further debug
|
||||
information (please see: NETIF Msg Level).
|
||||
|
||||
6) Energy Efficient Ethernet
|
||||
|
||||
Energy Efficient Ethernet(EEE) enables IEEE 802.3 MAC sublayer along
|
||||
with a family of Physical layer to operate in the Low power Idle(LPI)
|
||||
mode. The EEE mode supports the IEEE 802.3 MAC operation at 100Mbps,
|
||||
1000Mbps & 10Gbps.
|
||||
|
||||
The LPI mode allows power saving by switching off parts of the
|
||||
communication device functionality when there is no data to be
|
||||
transmitted & received. The system on both the side of the link can
|
||||
disable some functionalities & save power during the period of low-link
|
||||
utilization. The MAC controls whether the system should enter or exit
|
||||
the LPI mode & communicate this to PHY.
|
||||
|
||||
As soon as the interface is opened, the driver verifies if the EEE can
|
||||
be supported. This is done by looking at both the DMA HW capability
|
||||
register and the PHY device's MMD registers.
|
||||
To enter in Tx LPI mode the driver needs to have a software timer
|
||||
that enables and disables the LPI mode when there is nothing to be
|
||||
transmitted.
|
||||
|
||||
7) Precision Time Protocol (PTP)
|
||||
The driver supports the IEEE 1588-2002, Precision Time Protocol (PTP),
|
||||
which enables precise synchronization of clocks in measurement and
|
||||
control systems implemented with technologies such as network
|
||||
communication.
|
||||
|
||||
In addition to the basic timestamp features mentioned in IEEE 1588-2002
|
||||
Timestamps, new GMAC cores support the advanced timestamp features.
|
||||
IEEE 1588-2008 that can be enabled when configuring the kernel.
|
||||
|
||||
8) SGMII/RGMII support
|
||||
New GMAC devices provide their own way to manage RGMII/SGMII.
|
||||
This information is available at run-time by looking at the
|
||||
HW capability register. This means that the stmmac can manage
|
||||
auto-negotiation and link status w/o using the PHYLIB stuff.
|
||||
In fact, the HW provides a subset of extended registers to
|
||||
restart the ANE, verify Full/Half duplex mode and Speed.
|
||||
Thanks to these registers, it is possible to look at the
|
||||
Auto-negotiated Link Partner Ability.
|
@ -39,7 +39,7 @@ but without enabling "switch" mode, or to different bridges.
|
||||
|
||||
Devlink configuration parameters
|
||||
====================
|
||||
See Documentation/networking/devlink-params-ti-cpsw-switch.txt
|
||||
See Documentation/networking/devlink/ti-cpsw-switch.rst
|
||||
|
||||
====================
|
||||
# Bridging in dual mac mode
|
||||
|
@ -1,86 +0,0 @@
|
||||
The health mechanism is targeted for Real Time Alerting, in order to know when
|
||||
something bad had happened to a PCI device
|
||||
- Provide alert debug information
|
||||
- Self healing
|
||||
- If problem needs vendor support, provide a way to gather all needed debugging
|
||||
information.
|
||||
|
||||
The main idea is to unify and centralize driver health reports in the
|
||||
generic devlink instance and allow the user to set different
|
||||
attributes of the health reporting and recovery procedures.
|
||||
|
||||
The devlink health reporter:
|
||||
Device driver creates a "health reporter" per each error/health type.
|
||||
Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error)
|
||||
or unknown (driver specific).
|
||||
For each registered health reporter a driver can issue error/health reports
|
||||
asynchronously. All health reports handling is done by devlink.
|
||||
Device driver can provide specific callbacks for each "health reporter", e.g.
|
||||
- Recovery procedures
|
||||
- Diagnostics and object dump procedures
|
||||
- OOB initial parameters
|
||||
Different parts of the driver can register different types of health reporters
|
||||
with different handlers.
|
||||
|
||||
Once an error is reported, devlink health will do the following actions:
|
||||
* A log is being sent to the kernel trace events buffer
|
||||
* Health status and statistics are being updated for the reporter instance
|
||||
* Object dump is being taken and saved at the reporter instance (as long as
|
||||
there is no other dump which is already stored)
|
||||
* Auto recovery attempt is being done. Depends on:
|
||||
- Auto-recovery configuration
|
||||
- Grace period vs. time passed since last recover
|
||||
|
||||
The user interface:
|
||||
User can access/change each reporter's parameters and driver specific callbacks
|
||||
via devlink, e.g per error type (per health reporter)
|
||||
- Configure reporter's generic parameters (like: disable/enable auto recovery)
|
||||
- Invoke recovery procedure
|
||||
- Run diagnostics
|
||||
- Object dump
|
||||
|
||||
The devlink health interface (via netlink):
|
||||
DEVLINK_CMD_HEALTH_REPORTER_GET
|
||||
Retrieves status and configuration info per DEV and reporter.
|
||||
DEVLINK_CMD_HEALTH_REPORTER_SET
|
||||
Allows reporter-related configuration setting.
|
||||
DEVLINK_CMD_HEALTH_REPORTER_RECOVER
|
||||
Triggers a reporter's recovery procedure.
|
||||
DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE
|
||||
Retrieves diagnostics data from a reporter on a device.
|
||||
DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET
|
||||
Retrieves the last stored dump. Devlink health
|
||||
saves a single dump. If a dump is not already stored by the devlink
|
||||
for this reporter, devlink generates a new dump.
|
||||
dump output is defined by the reporter.
|
||||
DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR
|
||||
Clears the last saved dump file for the specified reporter.
|
||||
|
||||
|
||||
netlink
|
||||
+--------------------------+
|
||||
| |
|
||||
| + |
|
||||
| | |
|
||||
+--------------------------+
|
||||
|request for ops
|
||||
|(diagnose,
|
||||
mlx5_core devlink |recover,
|
||||
|dump)
|
||||
+--------+ +--------------------------+
|
||||
| | | reporter| |
|
||||
| | | +---------v----------+ |
|
||||
| | ops execution | | | |
|
||||
| <----------------------------------+ | |
|
||||
| | | | | |
|
||||
| | | + ^------------------+ |
|
||||
| | | | request for ops |
|
||||
| | | | (recover, dump) |
|
||||
| | | | |
|
||||
| | | +-+------------------+ |
|
||||
| | health report | | health handler | |
|
||||
| +-------------------------------> | |
|
||||
| | | +--------------------+ |
|
||||
| | health reporter create | |
|
||||
| +----------------------------> |
|
||||
+--------+ +--------------------------+
|
@ -1,64 +0,0 @@
|
||||
.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
|
||||
=====================
|
||||
Devlink info versions
|
||||
=====================
|
||||
|
||||
board.id
|
||||
========
|
||||
|
||||
Unique identifier of the board design.
|
||||
|
||||
board.rev
|
||||
=========
|
||||
|
||||
Board design revision.
|
||||
|
||||
asic.id
|
||||
=======
|
||||
|
||||
ASIC design identifier.
|
||||
|
||||
asic.rev
|
||||
========
|
||||
|
||||
ASIC design revision.
|
||||
|
||||
board.manufacture
|
||||
=================
|
||||
|
||||
An identifier of the company or the facility which produced the part.
|
||||
|
||||
fw
|
||||
==
|
||||
|
||||
Overall firmware version, often representing the collection of
|
||||
fw.mgmt, fw.app, etc.
|
||||
|
||||
fw.mgmt
|
||||
=======
|
||||
|
||||
Control unit firmware version. This firmware is responsible for house
|
||||
keeping tasks, PHY control etc. but not the packet-by-packet data path
|
||||
operation.
|
||||
|
||||
fw.app
|
||||
======
|
||||
|
||||
Data path microcode controlling high-speed packet processing.
|
||||
|
||||
fw.undi
|
||||
=======
|
||||
|
||||
UNDI software, may include the UEFI driver, firmware or both.
|
||||
|
||||
fw.ncsi
|
||||
=======
|
||||
|
||||
Version of the software responsible for supporting/handling the
|
||||
Network Controller Sideband Interface.
|
||||
|
||||
fw.psid
|
||||
=======
|
||||
|
||||
Unique identifier of the firmware parameter set.
|
@ -1,18 +0,0 @@
|
||||
enable_sriov [DEVICE, GENERIC]
|
||||
Configuration mode: Permanent
|
||||
|
||||
ignore_ari [DEVICE, GENERIC]
|
||||
Configuration mode: Permanent
|
||||
|
||||
msix_vec_per_pf_max [DEVICE, GENERIC]
|
||||
Configuration mode: Permanent
|
||||
|
||||
msix_vec_per_pf_min [DEVICE, GENERIC]
|
||||
Configuration mode: Permanent
|
||||
|
||||
gre_ver_check [DEVICE, DRIVER-SPECIFIC]
|
||||
Generic Routing Encapsulation (GRE) version check will
|
||||
be enabled in the device. If disabled, device skips
|
||||
version checking for incoming packets.
|
||||
Type: Boolean
|
||||
Configuration mode: Permanent
|
@ -1,17 +0,0 @@
|
||||
flow_steering_mode [DEVICE, DRIVER-SPECIFIC]
|
||||
Controls the flow steering mode of the driver.
|
||||
Two modes are supported:
|
||||
1. 'dmfs' - Device managed flow steering.
|
||||
2. 'smfs' - Software/Driver managed flow steering.
|
||||
In DMFS mode, the HW steering entities are created and
|
||||
managed through the Firmware.
|
||||
In SMFS mode, the HW steering entities are created and
|
||||
managed by the driver directly in hardware
|
||||
without firmware intervention.
|
||||
Type: String
|
||||
Configuration mode: runtime
|
||||
|
||||
enable_roce [DEVICE, GENERIC]
|
||||
Enable handling of RoCE traffic in the device.
|
||||
Enabled by default.
|
||||
Configuration mode: driverinit
|
@ -1,10 +0,0 @@
|
||||
fw_load_policy [DEVICE, GENERIC]
|
||||
Configuration mode: driverinit
|
||||
|
||||
acl_region_rehash_interval [DEVICE, DRIVER-SPECIFIC]
|
||||
Sets an interval for periodic ACL region rehashes.
|
||||
The value is in milliseconds, minimal value is "3000".
|
||||
Value "0" disables the periodic work.
|
||||
The first rehash will be run right after value is set.
|
||||
Type: u32
|
||||
Configuration mode: runtime
|
@ -1,7 +0,0 @@
|
||||
ATU_hash [DEVICE, DRIVER-SPECIFIC]
|
||||
Select one of four possible hashing algorithms for
|
||||
MAC addresses in the Address Translation Unit.
|
||||
A value of 3 seems to work better than the default of
|
||||
1 when many MAC addresses have the same OUI.
|
||||
Configuration mode: runtime
|
||||
Type: u8. 0-3 valid.
|
@ -1,5 +0,0 @@
|
||||
fw_load_policy [DEVICE, GENERIC]
|
||||
Configuration mode: permanent
|
||||
|
||||
reset_dev_on_drv_probe [DEVICE, GENERIC]
|
||||
Configuration mode: permanent
|
@ -1,10 +0,0 @@
|
||||
ale_bypass [DEVICE, DRIVER-SPECIFIC]
|
||||
Allows to enable ALE_CONTROL(4).BYPASS mode for debug purposes.
|
||||
All packets will be sent to the Host port only if enabled.
|
||||
Type: bool
|
||||
Configuration mode: runtime
|
||||
|
||||
switch_mode [DEVICE, DRIVER-SPECIFIC]
|
||||
Enable switch mode
|
||||
Type: bool
|
||||
Configuration mode: runtime
|
@ -1,71 +0,0 @@
|
||||
Devlink configuration parameters
|
||||
================================
|
||||
Following is the list of configuration parameters via devlink interface.
|
||||
Each parameter can be generic or driver specific and are device level
|
||||
parameters.
|
||||
|
||||
Note that the driver-specific files should contain the generic params
|
||||
they support, along with the supported config modes.
|
||||
|
||||
Each parameter can be set in different configuration modes:
|
||||
runtime - set while driver is running, no reset required.
|
||||
driverinit - applied while driver initializes, requires restart
|
||||
driver by devlink reload command.
|
||||
permanent - written to device's non-volatile memory, hard reset
|
||||
required.
|
||||
|
||||
Following is the list of parameters:
|
||||
====================================
|
||||
enable_sriov [DEVICE, GENERIC]
|
||||
Enable Single Root I/O Virtualisation (SRIOV) in
|
||||
the device.
|
||||
Type: Boolean
|
||||
|
||||
ignore_ari [DEVICE, GENERIC]
|
||||
Ignore Alternative Routing-ID Interpretation (ARI)
|
||||
capability. If enabled, adapter will ignore ARI
|
||||
capability even when the platform has the support
|
||||
enabled and creates same number of partitions when
|
||||
platform does not support ARI.
|
||||
Type: Boolean
|
||||
|
||||
msix_vec_per_pf_max [DEVICE, GENERIC]
|
||||
Provides the maximum number of MSIX interrupts that
|
||||
a device can create. Value is same across all
|
||||
physical functions (PFs) in the device.
|
||||
Type: u32
|
||||
|
||||
msix_vec_per_pf_min [DEVICE, GENERIC]
|
||||
Provides the minimum number of MSIX interrupts required
|
||||
for the device initialization. Value is same across all
|
||||
physical functions (PFs) in the device.
|
||||
Type: u32
|
||||
|
||||
fw_load_policy [DEVICE, GENERIC]
|
||||
Controls the device's firmware loading policy.
|
||||
Valid values:
|
||||
* DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER (0)
|
||||
Load firmware version preferred by the driver.
|
||||
* DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH (1)
|
||||
Load firmware currently stored in flash.
|
||||
* DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK (2)
|
||||
Load firmware currently available on host's disk.
|
||||
Type: u8
|
||||
|
||||
reset_dev_on_drv_probe [DEVICE, GENERIC]
|
||||
Controls the device's reset policy on driver probe.
|
||||
Valid values:
|
||||
* DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_UNKNOWN (0)
|
||||
Unknown or invalid value.
|
||||
* DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_ALWAYS (1)
|
||||
Always reset device on driver probe.
|
||||
* DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_NEVER (2)
|
||||
Never reset device on driver probe.
|
||||
* DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_DISK (3)
|
||||
Reset only if device firmware can be found in the
|
||||
filesystem.
|
||||
Type: u8
|
||||
|
||||
enable_roce [DEVICE, GENERIC]
|
||||
Enable handling of RoCE traffic in the device.
|
||||
Type: Boolean
|
@ -1,20 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
======================
|
||||
Devlink Trap netdevsim
|
||||
======================
|
||||
|
||||
Driver-specific Traps
|
||||
=====================
|
||||
|
||||
.. list-table:: List of Driver-specific Traps Registered by ``netdevsim``
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``fid_miss``
|
||||
- ``exception``
|
||||
- When a packet enters the device it is classified to a filtering
|
||||
identifier (FID) based on the ingress port and VLAN. This trap is used
|
||||
to trap packets for which a FID could not be found
|
74
Documentation/networking/devlink/bnxt.rst
Normal file
74
Documentation/networking/devlink/bnxt.rst
Normal file
@ -0,0 +1,74 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================
|
||||
bnxt devlink support
|
||||
====================
|
||||
|
||||
This document describes the devlink features implemented by the ``bnxt``
|
||||
device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
.. list-table:: Generic parameters implemented
|
||||
|
||||
* - Name
|
||||
- Mode
|
||||
* - ``enable_sriov``
|
||||
- Permanent
|
||||
* - ``ignore_ari``
|
||||
- Permanent
|
||||
* - ``msix_vec_per_pf_max``
|
||||
- Permanent
|
||||
* - ``msix_vec_per_pf_min``
|
||||
- Permanent
|
||||
|
||||
The ``bnxt`` driver also implements the following driver-specific
|
||||
parameters.
|
||||
|
||||
.. list-table:: Driver-specific parameters implemented
|
||||
:widths: 5 5 5 85
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Mode
|
||||
- Description
|
||||
* - ``gre_ver_check``
|
||||
- Boolean
|
||||
- Permanent
|
||||
- Generic Routing Encapsulation (GRE) version check will be enabled in
|
||||
the device. If disabled, the device will skip the version check for
|
||||
incoming packets.
|
||||
|
||||
Info versions
|
||||
=============
|
||||
|
||||
The ``bnxt_en`` driver reports the following versions
|
||||
|
||||
.. list-table:: devlink info versions implemented
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``asic.id``
|
||||
- fixed
|
||||
- ASIC design identifier
|
||||
* - ``asic.rev``
|
||||
- fixed
|
||||
- ASIC design revision
|
||||
* - ``fw.psid``
|
||||
- stored, running
|
||||
- Firmware parameter set version of the board
|
||||
* - ``fw``
|
||||
- stored, running
|
||||
- Overall board firmware version
|
||||
* - ``fw.app``
|
||||
- stored, running
|
||||
- Data path firmware version
|
||||
* - ``fw.mgmt``
|
||||
- stored, running
|
||||
- Management firmware version
|
||||
* - ``fw.roce``
|
||||
- stored, running
|
||||
- RoCE management firmware version
|
252
Documentation/networking/devlink/devlink-dpipe.rst
Normal file
252
Documentation/networking/devlink/devlink-dpipe.rst
Normal file
@ -0,0 +1,252 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=============
|
||||
Devlink DPIPE
|
||||
=============
|
||||
|
||||
Background
|
||||
==========
|
||||
|
||||
While performing the hardware offloading process, much of the hardware
|
||||
specifics cannot be presented. These details are useful for debugging, and
|
||||
``devlink-dpipe`` provides a standardized way to provide visibility into the
|
||||
offloading process.
|
||||
|
||||
For example, the routing longest prefix match (LPM) algorithm used by the
|
||||
Linux kernel may differ from the hardware implementation. The pipeline debug
|
||||
API (DPIPE) is aimed at providing the user visibility into the ASIC's
|
||||
pipeline in a generic way.
|
||||
|
||||
The hardware offload process is expected to be done in a way that the user
|
||||
should not be able to distinguish between the hardware vs. software
|
||||
implementation. In this process, hardware specifics are neglected. In
|
||||
reality those details can have lots of meaning and should be exposed in some
|
||||
standard way.
|
||||
|
||||
This problem is made even more complex when one wishes to offload the
|
||||
control path of the whole networking stack to a switch ASIC. Due to
|
||||
differences in the hardware and software models some processes cannot be
|
||||
represented correctly.
|
||||
|
||||
One example is the kernel's LPM algorithm which in many cases differs
|
||||
greatly from the hardware implementation. The configuration API is the same,
|
||||
but one cannot rely on the Forward Information Base (FIB) to look like the
|
||||
Level Path Compression trie (LPC-trie) in hardware.
|
||||
|
||||
In many situations trying to analyze systems failure solely based on the
|
||||
kernel's dump may not be enough. By combining this data with complementary
|
||||
information about the underlying hardware, this debugging can be made
|
||||
easier; additionally, the information can be useful when debugging
|
||||
performance issues.
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
The ``devlink-dpipe`` interface closes this gap. The hardware's pipeline is
|
||||
modeled as a graph of match/action tables. Each table represents a specific
|
||||
hardware block. This model is not new, first being used by the P4 language.
|
||||
|
||||
Traditionally it has been used as an alternative model for hardware
|
||||
configuration, but the ``devlink-dpipe`` interface uses it for visibility
|
||||
purposes as a standard complementary tool. The system's view from
|
||||
``devlink-dpipe`` should change according to the changes done by the
|
||||
standard configuration tools.
|
||||
|
||||
For example, it’s quite common to implement Access Control Lists (ACL)
|
||||
using Ternary Content Addressable Memory (TCAM). The TCAM memory can be
|
||||
divided into TCAM regions. Complex TC filters can have multiple rules with
|
||||
different priorities and different lookup keys. On the other hand hardware
|
||||
TCAM regions have a predefined lookup key. Offloading the TC filter rules
|
||||
using TCAM engine can result in multiple TCAM regions being interconnected
|
||||
in a chain (which may affect the data path latency). In response to a new TC
|
||||
filter new tables should be created describing those regions.
|
||||
|
||||
Model
|
||||
=====
|
||||
|
||||
The ``DPIPE`` model introduces several objects:
|
||||
|
||||
* headers
|
||||
* tables
|
||||
* entries
|
||||
|
||||
A ``header`` describes packet formats and provides names for fields within
|
||||
the packet. A ``table`` describes hardware blocks. An ``entry`` describes
|
||||
the actual content of a specific table.
|
||||
|
||||
The hardware pipeline is not port specific, but rather describes the whole
|
||||
ASIC. Thus it is tied to the top of the ``devlink`` infrastructure.
|
||||
|
||||
Drivers can register and unregister tables at run time, in order to support
|
||||
dynamic behavior. This dynamic behavior is mandatory for describing hardware
|
||||
blocks like TCAM regions which can be allocated and freed dynamically.
|
||||
|
||||
``devlink-dpipe`` generally is not intended for configuration. The exception
|
||||
is hardware counting for a specific table.
|
||||
|
||||
The following commands are used to obtain the ``dpipe`` objects from
|
||||
userspace:
|
||||
|
||||
* ``table_get``: Receive a table's description.
|
||||
* ``headers_get``: Receive a device's supported headers.
|
||||
* ``entries_get``: Receive a table's current entries.
|
||||
* ``counters_set``: Enable or disable counters on a table.
|
||||
|
||||
Table
|
||||
-----
|
||||
|
||||
The driver should implement the following operations for each table:
|
||||
|
||||
* ``matches_dump``: Dump the supported matches.
|
||||
* ``actions_dump``: Dump the supported actions.
|
||||
* ``entries_dump``: Dump the actual content of the table.
|
||||
* ``counters_set_update``: Synchronize hardware with counters enabled or
|
||||
disabled.
|
||||
|
||||
Header/Field
|
||||
------------
|
||||
|
||||
In a similar way to P4, headers and fields are used to describe a table's
|
||||
behavior. There is a slight difference between the standard protocol headers
|
||||
and specific ASIC metadata. The protocol headers should be declared in the
|
||||
``devlink`` core API. On the other hand ASIC metadata is driver specific
|
||||
and should be defined in the driver. Additionally, each driver-specific
|
||||
devlink documentation file should document the driver-specific ``dpipe``
|
||||
headers it implements. The headers and fields are identified by enumeration.
|
||||
|
||||
In order to provide further visibility some ASIC metadata fields could be
|
||||
mapped to kernel objects. For example, internal router interface indexes can
|
||||
be directly mapped to the net device ifindex. FIB table indexes used by
|
||||
different Virtual Routing and Forwarding (VRF) tables can be mapped to
|
||||
internal routing table indexes.
|
||||
|
||||
Match
|
||||
-----
|
||||
|
||||
Matches are kept primitive and close to hardware operation. Match types like
|
||||
LPM are not supported due to the fact that this is exactly a process we wish
|
||||
to describe in full detail. Example of matches:
|
||||
|
||||
* ``field_exact``: Exact match on a specific field.
|
||||
* ``field_exact_mask``: Exact match on a specific field after masking.
|
||||
* ``field_range``: Match on a specific range.
|
||||
|
||||
The ids of the header and the field should be specified in order to
|
||||
identify the specific field. Furthermore, the header index should be
|
||||
specified in order to distinguish multiple headers of the same type in a
|
||||
packet (tunneling).
|
||||
|
||||
Action
|
||||
------
|
||||
|
||||
Similar to match, the actions are kept primitive and close to hardware
|
||||
operation. For example:
|
||||
|
||||
* ``field_modify``: Modify the field value.
|
||||
* ``field_inc``: Increment the field value.
|
||||
* ``push_header``: Add a header.
|
||||
* ``pop_header``: Remove a header.
|
||||
|
||||
Entry
|
||||
-----
|
||||
|
||||
Entries of a specific table can be dumped on demand. Each entry is
|
||||
identified with an index and its properties are described by a list of
|
||||
match/action values and a specific counter. By dumping the tables' content the
|
||||
interactions between tables can be resolved.
|
||||
|
||||
Abstraction Example
|
||||
===================
|
||||
|
||||
The following is an example of the abstraction model of the L3 part of
|
||||
Mellanox Spectrum ASIC. The blocks are described in the order they appear in
|
||||
the pipeline. The table sizes in the following examples are not real
|
||||
hardware sizes and are provided for demonstration purposes.
|
||||
|
||||
LPM
|
||||
---
|
||||
|
||||
The LPM algorithm can be implemented as a list of hash tables. Each hash
|
||||
table contains routes with the same prefix length. The root of the list is
|
||||
/32, and in case of a miss the hardware will continue to the next hash
|
||||
table. The depth of the search will affect the data path latency.
|
||||
|
||||
In case of a hit the entry contains information about the next stage of the
|
||||
pipeline which resolves the MAC address. The next stage can be either local
|
||||
host table for directly connected routes, or adjacency table for next-hops.
|
||||
The ``meta.lpm_prefix`` field is used to connect two LPM tables.
|
||||
|
||||
.. code::
|
||||
|
||||
table lpm_prefix_16 {
|
||||
size: 4096,
|
||||
counters_enabled: true,
|
||||
match: { meta.vr_id: exact,
|
||||
ipv4.dst_addr: exact_mask,
|
||||
ipv6.dst_addr: exact_mask,
|
||||
meta.lpm_prefix: exact },
|
||||
action: { meta.adj_index: set,
|
||||
meta.adj_group_size: set,
|
||||
meta.rif_port: set,
|
||||
meta.lpm_prefix: set },
|
||||
}
|
||||
|
||||
Local Host
|
||||
----------
|
||||
|
||||
In the case of local routes the LPM lookup already resolves the egress
|
||||
router interface (RIF), yet the exact MAC address is not known. The local
|
||||
host table is a hash table combining the output interface id with
|
||||
destination IP address as a key. The result is the MAC address.
|
||||
|
||||
.. code::
|
||||
|
||||
table local_host {
|
||||
size: 4096,
|
||||
counters_enabled: true,
|
||||
match: { meta.rif_port: exact,
|
||||
ipv4.dst_addr: exact},
|
||||
action: { ethernet.daddr: set }
|
||||
}
|
||||
|
||||
Adjacency
|
||||
---------
|
||||
|
||||
In case of remote routes this table does the ECMP. The LPM lookup results in
|
||||
ECMP group size and index that serves as a global offset into this table.
|
||||
Concurrently a hash of the packet is generated. Based on the ECMP group size
|
||||
and the packet's hash a local offset is generated. Multiple LPM entries can
|
||||
point to the same adjacency group.
|
||||
|
||||
.. code::
|
||||
|
||||
table adjacency {
|
||||
size: 4096,
|
||||
counters_enabled: true,
|
||||
match: { meta.adj_index: exact,
|
||||
meta.adj_group_size: exact,
|
||||
meta.packet_hash_index: exact },
|
||||
action: { ethernet.daddr: set,
|
||||
meta.erif: set }
|
||||
}
|
||||
|
||||
ERIF
|
||||
----
|
||||
|
||||
In case the egress RIF and destination MAC have been resolved by previous
|
||||
tables this table does multiple operations like TTL decrease and MTU check.
|
||||
Then the decision of forward/drop is taken and the port L3 statistics are
|
||||
updated based on the packet's type (broadcast, unicast, multicast).
|
||||
|
||||
.. code::
|
||||
|
||||
table erif {
|
||||
size: 800,
|
||||
counters_enabled: true,
|
||||
match: { meta.rif_port: exact,
|
||||
meta.is_l3_unicast: exact,
|
||||
meta.is_l3_broadcast: exact,
|
||||
meta.is_l3_multicast: exact },
|
||||
action: { meta.l3_drop: set,
|
||||
meta.l3_forward: set }
|
||||
}
|
114
Documentation/networking/devlink/devlink-health.rst
Normal file
114
Documentation/networking/devlink/devlink-health.rst
Normal file
@ -0,0 +1,114 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============
|
||||
Devlink Health
|
||||
==============
|
||||
|
||||
Background
|
||||
==========
|
||||
|
||||
The ``devlink`` health mechanism is targeted for Real Time Alerting, in
|
||||
order to know when something bad happened to a PCI device.
|
||||
|
||||
* Provide alert debug information.
|
||||
* Self healing.
|
||||
* If problem needs vendor support, provide a way to gather all needed
|
||||
debugging information.
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
The main idea is to unify and centralize driver health reports in the
|
||||
generic ``devlink`` instance and allow the user to set different
|
||||
attributes of the health reporting and recovery procedures.
|
||||
|
||||
The ``devlink`` health reporter:
|
||||
Device driver creates a "health reporter" per each error/health type.
|
||||
Error/Health type can be known/generic (e.g. pci error, fw error, rx/tx error)
|
||||
or unknown (driver specific).
|
||||
For each registered health reporter a driver can issue error/health reports
|
||||
asynchronously. All health reports handling is done by ``devlink``.
|
||||
Device driver can provide specific callbacks for each "health reporter", e.g.:
|
||||
|
||||
* Recovery procedures
|
||||
* Diagnostics procedures
|
||||
* Object dump procedures
|
||||
* OOB initial parameters
|
||||
|
||||
Different parts of the driver can register different types of health reporters
|
||||
with different handlers.
|
||||
|
||||
Actions
|
||||
=======
|
||||
|
||||
Once an error is reported, devlink health will perform the following actions:
|
||||
|
||||
* A log is being sent to the kernel trace events buffer
|
||||
* Health status and statistics are being updated for the reporter instance
|
||||
* Object dump is being taken and saved at the reporter instance (as long as
|
||||
there is no other dump which is already stored)
|
||||
* Auto recovery attempt is being done. Depends on:
|
||||
- Auto-recovery configuration
|
||||
- Grace period vs. time passed since last recover
|
||||
|
||||
User Interface
|
||||
==============
|
||||
|
||||
User can access/change each reporter's parameters and driver specific callbacks
|
||||
via ``devlink``, e.g. per error type (per health reporter):
|
||||
|
||||
* Configure reporter's generic parameters (like: disable/enable auto recovery)
|
||||
* Invoke recovery procedure
|
||||
* Run diagnostics
|
||||
* Object dump
|
||||
|
||||
.. list-table:: List of devlink health interfaces
|
||||
:widths: 10 90
|
||||
|
||||
* - Name
|
||||
- Description
|
||||
* - ``DEVLINK_CMD_HEALTH_REPORTER_GET``
|
||||
- Retrieves status and configuration info per DEV and reporter.
|
||||
* - ``DEVLINK_CMD_HEALTH_REPORTER_SET``
|
||||
- Allows reporter-related configuration setting.
|
||||
* - ``DEVLINK_CMD_HEALTH_REPORTER_RECOVER``
|
||||
- Triggers a reporter's recovery procedure.
|
||||
* - ``DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE``
|
||||
- Retrieves diagnostics data from a reporter on a device.
|
||||
* - ``DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET``
|
||||
- Retrieves the last stored dump. Devlink health
|
||||
saves a single dump. If a dump is not already stored by devlink
|
||||
for this reporter, devlink generates a new dump.
|
||||
Dump output is defined by the reporter.
|
||||
* - ``DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR``
|
||||
- Clears the last saved dump file for the specified reporter.
|
||||
|
||||
The following diagram provides a general overview of ``devlink-health``::
|
||||
|
||||
netlink
|
||||
+--------------------------+
|
||||
| |
|
||||
| + |
|
||||
| | |
|
||||
+--------------------------+
|
||||
|request for ops
|
||||
|(diagnose,
|
||||
mlx5_core devlink |recover,
|
||||
|dump)
|
||||
+--------+ +--------------------------+
|
||||
| | | reporter| |
|
||||
| | | +---------v----------+ |
|
||||
| | ops execution | | | |
|
||||
| <----------------------------------+ | |
|
||||
| | | | | |
|
||||
| | | + ^------------------+ |
|
||||
| | | | request for ops |
|
||||
| | | | (recover, dump) |
|
||||
| | | | |
|
||||
| | | +-+------------------+ |
|
||||
| | health report | | health handler | |
|
||||
| +-------------------------------> | |
|
||||
| | | +--------------------+ |
|
||||
| | health reporter create | |
|
||||
| +----------------------------> |
|
||||
+--------+ +--------------------------+
|
100
Documentation/networking/devlink/devlink-info.rst
Normal file
100
Documentation/networking/devlink/devlink-info.rst
Normal file
@ -0,0 +1,100 @@
|
||||
.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
|
||||
============
|
||||
Devlink Info
|
||||
============
|
||||
|
||||
The ``devlink-info`` mechanism enables device drivers to report device
|
||||
information in a generic fashion. It is extensible, and enables exporting
|
||||
even device or driver specific information.
|
||||
|
||||
devlink supports representing the following types of versions
|
||||
|
||||
.. list-table:: List of version types
|
||||
:widths: 5 95
|
||||
|
||||
* - Type
|
||||
- Description
|
||||
* - ``fixed``
|
||||
- Represents fixed versions, which cannot change. For example,
|
||||
component identifiers or the board version reported in the PCI VPD.
|
||||
* - ``running``
|
||||
- Represents the version of the currently running component. For
|
||||
example the running version of firmware. These versions generally
|
||||
only update after a reboot.
|
||||
* - ``stored``
|
||||
- Represents the version of a component as stored, such as after a
|
||||
flash update. Stored values should update to reflect changes in the
|
||||
flash even if a reboot has not yet occurred.
|
||||
|
||||
Generic Versions
|
||||
================
|
||||
|
||||
It is expected that drivers use the following generic names for exporting
|
||||
version information. Other information may be exposed using driver-specific
|
||||
names, but these should be documented in the driver-specific file.
|
||||
|
||||
board.id
|
||||
--------
|
||||
|
||||
Unique identifier of the board design.
|
||||
|
||||
board.rev
|
||||
---------
|
||||
|
||||
Board design revision.
|
||||
|
||||
asic.id
|
||||
-------
|
||||
|
||||
ASIC design identifier.
|
||||
|
||||
asic.rev
|
||||
--------
|
||||
|
||||
ASIC design revision.
|
||||
|
||||
board.manufacture
|
||||
-----------------
|
||||
|
||||
An identifier of the company or the facility which produced the part.
|
||||
|
||||
fw
|
||||
--
|
||||
|
||||
Overall firmware version, often representing the collection of
|
||||
fw.mgmt, fw.app, etc.
|
||||
|
||||
fw.mgmt
|
||||
-------
|
||||
|
||||
Control unit firmware version. This firmware is responsible for house
|
||||
keeping tasks, PHY control etc. but not the packet-by-packet data path
|
||||
operation.
|
||||
|
||||
fw.app
|
||||
------
|
||||
|
||||
Data path microcode controlling high-speed packet processing.
|
||||
|
||||
fw.undi
|
||||
-------
|
||||
|
||||
UNDI software, may include the UEFI driver, firmware or both.
|
||||
|
||||
fw.ncsi
|
||||
-------
|
||||
|
||||
Version of the software responsible for supporting/handling the
|
||||
Network Controller Sideband Interface.
|
||||
|
||||
fw.psid
|
||||
-------
|
||||
|
||||
Unique identifier of the firmware parameter set.
|
||||
|
||||
fw.roce
|
||||
-------
|
||||
|
||||
RoCE firmware version which is responsible for handling RoCE
|
||||
management.
|
108
Documentation/networking/devlink/devlink-params.rst
Normal file
108
Documentation/networking/devlink/devlink-params.rst
Normal file
@ -0,0 +1,108 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============
|
||||
Devlink Params
|
||||
==============
|
||||
|
||||
``devlink`` provides capability for a driver to expose device parameters for low
|
||||
level device functionality. Since devlink can operate at the device-wide
|
||||
level, it can be used to provide configuration that may affect multiple
|
||||
ports on a single device.
|
||||
|
||||
This document describes a number of generic parameters that are supported
|
||||
across multiple drivers. Each driver is also free to add their own
|
||||
parameters. Each driver must document the specific parameters they support,
|
||||
whether generic or not.
|
||||
|
||||
Configuration modes
|
||||
===================
|
||||
|
||||
Parameters may be set in different configuration modes.
|
||||
|
||||
.. list-table:: Possible configuration modes
|
||||
:widths: 5 90
|
||||
|
||||
* - Name
|
||||
- Description
|
||||
* - ``runtime``
|
||||
- set while the driver is running, and takes effect immediately. No
|
||||
reset is required.
|
||||
* - ``driverinit``
|
||||
- applied while the driver initializes. Requires the user to restart
|
||||
the driver using the ``devlink`` reload command.
|
||||
* - ``permanent``
|
||||
- written to the device's non-volatile memory. A hard reset is required
|
||||
for it to take effect.
|
||||
|
||||
Reloading
|
||||
---------
|
||||
|
||||
In order for ``driverinit`` parameters to take effect, the driver must
|
||||
support reloading via the ``devlink-reload`` command. This command will
|
||||
request a reload of the device driver.
|
||||
|
||||
Generic configuration parameters
|
||||
================================
|
||||
The following is a list of generic configuration parameters that drivers may
|
||||
add. Use of generic parameters is preferred over each driver creating their
|
||||
own name.
|
||||
|
||||
.. list-table:: List of generic parameters
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``enable_sriov``
|
||||
- Boolean
|
||||
- Enable Single Root I/O Virtualization (SRIOV) in the device.
|
||||
* - ``ignore_ari``
|
||||
- Boolean
|
||||
- Ignore Alternative Routing-ID Interpretation (ARI) capability. If
|
||||
enabled, the adapter will ignore ARI capability even when the
|
||||
platform has support enabled. The device will create the same number
|
||||
of partitions as when the platform does not support ARI.
|
||||
* - ``msix_vec_per_pf_max``
|
||||
- u32
|
||||
- Provides the maximum number of MSI-X interrupts that a device can
|
||||
create. Value is the same across all physical functions (PFs) in the
|
||||
device.
|
||||
* - ``msix_vec_per_pf_min``
|
||||
- u32
|
||||
- Provides the minimum number of MSI-X interrupts required for the
|
||||
device to initialize. Value is the same across all physical functions
|
||||
(PFs) in the device.
|
||||
* - ``fw_load_policy``
|
||||
- u8
|
||||
- Control the device's firmware loading policy.
|
||||
- ``DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER`` (0)
|
||||
Load firmware version preferred by the driver.
|
||||
- ``DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH`` (1)
|
||||
Load firmware currently stored in flash.
|
||||
- ``DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK`` (2)
|
||||
Load firmware currently available on host's disk.
|
||||
* - ``reset_dev_on_drv_probe``
|
||||
- u8
|
||||
- Controls the device's reset policy on driver probe.
|
||||
- ``DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_UNKNOWN`` (0)
|
||||
Unknown or invalid value.
|
||||
- ``DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_ALWAYS`` (1)
|
||||
Always reset device on driver probe.
|
||||
- ``DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_NEVER`` (2)
|
||||
Never reset device on driver probe.
|
||||
- ``DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_DISK`` (3)
|
||||
Reset the device only if firmware can be found in the filesystem.
|
||||
* - ``enable_roce``
|
||||
- Boolean
|
||||
- Enable handling of RoCE traffic in the device.
|
||||
* - ``internal_err_reset``
|
||||
- Boolean
|
||||
- When enabled, the device driver will reset the device on internal
|
||||
errors.
|
||||
* - ``max_macs``
|
||||
- u32
|
||||
- Specifies the maximum number of MAC addresses per ethernet port of
|
||||
this device.
|
||||
* - ``region_snapshot_enable``
|
||||
- Boolean
|
||||
- Enable capture of ``devlink-region`` snapshots.
|
60
Documentation/networking/devlink/devlink-region.rst
Normal file
60
Documentation/networking/devlink/devlink-region.rst
Normal file
@ -0,0 +1,60 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============
|
||||
Devlink Region
|
||||
==============
|
||||
|
||||
``devlink`` regions enable access to driver defined address regions using
|
||||
devlink.
|
||||
|
||||
Each device can create and register its own supported address regions. The
|
||||
region can then be accessed via the devlink region interface.
|
||||
|
||||
Region snapshots are collected by the driver, and can be accessed via read
|
||||
or dump commands. This allows future analysis on the created snapshots.
|
||||
Regions may optionally support triggering snapshots on demand.
|
||||
|
||||
The major benefit to creating a region is to provide access to internal
|
||||
address regions that are otherwise inaccessible to the user.
|
||||
|
||||
Regions may also be used to provide an additional way to debug complex error
|
||||
states, but see also :doc:`devlink-health`
|
||||
|
||||
example usage
|
||||
-------------
|
||||
|
||||
.. code:: shell
|
||||
|
||||
$ devlink region help
|
||||
$ devlink region show [ DEV/REGION ]
|
||||
$ devlink region del DEV/REGION snapshot SNAPSHOT_ID
|
||||
$ devlink region dump DEV/REGION [ snapshot SNAPSHOT_ID ]
|
||||
$ devlink region read DEV/REGION [ snapshot SNAPSHOT_ID ]
|
||||
address ADDRESS length LENGTH
|
||||
|
||||
# Show all of the exposed regions with region sizes:
|
||||
$ devlink region show
|
||||
pci/0000:00:05.0/cr-space: size 1048576 snapshot [1 2]
|
||||
pci/0000:00:05.0/fw-health: size 64 snapshot [1 2]
|
||||
|
||||
# Delete a snapshot using:
|
||||
$ devlink region del pci/0000:00:05.0/cr-space snapshot 1
|
||||
|
||||
# Trigger (request) a snapshot be taken:
|
||||
$ devlink region trigger pci/0000:00:05.0/cr-space
|
||||
|
||||
# Dump a snapshot:
|
||||
$ devlink region dump pci/0000:00:05.0/fw-health snapshot 1
|
||||
0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30
|
||||
0000000000000010 0000 0000 ffff ff04 0029 8c00 0028 8cc8
|
||||
0000000000000020 0016 0bb8 0016 1720 0000 0000 c00f 3ffc
|
||||
0000000000000030 bada cce5 bada cce5 bada cce5 bada cce5
|
||||
|
||||
# Read a specific part of a snapshot:
|
||||
$ devlink region read pci/0000:00:05.0/fw-health snapshot 1 address 0
|
||||
length 16
|
||||
0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30
|
||||
|
||||
As regions are likely very device or driver specific, no generic regions are
|
||||
defined. See the driver-specific documentation files for information on the
|
||||
specific regions a driver supports.
|
62
Documentation/networking/devlink/devlink-resource.rst
Normal file
62
Documentation/networking/devlink/devlink-resource.rst
Normal file
@ -0,0 +1,62 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
================
|
||||
Devlink Resource
|
||||
================
|
||||
|
||||
``devlink`` provides the ability for drivers to register resources, which
|
||||
can allow administrators to see the device restrictions for a given
|
||||
resource, as well as how much of the given resource is currently
|
||||
in use. Additionally, these resources can optionally have configurable size.
|
||||
This could enable the administrator to limit the number of resources that
|
||||
are used.
|
||||
|
||||
For example, the ``netdevsim`` driver enables ``/IPv4/fib`` and
|
||||
``/IPv4/fib-rules`` as resources to limit the number of IPv4 FIB entries and
|
||||
rules for a given device.
|
||||
|
||||
Resource Ids
|
||||
============
|
||||
|
||||
Each resource is represented by an id, and contains information about its
|
||||
current size and related sub resources. To access a sub resource, you
|
||||
specify the path of the resource. For example ``/IPv4/fib`` is the id for
|
||||
the ``fib`` sub-resource under the ``IPv4`` resource.
|
||||
|
||||
example usage
|
||||
-------------
|
||||
|
||||
The resources exposed by the driver can be observed, for example:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
$devlink resource show pci/0000:03:00.0
|
||||
pci/0000:03:00.0:
|
||||
name kvd size 245760 unit entry
|
||||
resources:
|
||||
name linear size 98304 occ 0 unit entry size_min 0 size_max 147456 size_gran 128
|
||||
name hash_double size 60416 unit entry size_min 32768 size_max 180224 size_gran 128
|
||||
name hash_single size 87040 unit entry size_min 65536 size_max 212992 size_gran 128
|
||||
|
||||
Some resources' sizes can be changed. Examples:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
$devlink resource set pci/0000:03:00.0 path /kvd/hash_single size 73088
|
||||
$devlink resource set pci/0000:03:00.0 path /kvd/hash_double size 74368
|
||||
|
||||
The changes do not apply immediately; this can be validated by the 'size_new'
|
||||
attribute, which represents the pending change in size. For example:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
$devlink resource show pci/0000:03:00.0
|
||||
pci/0000:03:00.0:
|
||||
name kvd size 245760 unit entry size_valid false
|
||||
resources:
|
||||
name linear size 98304 size_new 147456 occ 0 unit entry size_min 0 size_max 147456 size_gran 128
|
||||
name hash_double size 60416 unit entry size_min 32768 size_max 180224 size_gran 128
|
||||
name hash_single size 87040 unit entry size_min 65536 size_max 212992 size_gran 128
|
||||
|
||||
Note that changes in resource size may require a device reload to properly
|
||||
take effect.
|
@ -223,6 +223,21 @@ be added to the following table:
|
||||
* - ``ipv6_lpm_miss``
|
||||
- ``exception``
|
||||
- Traps unicast IPv6 packets that did not match any route
|
||||
* - ``non_routable_packet``
|
||||
- ``drop``
|
||||
- Traps packets that the device decided to drop because they are not
|
||||
supposed to be routed. For example, IGMP queries can be flooded by the
|
||||
device in layer 2 and reach the router. Such packets should not be
|
||||
routed and instead dropped
|
||||
* - ``decap_error``
|
||||
- ``exception``
|
||||
- Traps NVE and IPinIP packets that the device decided to drop because of
|
||||
failure during decapsulation (e.g., packet being too short, reserved
|
||||
bits set in VXLAN header)
|
||||
* - ``overlay_smac_is_mc``
|
||||
- ``drop``
|
||||
- Traps NVE packets that the device decided to drop because their overlay
|
||||
source MAC is multicast
|
||||
|
||||
Driver-specific Packet Traps
|
||||
============================
|
||||
@ -233,7 +248,8 @@ help debug packet drops caused by these exceptions. The following list includes
|
||||
links to the description of driver-specific traps registered by various device
|
||||
drivers:
|
||||
|
||||
* :doc:`devlink-trap-netdevsim`
|
||||
* :doc:`netdevsim`
|
||||
* :doc:`mlxsw`
|
||||
|
||||
Generic Packet Trap Groups
|
||||
==========================
|
||||
@ -258,6 +274,9 @@ narrow. The description of these groups must be added to the following table:
|
||||
* - ``buffer_drops``
|
||||
- Contains packet traps for packets that were dropped by the device due to
|
||||
an enqueue decision
|
||||
* - ``tunnel_drops``
|
||||
- Contains packet traps for packets that were dropped by the device during
|
||||
tunnel encapsulation / decapsulation
|
||||
|
||||
Testing
|
||||
=======
|
42
Documentation/networking/devlink/index.rst
Normal file
42
Documentation/networking/devlink/index.rst
Normal file
@ -0,0 +1,42 @@
|
||||
Linux Devlink Documentation
|
||||
===========================
|
||||
|
||||
devlink is an API to expose device information and resources not directly
|
||||
related to any device class, such as chip-wide/switch-ASIC-wide configuration.
|
||||
|
||||
Interface documentation
|
||||
-----------------------
|
||||
|
||||
The following pages describe various interfaces available through devlink in
|
||||
general.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
devlink-dpipe
|
||||
devlink-health
|
||||
devlink-info
|
||||
devlink-params
|
||||
devlink-region
|
||||
devlink-resource
|
||||
devlink-trap
|
||||
|
||||
Driver-specific documentation
|
||||
-----------------------------
|
||||
|
||||
Each driver that implements ``devlink`` is expected to document what
|
||||
parameters, info versions, and other features it supports.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
bnxt
|
||||
ionic
|
||||
mlx4
|
||||
mlx5
|
||||
mlxsw
|
||||
mv88e6xxx
|
||||
netdevsim
|
||||
nfp
|
||||
qed
|
||||
ti-cpsw-switch
|
29
Documentation/networking/devlink/ionic.rst
Normal file
29
Documentation/networking/devlink/ionic.rst
Normal file
@ -0,0 +1,29 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=====================
|
||||
ionic devlink support
|
||||
=====================
|
||||
|
||||
This document describes the devlink features implemented by the ``ionic``
|
||||
device driver.
|
||||
|
||||
Info versions
|
||||
=============
|
||||
|
||||
The ``ionic`` driver reports the following versions
|
||||
|
||||
.. list-table:: devlink info versions implemented
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``fw``
|
||||
- running
|
||||
- Version of firmware running on the device
|
||||
* - ``asic.id``
|
||||
- fixed
|
||||
- The ASIC type for this device
|
||||
* - ``asic.rev``
|
||||
- fixed
|
||||
- The revision of the ASIC for this device
|
56
Documentation/networking/devlink/mlx4.rst
Normal file
56
Documentation/networking/devlink/mlx4.rst
Normal file
@ -0,0 +1,56 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================
|
||||
mlx4 devlink support
|
||||
====================
|
||||
|
||||
This document describes the devlink features implemented by the ``mlx4``
|
||||
device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
.. list-table:: Generic parameters implemented
|
||||
|
||||
* - Name
|
||||
- Mode
|
||||
* - ``internal_err_reset``
|
||||
- driverinit, runtime
|
||||
* - ``max_macs``
|
||||
- driverinit
|
||||
* - ``region_snapshot_enable``
|
||||
- driverinit, runtime
|
||||
|
||||
The ``mlx4`` driver also implements the following driver-specific
|
||||
parameters.
|
||||
|
||||
.. list-table:: Driver-specific parameters implemented
|
||||
:widths: 5 5 5 85
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Mode
|
||||
- Description
|
||||
* - ``enable_64b_cqe_eqe``
|
||||
- Boolean
|
||||
- driverinit
|
||||
- Enable 64 byte CQEs/EQEs, if the FW supports it.
|
||||
* - ``enable_4k_uar``
|
||||
- Boolean
|
||||
- driverinit
|
||||
- Enable using the 4k UAR.
|
||||
|
||||
The ``mlx4`` driver supports reloading via ``DEVLINK_CMD_RELOAD``
|
||||
|
||||
Regions
|
||||
=======
|
||||
|
||||
The ``mlx4`` driver supports dumping the firmware PCI crspace and health
|
||||
buffer during a critical firmware issue.
|
||||
|
||||
In case a firmware command times out, firmware getting stuck, or a non zero
|
||||
value on the catastrophic buffer, a snapshot will be taken by the driver.
|
||||
|
||||
The ``cr-space`` region will contain the firmware PCI crspace contents. The
|
||||
``fw-health`` region will contain the device firmware's health buffer.
|
||||
Snapshots for both of these regions are taken on the same event triggers.
|
59
Documentation/networking/devlink/mlx5.rst
Normal file
59
Documentation/networking/devlink/mlx5.rst
Normal file
@ -0,0 +1,59 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================
|
||||
mlx5 devlink support
|
||||
====================
|
||||
|
||||
This document describes the devlink features implemented by the ``mlx5``
|
||||
device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
.. list-table:: Generic parameters implemented
|
||||
|
||||
* - Name
|
||||
- Mode
|
||||
* - ``enable_roce``
|
||||
- driverinit
|
||||
|
||||
The ``mlx5`` driver also implements the following driver-specific
|
||||
parameters.
|
||||
|
||||
.. list-table:: Driver-specific parameters implemented
|
||||
:widths: 5 5 5 85
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Mode
|
||||
- Description
|
||||
* - ``flow_steering_mode``
|
||||
- string
|
||||
- runtime
|
||||
- Controls the flow steering mode of the driver
|
||||
|
||||
* ``dmfs`` Device managed flow steering. In DMFS mode, the HW
|
||||
steering entities are created and managed through firmware.
|
||||
* ``smfs`` Software managed flow steering. In SMFS mode, the HW
|
||||
steering entities are created and managed through the driver without
|
||||
firmware intervention.
|
||||
|
||||
The ``mlx5`` driver supports reloading via ``DEVLINK_CMD_RELOAD``
|
||||
|
||||
Info versions
|
||||
=============
|
||||
|
||||
The ``mlx5`` driver reports the following versions
|
||||
|
||||
.. list-table:: devlink info versions implemented
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``fw.psid``
|
||||
- fixed
|
||||
- Used to represent the board id of the device.
|
||||
* - ``fw.version``
|
||||
- stored, running
|
||||
- Three digit major.minor.subminor firmware version number.
|
81
Documentation/networking/devlink/mlxsw.rst
Normal file
81
Documentation/networking/devlink/mlxsw.rst
Normal file
@ -0,0 +1,81 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=====================
|
||||
mlxsw devlink support
|
||||
=====================
|
||||
|
||||
This document describes the devlink features implemented by the ``mlxsw``
|
||||
device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
.. list-table:: Generic parameters implemented
|
||||
|
||||
* - Name
|
||||
- Mode
|
||||
* - ``fw_load_policy``
|
||||
- driverinit
|
||||
|
||||
The ``mlxsw`` driver also implements the following driver-specific
|
||||
parameters.
|
||||
|
||||
.. list-table:: Driver-specific parameters implemented
|
||||
:widths: 5 5 5 85
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Mode
|
||||
- Description
|
||||
* - ``acl_region_rehash_interval``
|
||||
- u32
|
||||
- runtime
|
||||
- Sets an interval for periodic ACL region rehashes. The value is
|
||||
specified in milliseconds, with a minimum of ``3000``. The value of
|
||||
``0`` disables periodic work entirely. The first rehash will be run
|
||||
immediately after the value is set.
|
||||
|
||||
The ``mlxsw`` driver supports reloading via ``DEVLINK_CMD_RELOAD``
|
||||
|
||||
Info versions
|
||||
=============
|
||||
|
||||
The ``mlxsw`` driver reports the following versions
|
||||
|
||||
.. list-table:: devlink info versions implemented
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``hw.revision``
|
||||
- fixed
|
||||
- The hardware revision for this board
|
||||
* - ``fw.psid``
|
||||
- fixed
|
||||
- Firmware PSID
|
||||
* - ``fw.version``
|
||||
- running
|
||||
- Three digit firmware version
|
||||
|
||||
Driver-specific Traps
|
||||
=====================
|
||||
|
||||
.. list-table:: List of Driver-specific Traps Registered by ``mlxsw``
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``irif_disabled``
|
||||
- ``drop``
|
||||
- Traps packets that the device decided to drop because they need to be
|
||||
routed from a disabled router interface (RIF). This can happen during
|
||||
RIF dismantle, when the RIF is first disabled before being removed
|
||||
completely
|
||||
* - ``erif_disabled``
|
||||
- ``drop``
|
||||
- Traps packets that the device decided to drop because they need to be
|
||||
routed through a disabled router interface (RIF). This can happen during
|
||||
RIF dismantle, when the RIF is first disabled before being removed
|
||||
completely
|
28
Documentation/networking/devlink/mv88e6xxx.rst
Normal file
28
Documentation/networking/devlink/mv88e6xxx.rst
Normal file
@ -0,0 +1,28 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=========================
|
||||
mv88e6xxx devlink support
|
||||
=========================
|
||||
|
||||
This document describes the devlink features implemented by the ``mv88e6xxx``
|
||||
device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
The ``mv88e6xxx`` driver implements the following driver-specific parameters.
|
||||
|
||||
.. list-table:: Driver-specific parameters implemented
|
||||
:widths: 5 5 5 85
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Mode
|
||||
- Description
|
||||
* - ``ATU_hash``
|
||||
- u8
|
||||
- runtime
|
||||
- Select one of four possible hashing algorithms for MAC addresses in
|
||||
the Address Translation Unit. A value of 3 may work better than the
|
||||
default of 1 when many MAC addresses have the same OUI. Only the
|
||||
values 0 to 3 are valid for this parameter.
|
72
Documentation/networking/devlink/netdevsim.rst
Normal file
72
Documentation/networking/devlink/netdevsim.rst
Normal file
@ -0,0 +1,72 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=========================
|
||||
netdevsim devlink support
|
||||
=========================
|
||||
|
||||
This document describes the ``devlink`` features supported by the
|
||||
``netdevsim`` device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
.. list-table:: Generic parameters implemented
|
||||
|
||||
* - Name
|
||||
- Mode
|
||||
* - ``max_macs``
|
||||
- driverinit
|
||||
|
||||
The ``netdevsim`` driver also implements the following driver-specific
|
||||
parameters.
|
||||
|
||||
.. list-table:: Driver-specific parameters implemented
|
||||
:widths: 5 5 5 85
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Mode
|
||||
- Description
|
||||
* - ``test1``
|
||||
- Boolean
|
||||
- driverinit
|
||||
- Test parameter used to show how a driver-specific devlink parameter
|
||||
can be implemented.
|
||||
|
||||
The ``netdevsim`` driver supports reloading via ``DEVLINK_CMD_RELOAD``
|
||||
|
||||
Regions
|
||||
=======
|
||||
|
||||
The ``netdevsim`` driver exposes a ``dummy`` region as an example of how the
|
||||
devlink-region interfaces work. A snapshot is taken whenever the
|
||||
``take_snapshot`` debugfs file is written to.
|
||||
|
||||
Resources
|
||||
=========
|
||||
|
||||
The ``netdevsim`` driver exposes resources to control the number of FIB
|
||||
entries and FIB rule entries that the driver will allow.
|
||||
|
||||
.. code:: shell
|
||||
|
||||
$ devlink resource set netdevsim/netdevsim0 path /IPv4/fib size 96
|
||||
$ devlink resource set netdevsim/netdevsim0 path /IPv4/fib-rules size 16
|
||||
$ devlink resource set netdevsim/netdevsim0 path /IPv6/fib size 64
|
||||
$ devlink resource set netdevsim/netdevsim0 path /IPv6/fib-rules size 16
|
||||
$ devlink dev reload netdevsim/netdevsim0
|
||||
|
||||
Driver-specific Traps
|
||||
=====================
|
||||
|
||||
.. list-table:: List of Driver-specific Traps Registered by ``netdevsim``
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``fid_miss``
|
||||
- ``exception``
|
||||
- When a packet enters the device it is classified to a filtering
|
||||
identifier (FID) based on the ingress port and VLAN. This trap is used
|
||||
to trap packets for which a FID could not be found
|
65
Documentation/networking/devlink/nfp.rst
Normal file
65
Documentation/networking/devlink/nfp.rst
Normal file
@ -0,0 +1,65 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===================
|
||||
nfp devlink support
|
||||
===================
|
||||
|
||||
This document describes the devlink features implemented by the ``nfp``
|
||||
device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
.. list-table:: Generic parameters implemented
|
||||
|
||||
* - Name
|
||||
- Mode
|
||||
* - ``fw_load_policy``
|
||||
- permanent
|
||||
* - ``reset_dev_on_drv_probe``
|
||||
- permanent
|
||||
|
||||
Info versions
|
||||
=============
|
||||
|
||||
The ``nfp`` driver reports the following versions
|
||||
|
||||
.. list-table:: devlink info versions implemented
|
||||
:widths: 5 5 90
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - ``board.id``
|
||||
- fixed
|
||||
- Part number identifying the board design
|
||||
* - ``board.rev``
|
||||
- fixed
|
||||
- Revision of the board design
|
||||
* - ``board.manufacture``
|
||||
- fixed
|
||||
- Vendor of the board design
|
||||
* - ``board.model``
|
||||
- fixed
|
||||
- Model name of the board design
|
||||
* - ``fw.bundle_id``
|
||||
- stored, running
|
||||
- Firmware bundle id
|
||||
* - ``fw.mgmt``
|
||||
- stored, running
|
||||
- Version of the management firmware
|
||||
* - ``fw.cpld``
|
||||
- stored, running
|
||||
- The CPLD firmware component version
|
||||
* - ``fw.app``
|
||||
- stored, running
|
||||
- The APP firmware component version
|
||||
* - ``fw.undi``
|
||||
- stored, running
|
||||
- The UNDI firmware component version
|
||||
* - ``fw.ncsi``
|
||||
- stored, running
|
||||
- The NCSI firmware component version
|
||||
* - ``chip.init``
|
||||
- stored, running
|
||||
- The CFGR firmware component version
|
26
Documentation/networking/devlink/qed.rst
Normal file
26
Documentation/networking/devlink/qed.rst
Normal file
@ -0,0 +1,26 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===================
|
||||
qed devlink support
|
||||
===================
|
||||
|
||||
This document describes the devlink features implemented by the ``qed`` core
|
||||
device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
The ``qed`` driver implements the following driver-specific parameters.
|
||||
|
||||
.. list-table:: Driver-specific parameters implemented
|
||||
:widths: 5 5 5 85
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Mode
|
||||
- Description
|
||||
* - ``iwarp_cmt``
|
||||
- Boolean
|
||||
- runtime
|
||||
- Enable iWARP functionality for 100g devices. Note that this impacts
|
||||
L2 performance, and is therefore not enabled by default.
|
31
Documentation/networking/devlink/ti-cpsw-switch.rst
Normal file
31
Documentation/networking/devlink/ti-cpsw-switch.rst
Normal file
@ -0,0 +1,31 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============================
|
||||
ti-cpsw-switch devlink support
|
||||
==============================
|
||||
|
||||
This document describes the devlink features implemented by the ``ti-cpsw-switch``
|
||||
device driver.
|
||||
|
||||
Parameters
|
||||
==========
|
||||
|
||||
The ``ti-cpsw-switch`` driver implements the following driver-specific
|
||||
parameters.
|
||||
|
||||
.. list-table:: Driver-specific parameters implemented
|
||||
:widths: 5 5 5 85
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Mode
|
||||
- Description
|
||||
* - ``ale_bypass``
|
||||
- Boolean
|
||||
- runtime
|
||||
- Enables ALE_CONTROL(4).BYPASS mode for debugging purposes. In this
|
||||
mode, all packets will be sent to the host port only.
|
||||
* - ``switch_mode``
|
||||
- Boolean
|
||||
- runtime
|
||||
- Enable switch mode
|
618
Documentation/networking/ethtool-netlink.rst
Normal file
618
Documentation/networking/ethtool-netlink.rst
Normal file
@ -0,0 +1,618 @@
|
||||
=============================
|
||||
Netlink interface for ethtool
|
||||
=============================
|
||||
|
||||
|
||||
Basic information
|
||||
=================
|
||||
|
||||
Netlink interface for ethtool uses generic netlink family ``ethtool``
|
||||
(userspace application should use macros ``ETHTOOL_GENL_NAME`` and
|
||||
``ETHTOOL_GENL_VERSION`` defined in ``<linux/ethtool_netlink.h>`` uapi
|
||||
header). This family does not use a specific header, all information in
|
||||
requests and replies is passed using netlink attributes.
|
||||
|
||||
The ethtool netlink interface uses extended ACK for error and warning
|
||||
reporting, userspace application developers are encouraged to make these
|
||||
messages available to user in a suitable way.
|
||||
|
||||
Requests can be divided into three categories: "get" (retrieving information),
|
||||
"set" (setting parameters) and "action" (invoking an action).
|
||||
|
||||
All "set" and "action" type requests require admin privileges
|
||||
(``CAP_NET_ADMIN`` in the namespace). Most "get" type requests are allowed for
|
||||
anyone but there are exceptions (where the response contains sensitive
|
||||
information). In some cases, the request as such is allowed for anyone but
|
||||
unprivileged users have attributes with sensitive information (e.g.
|
||||
wake-on-lan password) omitted.
|
||||
|
||||
|
||||
Conventions
|
||||
===========
|
||||
|
||||
Attributes which represent a boolean value usually use NLA_U8 type so that we
|
||||
can distinguish three states: "on", "off" and "not present" (meaning the
|
||||
information is not available in "get" requests or value is not to be changed
|
||||
in "set" requests). For these attributes, the "true" value should be passed as
|
||||
number 1 but any non-zero value should be understood as "true" by recipient.
|
||||
In the tables below, "bool" denotes NLA_U8 attributes interpreted in this way.
|
||||
|
||||
In the message structure descriptions below, if an attribute name is suffixed
|
||||
with "+", parent nest can contain multiple attributes of the same type. This
|
||||
implements an array of entries.
|
||||
|
||||
|
||||
Request header
|
||||
==============
|
||||
|
||||
Each request or reply message contains a nested attribute with common header.
|
||||
Structure of this header is
|
||||
|
||||
============================== ====== =============================
|
||||
``ETHTOOL_A_HEADER_DEV_INDEX`` u32 device ifindex
|
||||
``ETHTOOL_A_HEADER_DEV_NAME`` string device name
|
||||
``ETHTOOL_A_HEADER_FLAGS`` u32 flags common for all requests
|
||||
============================== ====== =============================
|
||||
|
||||
``ETHTOOL_A_HEADER_DEV_INDEX`` and ``ETHTOOL_A_HEADER_DEV_NAME`` identify the
|
||||
device message relates to. One of them is sufficient in requests, if both are
|
||||
used, they must identify the same device. Some requests, e.g. global string
|
||||
sets, do not require device identification. Most ``GET`` requests also allow
|
||||
dump requests without device identification to query the same information for
|
||||
all devices providing it (each device in a separate message).
|
||||
|
||||
``ETHTOOL_A_HEADER_FLAGS`` is a bitmap of request flags common for all request
|
||||
types. The interpretation of these flags is the same for all request types but
|
||||
the flags may not apply to all requests. Recognized flags are:
|
||||
|
||||
================================= ===================================
|
||||
``ETHTOOL_FLAG_COMPACT_BITSETS`` use compact format bitsets in reply
|
||||
``ETHTOOL_FLAG_OMIT_REPLY`` omit optional reply (_SET and _ACT)
|
||||
================================= ===================================
|
||||
|
||||
New request flags should follow the general idea that if the flag is not set,
|
||||
the behaviour is backward compatible, i.e. requests from old clients not aware
|
||||
of the flag should be interpreted the way the client expects. A client must
|
||||
not set flags it does not understand.
|
||||
|
||||
|
||||
Bit sets
|
||||
========
|
||||
|
||||
For short bitmaps of (reasonably) fixed length, standard ``NLA_BITFIELD32``
|
||||
type is used. For arbitrary length bitmaps, ethtool netlink uses a nested
|
||||
attribute with contents of one of two forms: compact (two binary bitmaps
|
||||
representing bit values and mask of affected bits) and bit-by-bit (list of
|
||||
bits identified by either index or name).
|
||||
|
||||
Verbose (bit-by-bit) bitsets allow sending symbolic names for bits together
|
||||
with their values which saves a round trip (when the bitset is passed in a
|
||||
request) or at least a second request (when the bitset is in a reply). This is
|
||||
useful for one shot applications like traditional ethtool command. On the
|
||||
other hand, long running applications like ethtool monitor (displaying
|
||||
notifications) or network management daemons may prefer fetching the names
|
||||
only once and using compact form to save message size. Notifications from
|
||||
ethtool netlink interface always use compact form for bitsets.
|
||||
|
||||
A bitset can represent either a value/mask pair (``ETHTOOL_A_BITSET_NOMASK``
|
||||
not set) or a single bitmap (``ETHTOOL_A_BITSET_NOMASK`` set). In requests
|
||||
modifying a bitmap, the former changes the bits set in mask to values set in
|
||||
value and preserves the rest; the latter sets the bits set in the bitmap and
|
||||
clears the rest.
|
||||
|
||||
Compact form: nested (bitset) attribute contents:
|
||||
|
||||
============================ ====== ============================
|
||||
``ETHTOOL_A_BITSET_NOMASK`` flag no mask, only a list
|
||||
``ETHTOOL_A_BITSET_SIZE`` u32 number of significant bits
|
||||
``ETHTOOL_A_BITSET_VALUE`` binary bitmap of bit values
|
||||
``ETHTOOL_A_BITSET_MASK`` binary bitmap of valid bits
|
||||
============================ ====== ============================
|
||||
|
||||
Value and mask must have length at least ``ETHTOOL_A_BITSET_SIZE`` bits
|
||||
rounded up to a multiple of 32 bits. They consist of 32-bit words in host byte
|
||||
order, words ordered from least significant to most significant (i.e. the same
|
||||
way as bitmaps are passed with ioctl interface).
|
||||
|
||||
For compact form, ``ETHTOOL_A_BITSET_SIZE`` and ``ETHTOOL_A_BITSET_VALUE`` are
|
||||
mandatory. ``ETHTOOL_A_BITSET_MASK`` attribute is mandatory if
|
||||
``ETHTOOL_A_BITSET_NOMASK`` is not set (bitset represents a value/mask pair);
|
||||
if ``ETHTOOL_A_BITSET_NOMASK`` is set, ``ETHTOOL_A_BITSET_MASK`` is not
|
||||
allowed (bitset represents a single bitmap).
|
||||
|
||||
Kernel bit set length may differ from userspace length if older application is
|
||||
used on newer kernel or vice versa. If userspace bitmap is longer, an error is
|
||||
issued only if the request actually tries to set values of some bits not
|
||||
recognized by kernel.
|
||||
|
||||
Bit-by-bit form: nested (bitset) attribute contents:
|
||||
|
||||
+------------------------------------+--------+-----------------------------+
|
||||
| ``ETHTOOL_A_BITSET_NOMASK`` | flag | no mask, only a list |
|
||||
+------------------------------------+--------+-----------------------------+
|
||||
| ``ETHTOOL_A_BITSET_SIZE`` | u32 | number of significant bits |
|
||||
+------------------------------------+--------+-----------------------------+
|
||||
| ``ETHTOOL_A_BITSET_BITS`` | nested | array of bits |
|
||||
+-+----------------------------------+--------+-----------------------------+
|
||||
| | ``ETHTOOL_A_BITSET_BITS_BIT+`` | nested | one bit |
|
||||
+-+-+--------------------------------+--------+-----------------------------+
|
||||
| | | ``ETHTOOL_A_BITSET_BIT_INDEX`` | u32 | bit index (0 for LSB) |
|
||||
+-+-+--------------------------------+--------+-----------------------------+
|
||||
| | | ``ETHTOOL_A_BITSET_BIT_NAME`` | string | bit name |
|
||||
+-+-+--------------------------------+--------+-----------------------------+
|
||||
| | | ``ETHTOOL_A_BITSET_BIT_VALUE`` | flag | present if bit is set |
|
||||
+-+-+--------------------------------+--------+-----------------------------+
|
||||
|
||||
Bit size is optional for bit-by-bit form. ``ETHTOOL_A_BITSET_BITS`` nest can
|
||||
only contain ``ETHTOOL_A_BITSET_BITS_BIT`` attributes but there can be an
|
||||
arbitrary number of them. A bit may be identified by its index or by its
|
||||
name. When used in requests, listed bits are set to 0 or 1 according to
|
||||
``ETHTOOL_A_BITSET_BIT_VALUE``, the rest is preserved. A request fails if
|
||||
index exceeds kernel bit length or if name is not recognized.
|
||||
|
||||
When ``ETHTOOL_A_BITSET_NOMASK`` flag is present, bitset is interpreted as
|
||||
a simple bitmap. ``ETHTOOL_A_BITSET_BIT_VALUE`` attributes are not used in
|
||||
such case. Such bitset represents a bitmap with listed bits set and the rest
|
||||
zero.
|
||||
|
||||
In requests, application can use either form. Form used by kernel in reply is
|
||||
determined by ``ETHTOOL_FLAG_COMPACT_BITSETS`` flag in flags field of request
|
||||
header. Semantics of value and mask depends on the attribute.
|
||||
|
||||
|
||||
List of message types
|
||||
=====================
|
||||
|
||||
All constants identifying message types use ``ETHTOOL_MSG_`` prefix and suffix
|
||||
according to message purpose:
|
||||
|
||||
============== ======================================
|
||||
``_GET`` userspace request to retrieve data
|
||||
``_SET`` userspace request to set data
|
||||
``_ACT`` userspace request to perform an action
|
||||
``_GET_REPLY`` kernel reply to a ``GET`` request
|
||||
``_SET_REPLY`` kernel reply to a ``SET`` request
|
||||
``_ACT_REPLY`` kernel reply to an ``ACT`` request
|
||||
``_NTF`` kernel notification
|
||||
============== ======================================
|
||||
|
||||
Userspace to kernel:
|
||||
|
||||
===================================== ================================
|
||||
``ETHTOOL_MSG_STRSET_GET`` get string set
|
||||
``ETHTOOL_MSG_LINKINFO_GET`` get link settings
|
||||
``ETHTOOL_MSG_LINKINFO_SET`` set link settings
|
||||
``ETHTOOL_MSG_LINKMODES_GET`` get link modes info
|
||||
``ETHTOOL_MSG_LINKMODES_SET`` set link modes info
|
||||
``ETHTOOL_MSG_LINKSTATE_GET`` get link state
|
||||
``ETHTOOL_MSG_DEBUG_GET`` get debugging settings
|
||||
``ETHTOOL_MSG_DEBUG_SET`` set debugging settings
|
||||
``ETHTOOL_MSG_WOL_GET`` get wake-on-lan settings
|
||||
``ETHTOOL_MSG_WOL_SET`` set wake-on-lan settings
|
||||
===================================== ================================
|
||||
|
||||
Kernel to userspace:
|
||||
|
||||
===================================== =================================
|
||||
``ETHTOOL_MSG_STRSET_GET_REPLY`` string set contents
|
||||
``ETHTOOL_MSG_LINKINFO_GET_REPLY`` link settings
|
||||
``ETHTOOL_MSG_LINKINFO_NTF`` link settings notification
|
||||
``ETHTOOL_MSG_LINKMODES_GET_REPLY`` link modes info
|
||||
``ETHTOOL_MSG_LINKMODES_NTF`` link modes notification
|
||||
``ETHTOOL_MSG_LINKSTATE_GET_REPLY`` link state info
|
||||
``ETHTOOL_MSG_DEBUG_GET_REPLY`` debugging settings
|
||||
``ETHTOOL_MSG_DEBUG_NTF`` debugging settings notification
|
||||
``ETHTOOL_MSG_WOL_GET_REPLY`` wake-on-lan settings
|
||||
``ETHTOOL_MSG_WOL_NTF`` wake-on-lan settings notification
|
||||
===================================== =================================
|
||||
|
||||
``GET`` requests are sent by userspace applications to retrieve device
|
||||
information. They usually do not contain any message specific attributes.
|
||||
Kernel replies with corresponding "GET_REPLY" message. For most types, ``GET``
|
||||
request with ``NLM_F_DUMP`` and no device identification can be used to query
|
||||
the information for all devices supporting the request.
|
||||
|
||||
If the data can be also modified, corresponding ``SET`` message with the same
|
||||
layout as corresponding ``GET_REPLY`` is used to request changes. Only
|
||||
attributes where a change is requested are included in such request (also, not
|
||||
all attributes may be changed). Replies to most ``SET`` requests consist only
|
||||
of error code and extack; if kernel provides additional data, it is sent in
|
||||
the form of corresponding ``SET_REPLY`` message which can be suppressed by
|
||||
setting ``ETHTOOL_FLAG_OMIT_REPLY`` flag in request header.
|
||||
|
||||
Data modification also triggers sending a ``NTF`` message with a notification.
|
||||
These usually bear only a subset of attributes which was affected by the
|
||||
change. The same notification is issued if the data is modified using other
|
||||
means (mostly ioctl ethtool interface). Unlike notifications from ethtool
|
||||
netlink code which are only sent if something actually changed, notifications
|
||||
triggered by ioctl interface may be sent even if the request did not actually
|
||||
change any data.
|
||||
|
||||
``ACT`` messages request kernel (driver) to perform a specific action. If some
|
||||
information is reported by kernel (which can be suppressed by setting
|
||||
``ETHTOOL_FLAG_OMIT_REPLY`` flag in request header), the reply takes form of
|
||||
an ``ACT_REPLY`` message. Performing an action also triggers a notification
|
||||
(``NTF`` message).
|
||||
|
||||
Later sections describe the format and semantics of these messages.
|
||||
|
||||
|
||||
STRSET_GET
|
||||
==========
|
||||
|
||||
Requests contents of a string set as provided by ioctl commands
|
||||
``ETHTOOL_GSSET_INFO`` and ``ETHTOOL_GSTRINGS``. String sets are not user
|
||||
writeable so that the corresponding ``STRSET_SET`` message is only used in
|
||||
kernel replies. There are two types of string sets: global (independent of
|
||||
a device, e.g. device feature names) and device specific (e.g. device private
|
||||
flags).
|
||||
|
||||
Request contents:
|
||||
|
||||
+---------------------------------------+--------+------------------------+
|
||||
| ``ETHTOOL_A_STRSET_HEADER`` | nested | request header |
|
||||
+---------------------------------------+--------+------------------------+
|
||||
| ``ETHTOOL_A_STRSET_STRINGSETS`` | nested | string set to request |
|
||||
+-+-------------------------------------+--------+------------------------+
|
||||
| | ``ETHTOOL_A_STRINGSETS_STRINGSET+`` | nested | one string set |
|
||||
+-+-+-----------------------------------+--------+------------------------+
|
||||
| | | ``ETHTOOL_A_STRINGSET_ID`` | u32 | set id |
|
||||
+-+-+-----------------------------------+--------+------------------------+
|
||||
|
||||
Kernel response contents:
|
||||
|
||||
+---------------------------------------+--------+-----------------------+
|
||||
| ``ETHTOOL_A_STRSET_HEADER`` | nested | reply header |
|
||||
+---------------------------------------+--------+-----------------------+
|
||||
| ``ETHTOOL_A_STRSET_STRINGSETS`` | nested | array of string sets |
|
||||
+-+-------------------------------------+--------+-----------------------+
|
||||
| | ``ETHTOOL_A_STRINGSETS_STRINGSET+`` | nested | one string set |
|
||||
+-+-+-----------------------------------+--------+-----------------------+
|
||||
| | | ``ETHTOOL_A_STRINGSET_ID`` | u32 | set id |
|
||||
+-+-+-----------------------------------+--------+-----------------------+
|
||||
| | | ``ETHTOOL_A_STRINGSET_COUNT`` | u32 | number of strings |
|
||||
+-+-+-----------------------------------+--------+-----------------------+
|
||||
| | | ``ETHTOOL_A_STRINGSET_STRINGS`` | nested | array of strings |
|
||||
+-+-+-+---------------------------------+--------+-----------------------+
|
||||
| | | | ``ETHTOOL_A_STRINGS_STRING+`` | nested | one string |
|
||||
+-+-+-+-+-------------------------------+--------+-----------------------+
|
||||
| | | | | ``ETHTOOL_A_STRING_INDEX`` | u32 | string index |
|
||||
+-+-+-+-+-------------------------------+--------+-----------------------+
|
||||
| | | | | ``ETHTOOL_A_STRING_VALUE`` | string | string value |
|
||||
+-+-+-+-+-------------------------------+--------+-----------------------+
|
||||
| ``ETHTOOL_A_STRSET_COUNTS_ONLY`` | flag | return only counts |
|
||||
+---------------------------------------+--------+-----------------------+
|
||||
|
||||
Device identification in request header is optional. Depending on its presence
|
||||
and ``NLM_F_DUMP`` flag, there are three types of ``STRSET_GET`` requests:
|
||||
|
||||
- no ``NLM_F_DUMP``, no device: get "global" stringsets
|
||||
- no ``NLM_F_DUMP``, with device: get string sets related to the device
|
||||
- ``NLM_F_DUMP``, no device: get device related string sets for all devices
|
||||
|
||||
If there is no ``ETHTOOL_A_STRSET_STRINGSETS`` array, all string sets of
|
||||
requested type are returned, otherwise only those specified in the request.
|
||||
Flag ``ETHTOOL_A_STRSET_COUNTS_ONLY`` tells kernel to only return string
|
||||
counts of the sets, not the actual strings.
|
||||
|
||||
|
||||
LINKINFO_GET
|
||||
============
|
||||
|
||||
Requests link settings as provided by ``ETHTOOL_GLINKSETTINGS`` except for
|
||||
link modes and autonegotiation related information. The request does not use
|
||||
any attributes.
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_LINKINFO_HEADER`` nested request header
|
||||
==================================== ====== ==========================
|
||||
|
||||
Kernel response contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_LINKINFO_HEADER`` nested reply header
|
||||
``ETHTOOL_A_LINKINFO_PORT`` u8 physical port
|
||||
``ETHTOOL_A_LINKINFO_PHYADDR`` u8 phy MDIO address
|
||||
``ETHTOOL_A_LINKINFO_TP_MDIX`` u8 MDI(-X) status
|
||||
``ETHTOOL_A_LINKINFO_TP_MDIX_CTRL`` u8 MDI(-X) control
|
||||
``ETHTOOL_A_LINKINFO_TRANSCEIVER`` u8 transceiver
|
||||
==================================== ====== ==========================
|
||||
|
||||
Attributes and their values have the same meaning as matching members of the
|
||||
corresponding ioctl structures.
|
||||
|
||||
``LINKINFO_GET`` allows dump requests (kernel returns reply messages for all
|
||||
devices supporting the request).
|
||||
|
||||
|
||||
LINKINFO_SET
|
||||
============
|
||||
|
||||
``LINKINFO_SET`` request allows setting some of the attributes reported by
|
||||
``LINKINFO_GET``.
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_LINKINFO_HEADER`` nested request header
|
||||
``ETHTOOL_A_LINKINFO_PORT`` u8 physical port
|
||||
``ETHTOOL_A_LINKINFO_PHYADDR`` u8 phy MDIO address
|
||||
``ETHTOOL_A_LINKINFO_TP_MDIX_CTRL`` u8 MDI(-X) control
|
||||
==================================== ====== ==========================
|
||||
|
||||
MDI(-X) status and transceiver cannot be set, request with the corresponding
|
||||
attributes is rejected.
|
||||
|
||||
|
||||
LINKMODES_GET
|
||||
=============
|
||||
|
||||
Requests link modes (supported, advertised and peer advertised) and related
|
||||
information (autonegotiation status, link speed and duplex) as provided by
|
||||
``ETHTOOL_GLINKSETTINGS``. The request does not use any attributes.
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_LINKMODES_HEADER`` nested request header
|
||||
==================================== ====== ==========================
|
||||
|
||||
Kernel response contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_LINKMODES_HEADER`` nested reply header
|
||||
``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status
|
||||
``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes
|
||||
``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes
|
||||
``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s)
|
||||
``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode
|
||||
==================================== ====== ==========================
|
||||
|
||||
For ``ETHTOOL_A_LINKMODES_OURS``, value represents advertised modes and mask
|
||||
represents supported modes. ``ETHTOOL_A_LINKMODES_PEER`` in the reply is a bit
|
||||
list.
|
||||
|
||||
``LINKMODES_GET`` allows dump requests (kernel returns reply messages for all
|
||||
devices supporting the request).
|
||||
|
||||
|
||||
LINKMODES_SET
|
||||
=============
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_LINKMODES_HEADER`` nested request header
|
||||
``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status
|
||||
``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes
|
||||
``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes
|
||||
``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s)
|
||||
``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode
|
||||
==================================== ====== ==========================
|
||||
|
||||
``ETHTOOL_A_LINKMODES_OURS`` bit set allows setting advertised link modes. If
|
||||
autonegotiation is on (either set now or kept from before), advertised modes
|
||||
are not changed (no ``ETHTOOL_A_LINKMODES_OURS`` attribute) and at least one
|
||||
of speed and duplex is specified, kernel adjusts advertised modes to all
|
||||
supported modes matching speed, duplex or both (whatever is specified). This
|
||||
autoselection is done on ethtool side with ioctl interface, netlink interface
|
||||
is supposed to allow requesting changes without knowing what exactly kernel
|
||||
supports.
|
||||
|
||||
|
||||
LINKSTATE_GET
|
||||
=============
|
||||
|
||||
Requests link state information. At the moment, only link up/down flag (as
|
||||
provided by ``ETHTOOL_GLINK`` ioctl command) is provided but some future
|
||||
extensions are planned (e.g. link down reason). This request does not have any
|
||||
attributes.
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_LINKSTATE_HEADER`` nested request header
|
||||
==================================== ====== ==========================
|
||||
|
||||
Kernel response contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_LINKSTATE_HEADER`` nested reply header
|
||||
``ETHTOOL_A_LINKSTATE_LINK`` bool link state (up/down)
|
||||
==================================== ====== ==========================
|
||||
|
||||
For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns
|
||||
carrier flag provided by ``netif_carrier_ok()`` but there are drivers which
|
||||
define their own handler.
|
||||
|
||||
``LINKSTATE_GET`` allows dump requests (kernel returns reply messages for all
|
||||
devices supporting the request).
|
||||
|
||||
|
||||
DEBUG_GET
|
||||
=========
|
||||
|
||||
Requests debugging settings of a device. At the moment, only message mask is
|
||||
provided.
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_DEBUG_HEADER`` nested request header
|
||||
==================================== ====== ==========================
|
||||
|
||||
Kernel response contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_DEBUG_HEADER`` nested reply header
|
||||
``ETHTOOL_A_DEBUG_MSGMASK`` bitset message mask
|
||||
==================================== ====== ==========================
|
||||
|
||||
The message mask (``ETHTOOL_A_DEBUG_MSGMASK``) is equal to message level as
|
||||
provided by ``ETHTOOL_GMSGLVL`` and set by ``ETHTOOL_SMSGLVL`` in ioctl
|
||||
interface. While it is called message level there for historical reasons, most
|
||||
drivers and almost all newer drivers use it as a mask of enabled message
|
||||
classes (represented by ``NETIF_MSG_*`` constants); therefore netlink
|
||||
interface follows its actual use in practice.
|
||||
|
||||
``DEBUG_GET`` allows dump requests (kernel returns reply messages for all
|
||||
devices supporting the request).
|
||||
|
||||
|
||||
DEBUG_SET
|
||||
=========
|
||||
|
||||
Set or update debugging settings of a device. At the moment, only message mask
|
||||
is supported.
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_DEBUG_HEADER`` nested request header
|
||||
``ETHTOOL_A_DEBUG_MSGMASK`` bitset message mask
|
||||
==================================== ====== ==========================
|
||||
|
||||
``ETHTOOL_A_DEBUG_MSGMASK`` bit set allows setting or modifying mask of
|
||||
enabled debugging message types for the device.
|
||||
|
||||
|
||||
WOL_GET
|
||||
=======
|
||||
|
||||
Query device wake-on-lan settings. Unlike most "GET" type requests,
|
||||
``ETHTOOL_MSG_WOL_GET`` requires (netns) ``CAP_NET_ADMIN`` privileges as it
|
||||
(potentially) provides SecureOn(tm) password which is confidential.
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_WOL_HEADER`` nested request header
|
||||
==================================== ====== ==========================
|
||||
|
||||
Kernel response contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_WOL_HEADER`` nested reply header
|
||||
``ETHTOOL_A_WOL_MODES`` bitset mask of enabled WoL modes
|
||||
``ETHTOOL_A_WOL_SOPASS`` binary SecureOn(tm) password
|
||||
==================================== ====== ==========================
|
||||
|
||||
In reply, ``ETHTOOL_A_WOL_MODES`` mask consists of modes supported by the
|
||||
device, value of modes which are enabled. ``ETHTOOL_A_WOL_SOPASS`` is only
|
||||
included in reply if ``WAKE_MAGICSECURE`` mode is supported.
|
||||
|
||||
|
||||
WOL_SET
|
||||
=======
|
||||
|
||||
Set or update wake-on-lan settings.
|
||||
|
||||
Request contents:
|
||||
|
||||
==================================== ====== ==========================
|
||||
``ETHTOOL_A_WOL_HEADER`` nested request header
|
||||
``ETHTOOL_A_WOL_MODES`` bitset enabled WoL modes
|
||||
``ETHTOOL_A_WOL_SOPASS`` binary SecureOn(tm) password
|
||||
==================================== ====== ==========================
|
||||
|
||||
``ETHTOOL_A_WOL_SOPASS`` is only allowed for devices supporting
|
||||
``WAKE_MAGICSECURE`` mode.
|
||||
|
||||
|
||||
Request translation
|
||||
===================
|
||||
|
||||
The following table maps ioctl commands to netlink commands providing their
|
||||
functionality. Entries with "n/a" in right column are commands which do not
|
||||
have their netlink replacement yet.
|
||||
|
||||
=================================== =====================================
|
||||
ioctl command netlink command
|
||||
=================================== =====================================
|
||||
``ETHTOOL_GSET`` ``ETHTOOL_MSG_LINKINFO_GET``
|
||||
``ETHTOOL_MSG_LINKMODES_GET``
|
||||
``ETHTOOL_SSET`` ``ETHTOOL_MSG_LINKINFO_SET``
|
||||
``ETHTOOL_MSG_LINKMODES_SET``
|
||||
``ETHTOOL_GDRVINFO`` n/a
|
||||
``ETHTOOL_GREGS`` n/a
|
||||
``ETHTOOL_GWOL`` ``ETHTOOL_MSG_WOL_GET``
|
||||
``ETHTOOL_SWOL`` ``ETHTOOL_MSG_WOL_SET``
|
||||
``ETHTOOL_GMSGLVL`` ``ETHTOOL_MSG_DEBUG_GET``
|
||||
``ETHTOOL_SMSGLVL`` ``ETHTOOL_MSG_DEBUG_SET``
|
||||
``ETHTOOL_NWAY_RST`` n/a
|
||||
``ETHTOOL_GLINK`` ``ETHTOOL_MSG_LINKSTATE_GET``
|
||||
``ETHTOOL_GEEPROM`` n/a
|
||||
``ETHTOOL_SEEPROM`` n/a
|
||||
``ETHTOOL_GCOALESCE`` n/a
|
||||
``ETHTOOL_SCOALESCE`` n/a
|
||||
``ETHTOOL_GRINGPARAM`` n/a
|
||||
``ETHTOOL_SRINGPARAM`` n/a
|
||||
``ETHTOOL_GPAUSEPARAM`` n/a
|
||||
``ETHTOOL_SPAUSEPARAM`` n/a
|
||||
``ETHTOOL_GRXCSUM`` n/a
|
||||
``ETHTOOL_SRXCSUM`` n/a
|
||||
``ETHTOOL_GTXCSUM`` n/a
|
||||
``ETHTOOL_STXCSUM`` n/a
|
||||
``ETHTOOL_GSG`` n/a
|
||||
``ETHTOOL_SSG`` n/a
|
||||
``ETHTOOL_TEST`` n/a
|
||||
``ETHTOOL_GSTRINGS`` ``ETHTOOL_MSG_STRSET_GET``
|
||||
``ETHTOOL_PHYS_ID`` n/a
|
||||
``ETHTOOL_GSTATS`` n/a
|
||||
``ETHTOOL_GTSO`` n/a
|
||||
``ETHTOOL_STSO`` n/a
|
||||
``ETHTOOL_GPERMADDR`` rtnetlink ``RTM_GETLINK``
|
||||
``ETHTOOL_GUFO`` n/a
|
||||
``ETHTOOL_SUFO`` n/a
|
||||
``ETHTOOL_GGSO`` n/a
|
||||
``ETHTOOL_SGSO`` n/a
|
||||
``ETHTOOL_GFLAGS`` n/a
|
||||
``ETHTOOL_SFLAGS`` n/a
|
||||
``ETHTOOL_GPFLAGS`` n/a
|
||||
``ETHTOOL_SPFLAGS`` n/a
|
||||
``ETHTOOL_GRXFH`` n/a
|
||||
``ETHTOOL_SRXFH`` n/a
|
||||
``ETHTOOL_GGRO`` n/a
|
||||
``ETHTOOL_SGRO`` n/a
|
||||
``ETHTOOL_GRXRINGS`` n/a
|
||||
``ETHTOOL_GRXCLSRLCNT`` n/a
|
||||
``ETHTOOL_GRXCLSRULE`` n/a
|
||||
``ETHTOOL_GRXCLSRLALL`` n/a
|
||||
``ETHTOOL_SRXCLSRLDEL`` n/a
|
||||
``ETHTOOL_SRXCLSRLINS`` n/a
|
||||
``ETHTOOL_FLASHDEV`` n/a
|
||||
``ETHTOOL_RESET`` n/a
|
||||
``ETHTOOL_SRXNTUPLE`` n/a
|
||||
``ETHTOOL_GRXNTUPLE`` n/a
|
||||
``ETHTOOL_GSSET_INFO`` ``ETHTOOL_MSG_STRSET_GET``
|
||||
``ETHTOOL_GRXFHINDIR`` n/a
|
||||
``ETHTOOL_SRXFHINDIR`` n/a
|
||||
``ETHTOOL_GFEATURES`` n/a
|
||||
``ETHTOOL_SFEATURES`` n/a
|
||||
``ETHTOOL_GCHANNELS`` n/a
|
||||
``ETHTOOL_SCHANNELS`` n/a
|
||||
``ETHTOOL_SET_DUMP`` n/a
|
||||
``ETHTOOL_GET_DUMP_FLAG`` n/a
|
||||
``ETHTOOL_GET_DUMP_DATA`` n/a
|
||||
``ETHTOOL_GET_TS_INFO`` n/a
|
||||
``ETHTOOL_GMODULEINFO`` n/a
|
||||
``ETHTOOL_GMODULEEEPROM`` n/a
|
||||
``ETHTOOL_GEEE`` n/a
|
||||
``ETHTOOL_SEEE`` n/a
|
||||
``ETHTOOL_GRSSH`` n/a
|
||||
``ETHTOOL_SRSSH`` n/a
|
||||
``ETHTOOL_GTUNABLE`` n/a
|
||||
``ETHTOOL_STUNABLE`` n/a
|
||||
``ETHTOOL_GPHYSTATS`` n/a
|
||||
``ETHTOOL_PERQUEUE`` n/a
|
||||
``ETHTOOL_GLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_GET``
|
||||
``ETHTOOL_MSG_LINKMODES_GET``
|
||||
``ETHTOOL_SLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_SET``
|
||||
``ETHTOOL_MSG_LINKMODES_SET``
|
||||
``ETHTOOL_PHY_GTUNABLE`` n/a
|
||||
``ETHTOOL_PHY_STUNABLE`` n/a
|
||||
``ETHTOOL_GFECPARAM`` n/a
|
||||
``ETHTOOL_SFECPARAM`` n/a
|
||||
=================================== =====================================
|
@ -13,9 +13,8 @@ Contents:
|
||||
can_ucan_protocol
|
||||
device_drivers/index
|
||||
dsa/index
|
||||
devlink-info-versions
|
||||
devlink-trap
|
||||
devlink-trap-netdevsim
|
||||
devlink/index
|
||||
ethtool-netlink
|
||||
ieee802154
|
||||
j1939
|
||||
kapi
|
||||
|
@ -479,6 +479,10 @@ tcp_no_metrics_save - BOOLEAN
|
||||
degradation. If set, TCP will not cache metrics on closing
|
||||
connections.
|
||||
|
||||
tcp_no_ssthresh_metrics_save - BOOLEAN
|
||||
Controls whether TCP saves ssthresh metrics in the route cache.
|
||||
Default is 1, which disables ssthresh metrics.
|
||||
|
||||
tcp_orphan_retries - INTEGER
|
||||
This value influences the timeout of a locally closed TCP connection,
|
||||
when RTO retransmissions remain unacknowledged.
|
||||
|
@ -267,6 +267,24 @@ Some of the interface modes are described below:
|
||||
duplex, pause or other settings. This is dependent on the MAC and/or
|
||||
PHY behaviour.
|
||||
|
||||
``PHY_INTERFACE_MODE_10GBASER``
|
||||
This is the IEEE 802.3 Clause 49 defined 10GBASE-R protocol used with
|
||||
various different mediums. Please refer to the IEEE standard for a
|
||||
definition of this.
|
||||
|
||||
Note: 10GBASE-R is just one protocol that can be used with XFI and SFI.
|
||||
XFI and SFI permit multiple protocols over a single SERDES lane, and
|
||||
also define the electrical characteristics of the signals with a host
|
||||
compliance board plugged into the host XFP/SFP connector. Therefore,
|
||||
XFI and SFI are not PHY interface types in their own right.
|
||||
|
||||
``PHY_INTERFACE_MODE_10GKR``
|
||||
This is the IEEE 802.3 Clause 49 defined 10GBASE-R with Clause 73
|
||||
autonegotiation. Please refer to the IEEE standard for further
|
||||
information.
|
||||
|
||||
Note: due to legacy usage, some 10GBASE-R usage incorrectly makes
|
||||
use of this definition.
|
||||
|
||||
Pause frames / flow control
|
||||
===========================
|
||||
|
@ -251,7 +251,8 @@ this documentation.
|
||||
phylink_mac_change(priv->phylink, link_is_up);
|
||||
|
||||
where ``link_is_up`` is true if the link is currently up or false
|
||||
otherwise.
|
||||
otherwise. If a MAC is unable to provide these interrupts, then
|
||||
it should set ``priv->phylink_config.pcs_poll = true;`` in step 9.
|
||||
|
||||
11. Verify that the driver does not call::
|
||||
|
||||
|
@ -1,163 +0,0 @@
|
||||
=======================================
|
||||
The padata parallel execution mechanism
|
||||
=======================================
|
||||
|
||||
:Last updated: for 2.6.36
|
||||
|
||||
Padata is a mechanism by which the kernel can farm work out to be done in
|
||||
parallel on multiple CPUs while retaining the ordering of tasks. It was
|
||||
developed for use with the IPsec code, which needs to be able to perform
|
||||
encryption and decryption on large numbers of packets without reordering
|
||||
those packets. The crypto developers made a point of writing padata in a
|
||||
sufficiently general fashion that it could be put to other uses as well.
|
||||
|
||||
The first step in using padata is to set up a padata_instance structure for
|
||||
overall control of how tasks are to be run::
|
||||
|
||||
#include <linux/padata.h>
|
||||
|
||||
struct padata_instance *padata_alloc(const char *name,
|
||||
const struct cpumask *pcpumask,
|
||||
const struct cpumask *cbcpumask);
|
||||
|
||||
'name' simply identifies the instance.
|
||||
|
||||
The pcpumask describes which processors will be used to execute work
|
||||
submitted to this instance in parallel. The cbcpumask defines which
|
||||
processors are allowed to be used as the serialization callback processor.
|
||||
The workqueue wq is where the work will actually be done; it should be
|
||||
a multithreaded queue, naturally.
|
||||
|
||||
To allocate a padata instance with the cpu_possible_mask for both
|
||||
cpumasks this helper function can be used::
|
||||
|
||||
struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq);
|
||||
|
||||
Note: Padata maintains two kinds of cpumasks internally. The user supplied
|
||||
cpumasks, submitted by padata_alloc/padata_alloc_possible and the 'usable'
|
||||
cpumasks. The usable cpumasks are always a subset of active CPUs in the
|
||||
user supplied cpumasks; these are the cpumasks padata actually uses. So
|
||||
it is legal to supply a cpumask to padata that contains offline CPUs.
|
||||
Once an offline CPU in the user supplied cpumask comes online, padata
|
||||
is going to use it.
|
||||
|
||||
There are functions for enabling and disabling the instance::
|
||||
|
||||
int padata_start(struct padata_instance *pinst);
|
||||
void padata_stop(struct padata_instance *pinst);
|
||||
|
||||
These functions are setting or clearing the "PADATA_INIT" flag;
|
||||
if that flag is not set, other functions will refuse to work.
|
||||
padata_start returns zero on success (flag set) or -EINVAL if the
|
||||
padata cpumask contains no active CPU (flag not set).
|
||||
padata_stop clears the flag and blocks until the padata instance
|
||||
is unused.
|
||||
|
||||
The list of CPUs to be used can be adjusted with these functions::
|
||||
|
||||
int padata_set_cpumasks(struct padata_instance *pinst,
|
||||
cpumask_var_t pcpumask,
|
||||
cpumask_var_t cbcpumask);
|
||||
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
|
||||
cpumask_var_t cpumask);
|
||||
int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
|
||||
int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
|
||||
|
||||
Changing the CPU masks are expensive operations, though, so it should not be
|
||||
done with great frequency.
|
||||
|
||||
It's possible to change both cpumasks of a padata instance with
|
||||
padata_set_cpumasks by specifying the cpumasks for parallel execution (pcpumask)
|
||||
and for the serial callback function (cbcpumask). padata_set_cpumask is used to
|
||||
change just one of the cpumasks. Here cpumask_type is one of PADATA_CPU_SERIAL,
|
||||
PADATA_CPU_PARALLEL and cpumask specifies the new cpumask to use.
|
||||
To simply add or remove one CPU from a certain cpumask the functions
|
||||
padata_add_cpu/padata_remove_cpu are used. cpu specifies the CPU to add or
|
||||
remove and mask is one of PADATA_CPU_SERIAL, PADATA_CPU_PARALLEL.
|
||||
|
||||
If a user is interested in padata cpumask changes, he can register to
|
||||
the padata cpumask change notifier::
|
||||
|
||||
int padata_register_cpumask_notifier(struct padata_instance *pinst,
|
||||
struct notifier_block *nblock);
|
||||
|
||||
To unregister from that notifier::
|
||||
|
||||
int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
|
||||
struct notifier_block *nblock);
|
||||
|
||||
The padata cpumask change notifier notifies about changes of the usable
|
||||
cpumasks, i.e. the subset of active CPUs in the user supplied cpumask.
|
||||
|
||||
Padata calls the notifier chain with::
|
||||
|
||||
blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
|
||||
notification_mask,
|
||||
&pd_new->cpumask);
|
||||
|
||||
Here cpumask_change_notifier is registered notifier, notification_mask
|
||||
is one of PADATA_CPU_SERIAL, PADATA_CPU_PARALLEL and cpumask is a pointer
|
||||
to a struct padata_cpumask that contains the new cpumask information.
|
||||
|
||||
Actually submitting work to the padata instance requires the creation of a
|
||||
padata_priv structure::
|
||||
|
||||
struct padata_priv {
|
||||
/* Other stuff here... */
|
||||
void (*parallel)(struct padata_priv *padata);
|
||||
void (*serial)(struct padata_priv *padata);
|
||||
};
|
||||
|
||||
This structure will almost certainly be embedded within some larger
|
||||
structure specific to the work to be done. Most of its fields are private to
|
||||
padata, but the structure should be zeroed at initialisation time, and the
|
||||
parallel() and serial() functions should be provided. Those functions will
|
||||
be called in the process of getting the work done as we will see
|
||||
momentarily.
|
||||
|
||||
The submission of work is done with::
|
||||
|
||||
int padata_do_parallel(struct padata_instance *pinst,
|
||||
struct padata_priv *padata, int cb_cpu);
|
||||
|
||||
The pinst and padata structures must be set up as described above; cb_cpu
|
||||
specifies which CPU will be used for the final callback when the work is
|
||||
done; it must be in the current instance's CPU mask. The return value from
|
||||
padata_do_parallel() is zero on success, indicating that the work is in
|
||||
progress. -EBUSY means that somebody, somewhere else is messing with the
|
||||
instance's CPU mask, while -EINVAL is a complaint about cb_cpu not being
|
||||
in that CPU mask or about a not running instance.
|
||||
|
||||
Each task submitted to padata_do_parallel() will, in turn, be passed to
|
||||
exactly one call to the above-mentioned parallel() function, on one CPU, so
|
||||
true parallelism is achieved by submitting multiple tasks. parallel() runs with
|
||||
software interrupts disabled and thus cannot sleep. The parallel()
|
||||
function gets the padata_priv structure pointer as its lone parameter;
|
||||
information about the actual work to be done is probably obtained by using
|
||||
container_of() to find the enclosing structure.
|
||||
|
||||
Note that parallel() has no return value; the padata subsystem assumes that
|
||||
parallel() will take responsibility for the task from this point. The work
|
||||
need not be completed during this call, but, if parallel() leaves work
|
||||
outstanding, it should be prepared to be called again with a new job before
|
||||
the previous one completes. When a task does complete, parallel() (or
|
||||
whatever function actually finishes the job) should inform padata of the
|
||||
fact with a call to::
|
||||
|
||||
void padata_do_serial(struct padata_priv *padata);
|
||||
|
||||
At some point in the future, padata_do_serial() will trigger a call to the
|
||||
serial() function in the padata_priv structure. That call will happen on
|
||||
the CPU requested in the initial call to padata_do_parallel(); it, too, is
|
||||
run with local software interrupts disabled.
|
||||
Note that this call may be deferred for a while since the padata code takes
|
||||
pains to ensure that tasks are completed in the order in which they were
|
||||
submitted.
|
||||
|
||||
The one remaining function in the padata API should be called to clean up
|
||||
when a padata instance is no longer needed::
|
||||
|
||||
void padata_free(struct padata_instance *pinst);
|
||||
|
||||
This function will busy-wait while any remaining tasks are completed, so it
|
||||
might be best not to call it while there is work outstanding.
|
@ -495,7 +495,8 @@ Module for C-Media CMI8338/8738/8768/8770 PCI sound cards.
|
||||
mpu_port
|
||||
port address of MIDI interface (8338 only):
|
||||
0x300,0x310,0x320,0x330 = legacy port,
|
||||
0 = disable (default)
|
||||
1 = integrated PCI port (default on 8738),
|
||||
0 = disable
|
||||
fm_port
|
||||
port address of OPL-3 FM synthesizer (8x38 only):
|
||||
0x388 = legacy port,
|
||||
|
@ -259,7 +259,7 @@ to details explained in the following section.
|
||||
{
|
||||
struct mychip *chip;
|
||||
int err;
|
||||
static struct snd_device_ops ops = {
|
||||
static const struct snd_device_ops ops = {
|
||||
.dev_free = snd_mychip_dev_free,
|
||||
};
|
||||
|
||||
@ -675,7 +675,7 @@ low-level device with a specified ``ops``,
|
||||
|
||||
::
|
||||
|
||||
static struct snd_device_ops ops = {
|
||||
static const struct snd_device_ops ops = {
|
||||
.dev_free = snd_mychip_dev_free,
|
||||
};
|
||||
....
|
||||
@ -761,7 +761,7 @@ destructor and PCI entries. Example code is shown first, below.
|
||||
{
|
||||
struct mychip *chip;
|
||||
int err;
|
||||
static struct snd_device_ops ops = {
|
||||
static const struct snd_device_ops ops = {
|
||||
.dev_free = snd_mychip_dev_free,
|
||||
};
|
||||
|
||||
@ -3912,7 +3912,7 @@ For a raw-data proc-file, set the attributes as follows:
|
||||
|
||||
::
|
||||
|
||||
static struct snd_info_entry_ops my_file_io_ops = {
|
||||
static const struct snd_info_entry_ops my_file_io_ops = {
|
||||
.read = my_file_io_read,
|
||||
};
|
||||
|
||||
|
@ -112,6 +112,83 @@ kernel are handled by the kernel driver. Other RPC messages will be forwarded to
|
||||
tee-supplicant without further involvement of the driver, except switching
|
||||
shared memory buffer representation.
|
||||
|
||||
AMD-TEE driver
|
||||
==============
|
||||
|
||||
The AMD-TEE driver handles the communication with AMD's TEE environment. The
|
||||
TEE environment is provided by AMD Secure Processor.
|
||||
|
||||
The AMD Secure Processor (formerly called Platform Security Processor or PSP)
|
||||
is a dedicated processor that features ARM TrustZone technology, along with a
|
||||
software-based Trusted Execution Environment (TEE) designed to enable
|
||||
third-party Trusted Applications. This feature is currently enabled only for
|
||||
APUs.
|
||||
|
||||
The following picture shows a high level overview of AMD-TEE::
|
||||
|
||||
|
|
||||
x86 |
|
||||
|
|
||||
User space (Kernel space) | AMD Secure Processor (PSP)
|
||||
~~~~~~~~~~ ~~~~~~~~~~~~~~ | ~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
||||
+--------+ | +-------------+
|
||||
| Client | | | Trusted |
|
||||
+--------+ | | Application |
|
||||
/\ | +-------------+
|
||||
|| | /\
|
||||
|| | ||
|
||||
|| | \/
|
||||
|| | +----------+
|
||||
|| | | TEE |
|
||||
|| | | Internal |
|
||||
\/ | | API |
|
||||
+---------+ +-----------+---------+ +----------+
|
||||
| TEE | | TEE | AMD-TEE | | AMD-TEE |
|
||||
| Client | | subsystem | driver | | Trusted |
|
||||
| API | | | | | OS |
|
||||
+---------+-----------+----+------+---------+---------+----------+
|
||||
| Generic TEE API | | ASP | Mailbox |
|
||||
| IOCTL (TEE_IOC_*) | | driver | Register Protocol |
|
||||
+--------------------------+ +---------+--------------------+
|
||||
|
||||
At the lowest level (in x86), the AMD Secure Processor (ASP) driver uses the
|
||||
CPU to PSP mailbox register to submit commands to the PSP. The format of the
|
||||
command buffer is opaque to the ASP driver. Its role is to submit commands to
|
||||
the secure processor and return results to AMD-TEE driver. The interface
|
||||
between AMD-TEE driver and AMD Secure Processor driver can be found in [6].
|
||||
|
||||
The AMD-TEE driver packages the command buffer payload for processing in TEE.
|
||||
The command buffer format for the different TEE commands can be found in [7].
|
||||
|
||||
The TEE commands supported by AMD-TEE Trusted OS are:
|
||||
* TEE_CMD_ID_LOAD_TA - loads a Trusted Application (TA) binary into
|
||||
TEE environment.
|
||||
* TEE_CMD_ID_UNLOAD_TA - unloads TA binary from TEE environment.
|
||||
* TEE_CMD_ID_OPEN_SESSION - opens a session with a loaded TA.
|
||||
* TEE_CMD_ID_CLOSE_SESSION - closes session with loaded TA
|
||||
* TEE_CMD_ID_INVOKE_CMD - invokes a command with loaded TA
|
||||
* TEE_CMD_ID_MAP_SHARED_MEM - maps shared memory
|
||||
* TEE_CMD_ID_UNMAP_SHARED_MEM - unmaps shared memory
|
||||
|
||||
AMD-TEE Trusted OS is the firmware running on AMD Secure Processor.
|
||||
|
||||
The AMD-TEE driver registers itself with TEE subsystem and implements the
|
||||
following driver function callbacks:
|
||||
|
||||
* get_version - returns the driver implementation id and capability.
|
||||
* open - sets up the driver context data structure.
|
||||
* release - frees up driver resources.
|
||||
* open_session - loads the TA binary and opens session with loaded TA.
|
||||
* close_session - closes session with loaded TA and unloads it.
|
||||
* invoke_func - invokes a command with loaded TA.
|
||||
|
||||
cancel_req driver callback is not supported by AMD-TEE.
|
||||
|
||||
The GlobalPlatform TEE Client API [5] can be used by the user space (client) to
|
||||
talk to AMD's TEE. AMD's TEE provides a secure environment for loading, opening
|
||||
a session, invoking commands and closing session with TA.
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
@ -125,3 +202,7 @@ References
|
||||
|
||||
[5] http://www.globalplatform.org/specificationsdevice.asp look for
|
||||
"TEE Client API Specification v1.0" and click download.
|
||||
|
||||
[6] include/linux/psp-tee.h
|
||||
|
||||
[7] drivers/tee/amdtee/amdtee_if.h
|
||||
|
@ -251,7 +251,7 @@ setting fields in the header, you must make sure only to set fields
|
||||
supported by the protocol version in use.
|
||||
|
||||
|
||||
Details of Harder Fileds
|
||||
Details of Header Fields
|
||||
========================
|
||||
|
||||
For each field, some are information from the kernel to the bootloader
|
||||
|
67
MAINTAINERS
67
MAINTAINERS
@ -694,6 +694,14 @@ L: linux-crypto@vger.kernel.org
|
||||
S: Maintained
|
||||
F: drivers/crypto/allwinner/
|
||||
|
||||
ALLWINNER THERMAL DRIVER
|
||||
M: Vasily Khoruzhick <anarsoul@gmail.com>
|
||||
M: Yangtao Li <tiny.windzz@gmail.com>
|
||||
L: linux-pm@vger.kernel.org
|
||||
S: Maintained
|
||||
F: Documentation/devicetree/bindings/thermal/allwinner,sun8i-a83t-ths.yaml
|
||||
F: drivers/thermal/sun8i_thermal.c
|
||||
|
||||
ALLWINNER VPU DRIVER
|
||||
M: Maxime Ripard <mripard@kernel.org>
|
||||
M: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
|
||||
@ -791,7 +799,6 @@ F: include/uapi/rdma/efa-abi.h
|
||||
|
||||
AMD CRYPTOGRAPHIC COPROCESSOR (CCP) DRIVER
|
||||
M: Tom Lendacky <thomas.lendacky@amd.com>
|
||||
M: Gary Hook <gary.hook@amd.com>
|
||||
L: linux-crypto@vger.kernel.org
|
||||
S: Supported
|
||||
F: drivers/crypto/ccp/
|
||||
@ -4866,6 +4873,7 @@ S: Supported
|
||||
F: net/core/devlink.c
|
||||
F: include/net/devlink.h
|
||||
F: include/uapi/linux/devlink.h
|
||||
F: Documentation/networking/devlink
|
||||
|
||||
DIALOG SEMICONDUCTOR DRIVERS
|
||||
M: Support Opensource <support.opensource@diasemi.com>
|
||||
@ -9171,7 +9179,7 @@ F: arch/x86/include/uapi/asm/svm.h
|
||||
F: arch/x86/include/asm/kvm*
|
||||
F: arch/x86/include/asm/pvclock-abi.h
|
||||
F: arch/x86/include/asm/svm.h
|
||||
F: arch/x86/include/asm/vmx.h
|
||||
F: arch/x86/include/asm/vmx*.h
|
||||
F: arch/x86/kernel/kvm.c
|
||||
F: arch/x86/kernel/kvmclock.c
|
||||
|
||||
@ -9928,7 +9936,7 @@ S: Maintained
|
||||
F: drivers/net/dsa/mv88e6xxx/
|
||||
F: include/linux/platform_data/mv88e6xxx.h
|
||||
F: Documentation/devicetree/bindings/net/dsa/marvell.txt
|
||||
F: Documentation/networking/devlink-params-mv88e6xxx.txt
|
||||
F: Documentation/networking/devlink/mv88e6xxx.rst
|
||||
|
||||
MARVELL ARMADA DRM SUPPORT
|
||||
M: Russell King <linux@armlinux.org.uk>
|
||||
@ -9998,8 +10006,7 @@ F: drivers/net/ethernet/marvell/mvneta.*
|
||||
|
||||
MARVELL MWIFIEX WIRELESS DRIVER
|
||||
M: Amitkumar Karwar <amitkarwar@gmail.com>
|
||||
M: Nishant Sarmukadam <nishants@marvell.com>
|
||||
M: Ganapathi Bhat <gbhat@marvell.com>
|
||||
M: Ganapathi Bhat <ganapathi.bhat@nxp.com>
|
||||
M: Xinming Hu <huxinming820@gmail.com>
|
||||
L: linux-wireless@vger.kernel.org
|
||||
S: Maintained
|
||||
@ -10038,6 +10045,16 @@ M: Jerin Jacob <jerinj@marvell.com>
|
||||
L: netdev@vger.kernel.org
|
||||
S: Supported
|
||||
F: drivers/net/ethernet/marvell/octeontx2/af/
|
||||
F: Documentation/networking/device_drivers/marvell/octeontx2.rst
|
||||
|
||||
MARVELL OCTEONTX2 PHYSICAL FUNCTION DRIVER
|
||||
M: Sunil Goutham <sgoutham@marvell.com>
|
||||
M: Geetha sowjanya <gakula@marvell.com>
|
||||
M: Subbaraya Sundeep <sbhatta@marvell.com>
|
||||
M: hariprasad <hkelam@marvell.com>
|
||||
L: netdev@vger.kernel.org
|
||||
S: Supported
|
||||
F: drivers/net/ethernet/marvell/octeontx2/nic/
|
||||
|
||||
MATROX FRAMEBUFFER DRIVER
|
||||
L: linux-fbdev@vger.kernel.org
|
||||
@ -11432,7 +11449,7 @@ F: Documentation/networking/net_failover.rst
|
||||
|
||||
NETEM NETWORK EMULATOR
|
||||
M: Stephen Hemminger <stephen@networkplumber.org>
|
||||
L: netem@lists.linux-foundation.org (moderated for non-subscribers)
|
||||
L: netdev@vger.kernel.org
|
||||
S: Maintained
|
||||
F: net/sched/sch_netem.c
|
||||
|
||||
@ -11620,6 +11637,18 @@ F: net/ipv6/calipso.c
|
||||
F: net/netfilter/xt_CONNSECMARK.c
|
||||
F: net/netfilter/xt_SECMARK.c
|
||||
|
||||
NETWORKING [MPTCP]
|
||||
M: Mat Martineau <mathew.j.martineau@linux.intel.com>
|
||||
M: Matthieu Baerts <matthieu.baerts@tessares.net>
|
||||
L: netdev@vger.kernel.org
|
||||
L: mptcp@lists.01.org
|
||||
W: https://github.com/multipath-tcp/mptcp_net-next/wiki
|
||||
B: https://github.com/multipath-tcp/mptcp_net-next/issues
|
||||
S: Maintained
|
||||
F: include/net/mptcp.h
|
||||
F: net/mptcp/
|
||||
F: tools/testing/selftests/net/mptcp/
|
||||
|
||||
NETWORKING [TCP]
|
||||
M: Eric Dumazet <edumazet@google.com>
|
||||
L: netdev@vger.kernel.org
|
||||
@ -12478,7 +12507,7 @@ L: linux-crypto@vger.kernel.org
|
||||
S: Maintained
|
||||
F: kernel/padata.c
|
||||
F: include/linux/padata.h
|
||||
F: Documentation/padata.txt
|
||||
F: Documentation/core-api/padata.rst
|
||||
|
||||
PAGE POOL
|
||||
M: Jesper Dangaard Brouer <hawk@kernel.org>
|
||||
@ -13703,6 +13732,13 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git
|
||||
S: Supported
|
||||
F: drivers/net/wireless/ath/ath10k/
|
||||
|
||||
QUALCOMM ATHEROS ATH11K WIRELESS DRIVER
|
||||
M: Kalle Valo <kvalo@codeaurora.org>
|
||||
L: ath11k@lists.infradead.org
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git
|
||||
S: Supported
|
||||
F: drivers/net/wireless/ath/ath11k/
|
||||
|
||||
QUALCOMM ATHEROS ATH9K WIRELESS DRIVER
|
||||
M: QCA ath9k Development <ath9k-devel@qca.qualcomm.com>
|
||||
L: linux-wireless@vger.kernel.org
|
||||
@ -14568,7 +14604,7 @@ F: drivers/media/i2c/s5k5baf.c
|
||||
SAMSUNG S5P Security SubSystem (SSS) DRIVER
|
||||
M: Krzysztof Kozlowski <krzk@kernel.org>
|
||||
M: Vladimir Zapolskiy <vz@mleia.com>
|
||||
M: Kamil Konieczny <k.konieczny@partner.samsung.com>
|
||||
M: Kamil Konieczny <k.konieczny@samsung.com>
|
||||
L: linux-crypto@vger.kernel.org
|
||||
L: linux-samsung-soc@vger.kernel.org
|
||||
S: Maintained
|
||||
@ -15831,6 +15867,7 @@ M: Jose Abreu <joabreu@synopsys.com>
|
||||
L: netdev@vger.kernel.org
|
||||
W: http://www.stlinux.com
|
||||
S: Supported
|
||||
F: Documentation/networking/device_drivers/stmicro/
|
||||
F: drivers/net/ethernet/stmicro/stmmac/
|
||||
|
||||
SUN3/3X
|
||||
@ -16403,12 +16440,15 @@ F: Documentation/devicetree/bindings/thermal/
|
||||
|
||||
THERMAL/CPU_COOLING
|
||||
M: Amit Daniel Kachhap <amit.kachhap@gmail.com>
|
||||
M: Daniel Lezcano <daniel.lezcano@linaro.org>
|
||||
M: Viresh Kumar <viresh.kumar@linaro.org>
|
||||
M: Javi Merino <javi.merino@kernel.org>
|
||||
L: linux-pm@vger.kernel.org
|
||||
S: Supported
|
||||
F: Documentation/driver-api/thermal/cpu-cooling-api.rst
|
||||
F: drivers/thermal/cpu_cooling.c
|
||||
F: Documentation/driver-api/thermal/cpu-idle-cooling.rst
|
||||
F: drivers/thermal/cpufreq_cooling.c
|
||||
F: drivers/thermal/cpuidle_cooling.c
|
||||
F: include/linux/cpu_cooling.h
|
||||
|
||||
THERMAL DRIVER FOR AMLOGIC SOCS
|
||||
@ -17560,6 +17600,7 @@ F: net/vmw_vsock/diag.c
|
||||
F: net/vmw_vsock/af_vsock_tap.c
|
||||
F: net/vmw_vsock/virtio_transport_common.c
|
||||
F: net/vmw_vsock/virtio_transport.c
|
||||
F: net/vmw_vsock/vsock_loopback.c
|
||||
F: drivers/net/vsockmon.c
|
||||
F: drivers/vhost/vsock.c
|
||||
F: tools/testing/vsock/
|
||||
@ -17930,6 +17971,14 @@ L: linux-gpio@vger.kernel.org
|
||||
S: Maintained
|
||||
F: drivers/gpio/gpio-ws16c48.c
|
||||
|
||||
WIREGUARD SECURE NETWORK TUNNEL
|
||||
M: Jason A. Donenfeld <Jason@zx2c4.com>
|
||||
S: Maintained
|
||||
F: drivers/net/wireguard/
|
||||
F: tools/testing/selftests/wireguard/
|
||||
L: wireguard@lists.zx2c4.com
|
||||
L: netdev@vger.kernel.org
|
||||
|
||||
WISTRON LAPTOP BUTTON DRIVER
|
||||
M: Miloslav Trmac <mitr@volny.cz>
|
||||
S: Maintained
|
||||
|
4
arch/alpha/include/asm/vmalloc.h
Normal file
4
arch/alpha/include/asm/vmalloc.h
Normal file
@ -0,0 +1,4 @@
|
||||
#ifndef _ASM_ALPHA_VMALLOC_H
|
||||
#define _ASM_ALPHA_VMALLOC_H
|
||||
|
||||
#endif /* _ASM_ALPHA_VMALLOC_H */
|
@ -13,7 +13,7 @@ config ARC
|
||||
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
|
||||
select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
|
||||
select ARCH_32BIT_OFF_T
|
||||
select BUILDTIME_EXTABLE_SORT
|
||||
select BUILDTIME_TABLE_SORT
|
||||
select CLONE_BACKWARDS
|
||||
select COMMON_CLK
|
||||
select DMA_DIRECT_REMAP
|
||||
|
4
arch/arc/include/asm/vmalloc.h
Normal file
4
arch/arc/include/asm/vmalloc.h
Normal file
@ -0,0 +1,4 @@
|
||||
#ifndef _ASM_ARC_VMALLOC_H
|
||||
#define _ASM_ARC_VMALLOC_H
|
||||
|
||||
#endif /* _ASM_ARC_VMALLOC_H */
|
@ -337,11 +337,11 @@ resume_user_mode_begin:
|
||||
resume_kernel_mode:
|
||||
|
||||
; Disable Interrupts from this point on
|
||||
; CONFIG_PREEMPT: This is a must for preempt_schedule_irq()
|
||||
; !CONFIG_PREEMPT: To ensure restore_regs is intr safe
|
||||
; CONFIG_PREEMPTION: This is a must for preempt_schedule_irq()
|
||||
; !CONFIG_PREEMPTION: To ensure restore_regs is intr safe
|
||||
IRQ_DISABLE r9
|
||||
|
||||
#ifdef CONFIG_PREEMPT
|
||||
#ifdef CONFIG_PREEMPTION
|
||||
|
||||
; Can't preempt if preemption disabled
|
||||
GET_CURR_THR_INFO_FROM_SP r10
|
||||
|
@ -36,7 +36,7 @@ config ARM
|
||||
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
|
||||
select ARCH_WANT_IPC_PARSE_VERSION
|
||||
select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
|
||||
select BUILDTIME_EXTABLE_SORT if MMU
|
||||
select BUILDTIME_TABLE_SORT if MMU
|
||||
select CLONE_BACKWARDS
|
||||
select CPU_PM if SUSPEND || CPU_IDLE
|
||||
select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
|
||||
|
@ -66,6 +66,17 @@
|
||||
IRQ_TYPE_LEVEL_HIGH)>;
|
||||
};
|
||||
|
||||
avs_monitor: avs-monitor@7d5d2000 {
|
||||
compatible = "brcm,bcm2711-avs-monitor",
|
||||
"syscon", "simple-mfd";
|
||||
reg = <0x7d5d2000 0xf00>;
|
||||
|
||||
thermal: thermal {
|
||||
compatible = "brcm,bcm2711-thermal";
|
||||
#thermal-sensor-cells = <0>;
|
||||
};
|
||||
};
|
||||
|
||||
dma: dma@7e007000 {
|
||||
compatible = "brcm,bcm2835-dma";
|
||||
reg = <0x7e007000 0xb00>;
|
||||
@ -363,6 +374,7 @@
|
||||
|
||||
&cpu_thermal {
|
||||
coefficients = <(-487) 410040>;
|
||||
thermal-sensors = <&thermal>;
|
||||
};
|
||||
|
||||
&dsi0 {
|
||||
|
@ -496,6 +496,7 @@ CONFIG_IMX_THERMAL=y
|
||||
CONFIG_ROCKCHIP_THERMAL=y
|
||||
CONFIG_RCAR_THERMAL=y
|
||||
CONFIG_ARMADA_THERMAL=y
|
||||
CONFIG_BCM2711_THERMAL=m
|
||||
CONFIG_BCM2835_THERMAL=m
|
||||
CONFIG_BRCMSTB_THERMAL=m
|
||||
CONFIG_ST_THERMAL_MEMMAP=y
|
||||
|
@ -138,14 +138,8 @@ static int ce_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
int ret;
|
||||
|
||||
ret = ce_aes_expandkey(ctx, in_key, key_len);
|
||||
if (!ret)
|
||||
return 0;
|
||||
|
||||
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
return ce_aes_expandkey(ctx, in_key, key_len);
|
||||
}
|
||||
|
||||
struct crypto_aes_xts_ctx {
|
||||
@ -167,11 +161,7 @@ static int xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
if (!ret)
|
||||
ret = ce_aes_expandkey(&ctx->key2, &in_key[key_len / 2],
|
||||
key_len / 2);
|
||||
if (!ret)
|
||||
return 0;
|
||||
|
||||
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ecb_encrypt(struct skcipher_request *req)
|
||||
|
@ -115,7 +115,7 @@ static int chacha_stream_xor(struct skcipher_request *req,
|
||||
if (nbytes < walk.total)
|
||||
nbytes = round_down(nbytes, walk.stride);
|
||||
|
||||
if (!neon) {
|
||||
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
|
||||
chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
nbytes, state, ctx->nrounds);
|
||||
state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
|
||||
@ -159,7 +159,7 @@ static int do_xchacha(struct skcipher_request *req, bool neon)
|
||||
|
||||
chacha_init_generic(state, ctx->key, req->iv);
|
||||
|
||||
if (!neon) {
|
||||
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
|
||||
hchacha_block_arm(state, subctx.key, ctx->nrounds);
|
||||
} else {
|
||||
kernel_neon_begin();
|
||||
|
@ -54,10 +54,8 @@ static int crc32_setkey(struct crypto_shash *hash, const u8 *key,
|
||||
{
|
||||
u32 *mctx = crypto_shash_ctx(hash);
|
||||
|
||||
if (keylen != sizeof(u32)) {
|
||||
crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
if (keylen != sizeof(u32))
|
||||
return -EINVAL;
|
||||
}
|
||||
*mctx = le32_to_cpup((__le32 *)key);
|
||||
return 0;
|
||||
}
|
||||
|
@ -163,10 +163,8 @@ static int ghash_setkey(struct crypto_shash *tfm,
|
||||
struct ghash_key *key = crypto_shash_ctx(tfm);
|
||||
be128 h;
|
||||
|
||||
if (keylen != GHASH_BLOCK_SIZE) {
|
||||
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
if (keylen != GHASH_BLOCK_SIZE)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* needed for the fallback */
|
||||
memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
|
||||
@ -296,16 +294,11 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
|
||||
{
|
||||
struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
|
||||
struct crypto_ahash *child = &ctx->cryptd_tfm->base;
|
||||
int err;
|
||||
|
||||
crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
|
||||
crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
|
||||
& CRYPTO_TFM_REQ_MASK);
|
||||
err = crypto_ahash_setkey(child, key, keylen);
|
||||
crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
|
||||
& CRYPTO_TFM_RES_MASK);
|
||||
|
||||
return err;
|
||||
return crypto_ahash_setkey(child, key, keylen);
|
||||
}
|
||||
|
||||
static int ghash_async_init_tfm(struct crypto_tfm *tfm)
|
||||
|
@ -20,7 +20,7 @@
|
||||
|
||||
void poly1305_init_arm(void *state, const u8 *key);
|
||||
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce);
|
||||
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
|
||||
|
||||
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
|
||||
{
|
||||
@ -179,9 +179,6 @@ EXPORT_SYMBOL(poly1305_update_arch);
|
||||
|
||||
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||
{
|
||||
__le32 digest[4];
|
||||
u64 f = 0;
|
||||
|
||||
if (unlikely(dctx->buflen)) {
|
||||
dctx->buf[dctx->buflen++] = 1;
|
||||
memset(dctx->buf + dctx->buflen, 0,
|
||||
@ -189,18 +186,7 @@ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||
poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||
}
|
||||
|
||||
poly1305_emit_arm(&dctx->h, digest, dctx->s);
|
||||
|
||||
/* mac = (h + s) % (2^128) */
|
||||
f = (f >> 32) + le32_to_cpu(digest[0]);
|
||||
put_unaligned_le32(f, dst);
|
||||
f = (f >> 32) + le32_to_cpu(digest[1]);
|
||||
put_unaligned_le32(f, dst + 4);
|
||||
f = (f >> 32) + le32_to_cpu(digest[2]);
|
||||
put_unaligned_le32(f, dst + 8);
|
||||
f = (f >> 32) + le32_to_cpu(digest[3]);
|
||||
put_unaligned_le32(f, dst + 12);
|
||||
|
||||
poly1305_emit_arm(&dctx->h, dst, dctx->s);
|
||||
*dctx = (struct poly1305_desc_ctx){};
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_final_arch);
|
||||
|
@ -50,19 +50,16 @@ void efi_virtmap_unload(void);
|
||||
|
||||
/* arch specific definitions used by the stub code */
|
||||
|
||||
#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__)
|
||||
#define __efi_call_early(f, ...) f(__VA_ARGS__)
|
||||
#define efi_call_runtime(f, ...) sys_table_arg->runtime->f(__VA_ARGS__)
|
||||
#define efi_is_64bit() (false)
|
||||
#define efi_bs_call(func, ...) efi_system_table()->boottime->func(__VA_ARGS__)
|
||||
#define efi_rt_call(func, ...) efi_system_table()->runtime->func(__VA_ARGS__)
|
||||
#define efi_is_native() (true)
|
||||
|
||||
#define efi_table_attr(table, attr, instance) \
|
||||
((table##_t *)instance)->attr
|
||||
#define efi_table_attr(inst, attr) (inst->attr)
|
||||
|
||||
#define efi_call_proto(protocol, f, instance, ...) \
|
||||
((protocol##_t *)instance)->f(instance, ##__VA_ARGS__)
|
||||
#define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__)
|
||||
|
||||
struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg);
|
||||
void free_screen_info(efi_system_table_t *sys_table, struct screen_info *si);
|
||||
struct screen_info *alloc_screen_info(void);
|
||||
void free_screen_info(struct screen_info *si);
|
||||
|
||||
static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
|
||||
{
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user