2019-05-27 08:55:01 +02:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-16 15:20:36 -07:00
/*
* Generic address resolution entity
*
* Authors :
* Pedro Roque < roque @ di . fc . ul . pt >
* Alexey Kuznetsov < kuznet @ ms2 . inr . ac . ru >
*
* Fixes :
* Vitaly E . Lavrov releasing NULL neighbor in neigh_add .
* Harald Welte Add neighbour cache statistics like rtstat
*/
2012-05-16 19:58:40 +00:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
# include <linux/slab.h>
2019-01-08 12:30:00 +03:00
# include <linux/kmemleak.h>
2005-04-16 15:20:36 -07:00
# include <linux/types.h>
# include <linux/kernel.h>
# include <linux/module.h>
# include <linux/socket.h>
# include <linux/netdevice.h>
# include <linux/proc_fs.h>
# ifdef CONFIG_SYSCTL
# include <linux/sysctl.h>
# endif
# include <linux/times.h>
2007-09-12 12:01:34 +02:00
# include <net/net_namespace.h>
2005-04-16 15:20:36 -07:00
# include <net/neighbour.h>
2019-05-01 18:18:42 -07:00
# include <net/arp.h>
2005-04-16 15:20:36 -07:00
# include <net/dst.h>
# include <net/sock.h>
2006-07-30 20:43:36 -07:00
# include <net/netevent.h>
2006-08-07 17:53:08 -07:00
# include <net/netlink.h>
2005-04-16 15:20:36 -07:00
# include <linux/rtnetlink.h>
# include <linux/random.h>
2005-06-23 00:09:02 -07:00
# include <linux/string.h>
2007-08-24 22:27:55 -07:00
# include <linux/log2.h>
2013-12-07 19:26:56 +01:00
# include <linux/inetdevice.h>
2013-12-07 19:26:57 +01:00
# include <net/addrconf.h>
2005-04-16 15:20:36 -07:00
2019-02-14 09:15:11 -08:00
# include <trace/events/neigh.h>
2005-04-16 15:20:36 -07:00
# define NEIGH_DEBUG 1
2013-04-15 15:17:19 +00:00
/*
 * neigh_dbg - emit a debug message when @level does not exceed NEIGH_DEBUG.
 * @level: verbosity of this message; compared against NEIGH_DEBUG
 * @fmt:   format string, handed to pr_debug() with the remaining arguments
 *
 * @level is parenthesized so that an expression argument (for example
 * "a | b" or "c ? x : y") is compared as a whole instead of being split
 * by operator precedence around "<=".
 */
#define neigh_dbg(level, fmt, ...)		\
do {						\
	if ((level) <= NEIGH_DEBUG)		\
		pr_debug(fmt, ##__VA_ARGS__);	\
} while (0)
2005-04-16 15:20:36 -07:00
# define PNEIGH_HASHMASK 0xF
treewide: setup_timer() -> timer_setup()
This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.
Casting from unsigned long:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
setup_timer(&ptr->my_timer, my_callback, ptr);
and forced object casts:
void my_callback(struct something *ptr)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);
become:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
Direct function assignments:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
ptr->my_timer.function = my_callback;
have a temporary cast added, along with converting the args:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;
And finally, callbacks without a data assignment:
void my_callback(unsigned long data)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, 0);
have their argument renamed to verify they're unused during conversion:
void my_callback(struct timer_list *unused)
{
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
The conversion is done with the following Coccinelle script:
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup.cocci
@fix_address_of@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
_E->_timer@_stl.function = _callback;
|
_E->_timer@_stl.function = &_callback;
|
_E->_timer@_stl.function = (_cast_func)_callback;
|
_E->_timer@_stl.function = (_cast_func)&_callback;
|
_E._timer@_stl.function = _callback;
|
_E._timer@_stl.function = &_callback;
|
_E._timer@_stl.function = (_cast_func)_callback;
|
_E._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
static void neigh_timer_handler ( struct timer_list * t ) ;
2017-03-19 22:01:28 -07:00
static void __neigh_notify ( struct neighbour * n , int type , int flags ,
u32 pid ) ;
static void neigh_update_notify ( struct neighbour * neigh , u32 nlmsg_pid ) ;
2018-04-12 10:46:55 +02:00
static int pneigh_ifdown_and_unlock ( struct neigh_table * tbl ,
struct net_device * dev ) ;
2005-04-16 15:20:36 -07:00
2005-09-24 16:53:16 -07:00
# ifdef CONFIG_PROC_FS
2018-04-15 10:16:41 +02:00
static const struct seq_operations neigh_stat_seq_ops ;
2005-09-24 16:53:16 -07:00
# endif
2005-04-16 15:20:36 -07:00
/*
Neighbour hash table buckets are protected with rwlock tbl - > lock .
- All the scans / updates to hash buckets MUST be made under this lock .
- NOTHING clever should be made under this lock : no callbacks
to protocol backends , no attempts to send something to network .
It will result in deadlocks , if backend / driver wants to use neighbour
cache .
- If the entry requires some non - trivial actions , increase
its reference count and release table lock .
Neighbour entries are protected :
- with reference count .
- with rwlock neigh - > lock
Reference count prevents destruction .
neigh - > lock mainly serializes ll address data and its validity state .
However, the same lock is also used to protect other entry fields:
- timer
- resolution queue
Again , nothing clever shall be made under neigh - > lock ,
the most complicated procedure , which we allow is dev - > hard_header .
It is supposed , that dev - > hard_header is simplistic and does
not make callbacks to neighbour tables .
*/
2011-07-17 13:34:11 -07:00
/*
 * neigh_blackhole - discard a packet.
 * @neigh: not used by this function
 * @skb:   buffer handed to kfree_skb()
 *
 * Always returns -ENETDOWN.
 */
static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
	const int err = -ENETDOWN;

	kfree_skb(skb);

	return err;
}
2007-08-08 23:12:36 -07:00
static void neigh_cleanup_and_release ( struct neighbour * neigh )
{
2019-02-14 09:15:11 -08:00
trace_neigh_cleanup_and_release ( neigh , 0 ) ;
2017-03-19 22:01:28 -07:00
__neigh_notify ( neigh , RTM_DELNEIGH , 0 , 0 ) ;
2016-12-23 09:32:48 +01:00
call_netevent_notifiers ( NETEVENT_NEIGH_UPDATE , neigh ) ;
2007-08-08 23:12:36 -07:00
neigh_release ( neigh ) ;
}
2005-04-16 15:20:36 -07:00
/*
* It is a random distribution in the interval (1/2)*base...(3/2)*base.
* It corresponds to default IPv6 settings and is not overridable,
* because it is a really reasonable choice.
*/
unsigned long neigh_rand_reach_time(unsigned long base)
{
	/* A zero base is passed through as zero without drawing a number. */
	if (!base)
		return 0;

	/* Half of base plus the value get_random_u32_below(base) yields. */
	return (base >> 1) + get_random_u32_below(base);
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_rand_reach_time ) ;
2005-04-16 15:20:36 -07:00
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
and walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is overkill and contributes to thrashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
static void neigh_mark_dead ( struct neighbour * n )
{
n - > dead = 1 ;
if ( ! list_empty ( & n - > gc_list ) ) {
list_del_init ( & n - > gc_list ) ;
atomic_dec ( & n - > tbl - > gc_entries ) ;
}
2021-10-11 14:12:38 +02:00
if ( ! list_empty ( & n - > managed_list ) )
list_del_init ( & n - > managed_list ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
}
2018-12-11 18:57:21 -07:00
static void neigh_update_gc_list ( struct neighbour * n )
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
{
2018-12-11 18:57:25 -07:00
bool on_gc_list , exempt_from_gc ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
2018-12-11 18:57:21 -07:00
write_lock_bh ( & n - > tbl - > lock ) ;
write_lock ( & n - > lock ) ;
2021-04-22 01:12:22 +05:30
if ( n - > dead )
goto out ;
2018-12-11 18:57:25 -07:00
/* remove from the gc list if new state is permanent or if neighbor
* is externally learned ; otherwise entry should be on the gc list
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
*/
2018-12-11 18:57:25 -07:00
exempt_from_gc = n - > nud_state & NUD_PERMANENT | |
n - > flags & NTF_EXT_LEARNED ;
2018-12-11 18:57:21 -07:00
on_gc_list = ! list_empty ( & n - > gc_list ) ;
2018-12-11 18:57:25 -07:00
if ( exempt_from_gc & & on_gc_list ) {
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
list_del_init ( & n - > gc_list ) ;
atomic_dec ( & n - > tbl - > gc_entries ) ;
2018-12-11 18:57:25 -07:00
} else if ( ! exempt_from_gc & & ! on_gc_list ) {
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
/* add entries to the tail; cleaning removes from the front */
list_add_tail ( & n - > gc_list , & n - > tbl - > gc_list ) ;
atomic_inc ( & n - > tbl - > gc_entries ) ;
}
2021-10-11 14:12:38 +02:00
out :
write_unlock ( & n - > lock ) ;
write_unlock_bh ( & n - > tbl - > lock ) ;
}
static void neigh_update_managed_list ( struct neighbour * n )
{
bool on_managed_list , add_to_managed ;
write_lock_bh ( & n - > tbl - > lock ) ;
write_lock ( & n - > lock ) ;
if ( n - > dead )
goto out ;
add_to_managed = n - > flags & NTF_MANAGED ;
on_managed_list = ! list_empty ( & n - > managed_list ) ;
2018-12-11 18:57:21 -07:00
2021-10-11 14:12:38 +02:00
if ( ! add_to_managed & & on_managed_list )
list_del_init ( & n - > managed_list ) ;
else if ( add_to_managed & & ! on_managed_list )
list_add_tail ( & n - > managed_list , & n - > tbl - > managed_list ) ;
2021-04-22 01:12:22 +05:30
out :
2018-12-11 18:57:21 -07:00
write_unlock ( & n - > lock ) ;
write_unlock_bh ( & n - > tbl - > lock ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
}
2005-04-16 15:20:36 -07:00
2021-10-11 14:12:38 +02:00
/*
 * Apply the NTF_EXT_LEARNED / NTF_MANAGED state requested through the
 * NEIGH_UPDATE_F_* bits in @flags to @neigh->flags.
 *
 * Nothing happens unless NEIGH_UPDATE_F_ADMIN is set in @flags.
 *
 * @notify:         set to 1 when either bit actually changed
 * @gc_update:      set to true when NTF_EXT_LEARNED changed
 * @managed_update: set to true when NTF_MANAGED changed
 *
 * Output parameters are only written when a change is made.
 */
static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
			       bool *gc_update, bool *managed_update)
{
	u32 wanted = 0;
	u32 changed;

	if (!(flags & NEIGH_UPDATE_F_ADMIN))
		return;

	if (flags & NEIGH_UPDATE_F_EXT_LEARNED)
		wanted |= NTF_EXT_LEARNED;
	if (flags & NEIGH_UPDATE_F_MANAGED)
		wanted |= NTF_MANAGED;

	/* bits whose current value differs from the requested one */
	changed = neigh->flags ^ wanted;

	if (changed & NTF_EXT_LEARNED) {
		/* the bit differs from what was asked for, so flipping it
		 * yields the requested value
		 */
		neigh->flags ^= NTF_EXT_LEARNED;
		*notify = 1;
		*gc_update = true;
	}

	if (changed & NTF_MANAGED) {
		neigh->flags ^= NTF_MANAGED;
		*notify = 1;
		*managed_update = true;
	}
}
2018-12-11 18:57:23 -07:00
/*
 * Unlink @n from its hash chain if its reference count is exactly 1.
 *
 * @np is the chain link that currently points at @n; it is redirected
 * to @n's successor.  The lockdep annotation on the successor lookup
 * expects @tbl->lock to be held.
 *
 * Returns true when the entry was unlinked (it is then marked dead and
 * handed to neigh_cleanup_and_release() after n->lock is dropped),
 * false when the entry was left in place.
 */
static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
		      struct neigh_table *tbl)
{
	bool unlinked;

	write_lock(&n->lock);
	unlinked = refcount_read(&n->refcnt) == 1;
	if (unlinked) {
		struct neighbour *successor;

		successor = rcu_dereference_protected(n->next,
						      lockdep_is_held(&tbl->lock));
		rcu_assign_pointer(*np, successor);
		neigh_mark_dead(n);
	}
	write_unlock(&n->lock);

	if (!unlinked)
		return false;

	neigh_cleanup_and_release(n);
	return true;
}
bool neigh_remove_one ( struct neighbour * ndel , struct neigh_table * tbl )
{
struct neigh_hash_table * nht ;
void * pkey = ndel - > primary_key ;
u32 hash_val ;
struct neighbour * n ;
struct neighbour __rcu * * np ;
nht = rcu_dereference_protected ( tbl - > nht ,
lockdep_is_held ( & tbl - > lock ) ) ;
hash_val = tbl - > hash ( pkey , ndel - > dev , nht - > hash_rnd ) ;
hash_val = hash_val > > ( 32 - nht - > hash_shift ) ;
np = & nht - > hash_buckets [ hash_val ] ;
while ( ( n = rcu_dereference_protected ( * np ,
lockdep_is_held ( & tbl - > lock ) ) ) ) {
if ( n = = ndel )
2018-12-11 18:57:23 -07:00
return neigh_del ( n , np , tbl ) ;
2017-06-02 09:01:49 -07:00
np = & n - > next ;
}
return false ;
}
2005-04-16 15:20:36 -07:00
static int neigh_forced_gc ( struct neigh_table * tbl )
{
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
int max_clean = atomic_read ( & tbl - > gc_entries ) - tbl - > gc_thresh2 ;
unsigned long tref = jiffies - 5 * HZ ;
struct neighbour * n , * tmp ;
2005-04-16 15:20:36 -07:00
int shrunk = 0 ;
NEIGH_CACHE_STAT_INC ( tbl , forced_gc_runs ) ;
write_lock_bh ( & tbl - > lock ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
list_for_each_entry_safe ( n , tmp , & tbl - > gc_list , gc_list ) {
if ( refcount_read ( & n - > refcnt ) = = 1 ) {
bool remove = false ;
write_lock ( & n - > lock ) ;
2018-12-11 18:57:22 -07:00
if ( ( n - > nud_state = = NUD_FAILED ) | |
2021-06-07 11:35:30 -06:00
( n - > nud_state = = NUD_NOARP ) | |
2020-11-12 20:58:15 -05:00
( tbl - > is_multicast & &
tbl - > is_multicast ( n - > primary_key ) ) | |
neigh: make sure used and confirmed times are valid
Entries can linger in cache without timer for days, thanks to
the gc_thresh1 limit. As result, without traffic, the confirmed
time can be outdated and to appear to be in the future. Later,
on traffic, NUD_STALE entries can switch to NUD_DELAY and start
the timer which can see the invalid confirmed time and wrongly
switch to NUD_REACHABLE state instead of NUD_PROBE. As result,
timer is set many days in the future. This is more visible on
32-bit platforms, with higher HZ value.
Why this is a problem? While we expect unused entries to expire,
such entries stay in REACHABLE state for too long, locked in
cache. They are not expired normally, only when cache is full.
Problem and the wrong state change reported by Zhang Changzhong:
172.16.1.18 dev bond0 lladdr 0a:0e:0f:01:12:01 ref 1 used 350521/15994171/350520 probes 4 REACHABLE
350520 seconds have elapsed since this entry was last updated, but it is
still in the REACHABLE state (base_reachable_time_ms is 30000),
preventing lladdr from being updated through probe.
Fix it by ensuring timer is started with valid used/confirmed
times. Considering the valid time range is LONG_MAX jiffies,
we try not to go too much in the past while we are in
DELAY/PROBE state. There are also places that need
used/updated times to be validated while timer is not running.
Reported-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-02-02 17:25:51 +02:00
! time_in_range ( n - > updated , tref , jiffies ) )
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
remove = true ;
write_unlock ( & n - > lock ) ;
if ( remove & & neigh_remove_one ( n , tbl ) )
shrunk + + ;
if ( shrunk > = max_clean )
break ;
2005-04-16 15:20:36 -07:00
}
}
tbl - > last_flush = jiffies ;
write_unlock_bh ( & tbl - > lock ) ;
return shrunk ;
}
2007-12-20 15:49:05 -08:00
static void neigh_add_timer ( struct neighbour * n , unsigned long when )
{
neigh: make sure used and confirmed times are valid
Entries can linger in cache without timer for days, thanks to
the gc_thresh1 limit. As result, without traffic, the confirmed
time can be outdated and to appear to be in the future. Later,
on traffic, NUD_STALE entries can switch to NUD_DELAY and start
the timer which can see the invalid confirmed time and wrongly
switch to NUD_REACHABLE state instead of NUD_PROBE. As result,
timer is set many days in the future. This is more visible on
32-bit platforms, with higher HZ value.
Why this is a problem? While we expect unused entries to expire,
such entries stay in REACHABLE state for too long, locked in
cache. They are not expired normally, only when cache is full.
Problem and the wrong state change reported by Zhang Changzhong:
172.16.1.18 dev bond0 lladdr 0a:0e:0f:01:12:01 ref 1 used 350521/15994171/350520 probes 4 REACHABLE
350520 seconds have elapsed since this entry was last updated, but it is
still in the REACHABLE state (base_reachable_time_ms is 30000),
preventing lladdr from being updated through probe.
Fix it by ensuring timer is started with valid used/confirmed
times. Considering the valid time range is LONG_MAX jiffies,
we try not to go too much in the past while we are in
DELAY/PROBE state. There are also places that need
used/updated times to be validated while timer is not running.
Reported-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-02-02 17:25:51 +02:00
/* Use safe distance from the jiffies - LONG_MAX point while timer
* is running in DELAY / PROBE state but still show to user space
* large times in the past .
*/
unsigned long mint = jiffies - ( LONG_MAX - 86400 * HZ ) ;
2007-12-20 15:49:05 -08:00
neigh_hold ( n ) ;
neigh: make sure used and confirmed times are valid
Entries can linger in cache without timer for days, thanks to
the gc_thresh1 limit. As result, without traffic, the confirmed
time can be outdated and to appear to be in the future. Later,
on traffic, NUD_STALE entries can switch to NUD_DELAY and start
the timer which can see the invalid confirmed time and wrongly
switch to NUD_REACHABLE state instead of NUD_PROBE. As result,
timer is set many days in the future. This is more visible on
32-bit platforms, with higher HZ value.
Why this is a problem? While we expect unused entries to expire,
such entries stay in REACHABLE state for too long, locked in
cache. They are not expired normally, only when cache is full.
Problem and the wrong state change reported by Zhang Changzhong:
172.16.1.18 dev bond0 lladdr 0a:0e:0f:01:12:01 ref 1 used 350521/15994171/350520 probes 4 REACHABLE
350520 seconds have elapsed since this entry was last updated, but it is
still in the REACHABLE state (base_reachable_time_ms is 30000),
preventing lladdr from being updated through probe.
Fix it by ensuring timer is started with valid used/confirmed
times. Considering the valid time range is LONG_MAX jiffies,
we try not to go too much in the past while we are in
DELAY/PROBE state. There are also places that need
used/updated times to be validated while timer is not running.
Reported-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-02-02 17:25:51 +02:00
if ( ! time_in_range ( n - > confirmed , mint , jiffies ) )
n - > confirmed = mint ;
if ( time_before ( n - > used , n - > confirmed ) )
n - > used = n - > confirmed ;
2007-12-20 15:49:05 -08:00
if ( unlikely ( mod_timer ( & n - > timer , when ) ) ) {
printk ( " NEIGH: BUG, double timer add, state is %x \n " ,
n - > nud_state ) ;
dump_stack ( ) ;
}
}
2005-04-16 15:20:36 -07:00
static int neigh_del_timer ( struct neighbour * n )
{
if ( ( n - > nud_state & NUD_IN_TIMER ) & &
del_timer ( & n - > timer ) ) {
neigh_release ( n ) ;
return 1 ;
}
return 0 ;
}
net: neigh: decrement the family specific qlen
Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit
per-device") introduced the length counter qlen in struct neigh_parms.
There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and
while the family specific qlen is incremented in pneigh_enqueue(), the
mentioned commit always decrements the IPv4/ARP specific qlen,
regardless of the currently processed family, in pneigh_queue_purge()
and neigh_proxy_process().
As a result, with IPv6/ND, the family specific qlen is only incremented
(and never decremented) until it exceeds PROXY_QLEN, and then, according
to the check in pneigh_enqueue(), neighbor solicitations are not
answered anymore. As an example, this is noted when using the
subnet-router anycast address to access a Linux router. After a certain
amount of time (in the observed case, qlen exceeded PROXY_QLEN after two
days), the Linux router stops answering neighbor solicitations for its
subnet-router anycast address and effectively becomes unreachable.
Another result with IPv6/ND is that the IPv4/ARP specific qlen is
decremented more often than incremented. This leads to negative qlen
values, as a signed integer has been used for the length counter qlen,
and potentially to an integer overflow.
Fix this by introducing the helper function neigh_parms_qlen_dec(),
which decrements the family specific qlen. Thereby, make use of the
existing helper function neigh_get_dev_parms_rcu(), whose definition
therefore needs to be placed earlier in neighbour.c. Take the family
member from struct neigh_table to determine the currently processed
family and appropriately call neigh_parms_qlen_dec() from
pneigh_queue_purge() and neigh_proxy_process().
Additionally, use an unsigned integer for the length counter qlen.
Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device")
Signed-off-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-11-15 23:09:41 +01:00
/*
 * Return the per-device neighbour parameters of @dev for @family.
 *
 * AF_INET is served by __in_dev_arp_parms_get_rcu() and AF_INET6 by
 * __in6_dev_nd_parms_get_rcu(); any other family yields NULL.
 * The "_rcu" helpers suggest the caller must be inside an RCU read-side
 * section (the caller in this file takes rcu_read_lock() first).
 */
static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
						   int family)
{
	if (family == AF_INET)
		return __in_dev_arp_parms_get_rcu(dev);

	if (family == AF_INET6)
		return __in6_dev_nd_parms_get_rcu(dev);

	return NULL;
}
static void neigh_parms_qlen_dec ( struct net_device * dev , int family )
{
struct neigh_parms * p ;
rcu_read_lock ( ) ;
p = neigh_get_dev_parms_rcu ( dev , family ) ;
if ( p )
p - > qlen - - ;
rcu_read_unlock ( ) ;
}
/*
 * Remove queued skbs from @list and free them.
 *
 * @list:   queue to purge; its lock is taken with interrupts saved/disabled
 *          while skbs are unlinked.
 * @net:    when NULL every skb is removed, otherwise only skbs whose device
 *          belongs to @net.
 * @family: address family passed to neigh_parms_qlen_dec() for each removed
 *          skb's device.
 *
 * Matching skbs are moved onto a local list while list->lock is held; the
 * dev_put()/kfree_skb() calls happen only after the lock has been dropped.
 */
static void pneigh_queue_purge ( struct sk_buff_head * list , struct net * net ,
int family )
2005-04-16 15:20:36 -07:00
{
2022-08-22 10:53:46 +08:00
struct sk_buff_head tmp ;
2022-08-11 18:20:11 +03:00
unsigned long flags ;
2005-04-16 15:20:36 -07:00
struct sk_buff * skb ;
2022-08-22 10:53:46 +08:00
skb_queue_head_init ( & tmp ) ;
2022-08-11 18:20:11 +03:00
spin_lock_irqsave ( & list - > lock , flags ) ;
skb = skb_peek ( list ) ;
while ( skb ! = NULL ) {
/* Fetch the successor first: skb may be unlinked from list below. */
struct sk_buff * skb_next = skb_peek_next ( skb , list ) ;
2022-08-11 18:20:12 +03:00
struct net_device * dev = skb - > dev ;
2022-08-22 10:53:46 +08:00
2022-08-11 18:20:12 +03:00
if ( net = = NULL | | net_eq ( dev_net ( dev ) , net ) ) {
net: neigh: decrement the family specific qlen
Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit
per-device") introduced the length counter qlen in struct neigh_parms.
There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and
while the family specific qlen is incremented in pneigh_enqueue(), the
mentioned commit decrements always the IPv4/ARP specific qlen,
regardless of the currently processed family, in pneigh_queue_purge()
and neigh_proxy_process().
As a result, with IPv6/ND, the family specific qlen is only incremented
(and never decremented) until it exceeds PROXY_QLEN, and then, according
to the check in pneigh_enqueue(), neighbor solicitations are not
answered anymore. As an example, this is noted when using the
subnet-router anycast address to access a Linux router. After a certain
amount of time (in the observed case, qlen exceeded PROXY_QLEN after two
days), the Linux router stops answering neighbor solicitations for its
subnet-router anycast address and effectively becomes unreachable.
Another result with IPv6/ND is that the IPv4/ARP specific qlen is
decremented more often than incremented. This leads to negative qlen
values, as a signed integer has been used for the length counter qlen,
and potentially to an integer overflow.
Fix this by introducing the helper function neigh_parms_qlen_dec(),
which decrements the family specific qlen. Thereby, make use of the
existing helper function neigh_get_dev_parms_rcu(), whose definition
therefore needs to be placed earlier in neighbour.c. Take the family
member from struct neigh_table to determine the currently processed
family and appropriately call neigh_parms_qlen_dec() from
pneigh_queue_purge() and neigh_proxy_process().
Additionally, use an unsigned integer for the length counter qlen.
Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device")
Signed-off-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-11-15 23:09:41 +01:00
/* Account for the removal, then move the skb onto the local list. */
neigh_parms_qlen_dec ( dev , family ) ;
2022-08-11 18:20:11 +03:00
__skb_unlink ( skb , list ) ;
2022-08-22 10:53:46 +08:00
__skb_queue_tail ( & tmp , skb ) ;
2022-08-11 18:20:11 +03:00
}
skb = skb_next ;
2005-04-16 15:20:36 -07:00
}
2022-08-11 18:20:11 +03:00
spin_unlock_irqrestore ( & list - > lock , flags ) ;
2022-08-22 10:53:46 +08:00
/* list->lock is no longer held: dev_put() each skb's device and free the skb. */
while ( ( skb = __skb_dequeue ( & tmp ) ) ) {
dev_put ( skb - > dev ) ;
kfree_skb ( skb ) ;
}
2005-04-16 15:20:36 -07:00
}
2018-10-11 20:33:49 -07:00
/*
 * Unlink matching entries from the hash table of @tbl and release them.
 *
 * @tbl:       neighbour table to walk; the lockdep annotations below expect
 *             the caller to hold tbl->lock.
 * @dev:       only entries with n->dev == @dev are flushed; NULL matches
 *             every entry.
 * @skip_perm: when true, entries with NUD_PERMANENT set are left in place.
 *
 * Every flushed entry is unlinked from its bucket, has its timer removed,
 * is marked dead and is handed to neigh_cleanup_and_release(). If the
 * refcount is not 1 at that point, the entry is additionally neutralised:
 * its arp_queue is purged, its output is set to neigh_blackhole and its
 * state becomes NUD_NOARP (if it was NUD_VALID) or NUD_NONE.
 */
static void neigh_flush_dev ( struct neigh_table * tbl , struct net_device * dev ,
bool skip_perm )
2005-04-16 15:20:36 -07:00
{
int i ;
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * nht ;
2005-04-16 15:20:36 -07:00
2010-10-04 06:15:44 +00:00
nht = rcu_dereference_protected ( tbl - > nht ,
lockdep_is_held ( & tbl - > lock ) ) ;
2011-07-11 01:28:12 -07:00
for ( i = 0 ; i < ( 1 < < nht - > hash_shift ) ; i + + ) {
2010-10-06 17:49:21 -07:00
struct neighbour * n ;
/* np always points at the link that references the entry under inspection. */
struct neighbour __rcu * * np = & nht - > hash_buckets [ i ] ;
2005-04-16 15:20:36 -07:00
2010-10-06 17:49:21 -07:00
while ( ( n = rcu_dereference_protected ( * np ,
lockdep_is_held ( & tbl - > lock ) ) ) ! = NULL ) {
2005-04-16 15:20:36 -07:00
/* Entry belongs to another device: keep it and advance. */
if ( dev & & n - > dev ! = dev ) {
np = & n - > next ;
continue ;
}
2018-10-11 20:33:49 -07:00
/* Permanent entry and the caller asked to keep those: advance. */
if ( skip_perm & & n - > nud_state & NUD_PERMANENT ) {
np = & n - > next ;
continue ;
}
2010-10-06 17:49:21 -07:00
/* Unlink n from the bucket chain; np stays put and now references n's successor. */
rcu_assign_pointer ( * np ,
rcu_dereference_protected ( n - > next ,
lockdep_is_held ( & tbl - > lock ) ) ) ;
2005-04-16 15:20:36 -07:00
write_lock ( & n - > lock ) ;
neigh_del_timer ( n ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
neigh_mark_dead ( n ) ;
2017-06-30 13:07:55 +03:00
if ( refcount_read ( & n - > refcnt ) ! = 1 ) {
2005-04-16 15:20:36 -07:00
/* The most unpleasant situation.
We must destroy neighbour entry ,
but someone still uses it .
The destroy will be delayed until
the last user releases us , but
we must kill timers etc . and move
it to safe state .
*/
2013-06-28 02:37:42 -07:00
__skb_queue_purge ( & n - > arp_queue ) ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
n - > arp_queue_len_bytes = 0 ;
2023-09-21 09:27:13 +00:00
WRITE_ONCE ( n - > output , neigh_blackhole ) ;
2005-04-16 15:20:36 -07:00
if ( n - > nud_state & NUD_VALID )
n - > nud_state = NUD_NOARP ;
else
n - > nud_state = NUD_NONE ;
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is stray \n " , n ) ;
2005-04-16 15:20:36 -07:00
}
write_unlock ( & n - > lock ) ;
2007-08-08 23:12:36 -07:00
neigh_cleanup_and_release ( n ) ;
2005-04-16 15:20:36 -07:00
}
}
2005-10-23 17:18:00 +10:00
}
2005-04-16 15:20:36 -07:00
2005-10-23 17:18:00 +10:00
/*
 * Flush the entries of @tbl that belong to @dev.
 *
 * neigh_flush_dev() is called with skip_perm == false, so NUD_PERMANENT
 * entries are flushed as well. tbl->lock is write-locked with bottom
 * halves disabled for the duration of the flush.
 */
void neigh_changeaddr ( struct neigh_table * tbl , struct net_device * dev )
{
write_lock_bh ( & tbl - > lock ) ;
2018-10-11 20:33:49 -07:00
neigh_flush_dev ( tbl , dev , false ) ;
2005-10-23 17:18:00 +10:00
write_unlock_bh ( & tbl - > lock ) ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_changeaddr ) ;
2005-10-23 17:18:00 +10:00
2018-10-11 20:33:49 -07:00
/*
 * Common implementation behind neigh_ifdown() and neigh_carrier_down().
 *
 * @tbl:       neighbour table to clean.
 * @dev:       device whose entries are flushed; when NULL the proxy queue
 *             is purged for every namespace (NULL is passed as the net).
 * @skip_perm: forwarded to neigh_flush_dev(); true keeps NUD_PERMANENT
 *             entries.
 *
 * tbl->lock is write-locked (BH disabled) here and there is no matching
 * unlock in this function: pneigh_ifdown_and_unlock() is, going by its
 * name, expected to release it -- confirm against its definition.
 * Afterwards the proxy queue is purged and, if it ended up empty, the proxy
 * timer is deleted synchronously.
 *
 * Always returns 0.
 */
static int __neigh_ifdown ( struct neigh_table * tbl , struct net_device * dev ,
bool skip_perm )
2005-10-23 17:18:00 +10:00
{
write_lock_bh ( & tbl - > lock ) ;
2018-10-11 20:33:49 -07:00
neigh_flush_dev ( tbl , dev , skip_perm ) ;
2018-04-12 10:46:55 +02:00
pneigh_ifdown_and_unlock ( tbl , dev ) ;
net: neigh: decrement the family specific qlen
Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit
per-device") introduced the length counter qlen in struct neigh_parms.
There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and
while the family specific qlen is incremented in pneigh_enqueue(), the
mentioned commit decrements always the IPv4/ARP specific qlen,
regardless of the currently processed family, in pneigh_queue_purge()
and neigh_proxy_process().
As a result, with IPv6/ND, the family specific qlen is only incremented
(and never decremented) until it exceeds PROXY_QLEN, and then, according
to the check in pneigh_enqueue(), neighbor solicitations are not
answered anymore. As an example, this is noted when using the
subnet-router anycast address to access a Linux router. After a certain
amount of time (in the observed case, qlen exceeded PROXY_QLEN after two
days), the Linux router stops answering neighbor solicitations for its
subnet-router anycast address and effectively becomes unreachable.
Another result with IPv6/ND is that the IPv4/ARP specific qlen is
decremented more often than incremented. This leads to negative qlen
values, as a signed integer has been used for the length counter qlen,
and potentially to an integer overflow.
Fix this by introducing the helper function neigh_parms_qlen_dec(),
which decrements the family specific qlen. Thereby, make use of the
existing helper function neigh_get_dev_parms_rcu(), whose definition
therefore needs to be placed earlier in neighbour.c. Take the family
member from struct neigh_table to determine the currently processed
family and appropriately call neigh_parms_qlen_dec() from
pneigh_queue_purge() and neigh_proxy_process().
Additionally, use an unsigned integer for the length counter qlen.
Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device")
Signed-off-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-11-15 23:09:41 +01:00
pneigh_queue_purge ( & tbl - > proxy_queue , dev ? dev_net ( dev ) : NULL ,
tbl - > family ) ;
2022-08-11 18:20:11 +03:00
/* Nothing left queued for the proxy timer to process: stop it. */
if ( skb_queue_empty_lockless ( & tbl - > proxy_queue ) )
del_timer_sync ( & tbl - > proxy_timer ) ;
2005-04-16 15:20:36 -07:00
return 0 ;
}
2018-10-11 20:33:49 -07:00
/*
 * Flush the entries of @tbl for @dev, keeping NUD_PERMANENT entries
 * (skip_perm == true).
 *
 * Returns 0; __neigh_ifdown() has no other return value.
 */
int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev)
{
	return __neigh_ifdown(tbl, dev, true);
}
EXPORT_SYMBOL(neigh_carrier_down);
/*
 * Flush the entries of @tbl for @dev, including NUD_PERMANENT entries
 * (skip_perm == false).
 *
 * Returns 0; __neigh_ifdown() has no other return value.
 */
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
	return __neigh_ifdown(tbl, dev, false);
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_ifdown ) ;
2005-04-16 15:20:36 -07:00
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago, gc kicks in and
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is overkill and contributes to thrashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
static struct neighbour * neigh_alloc ( struct neigh_table * tbl ,
struct net_device * dev ,
2021-10-11 14:12:37 +02:00
u32 flags , bool exempt_from_gc )
2005-04-16 15:20:36 -07:00
{
struct neighbour * n = NULL ;
unsigned long now = jiffies ;
int entries ;
2018-12-11 18:57:25 -07:00
if ( exempt_from_gc )
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago, gc kicks in and
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is overkill and contributes to thrashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
goto do_alloc ;
entries = atomic_inc_return ( & tbl - > gc_entries ) - 1 ;
2005-04-16 15:20:36 -07:00
if ( entries > = tbl - > gc_thresh3 | |
( entries > = tbl - > gc_thresh2 & &
time_after ( now , tbl - > last_flush + 5 * HZ ) ) ) {
if ( ! neigh_forced_gc ( tbl ) & &
2015-08-07 11:10:37 -07:00
entries > = tbl - > gc_thresh3 ) {
net_info_ratelimited ( " %s: neighbor table overflow! \n " ,
tbl - > id ) ;
NEIGH_CACHE_STAT_INC ( tbl , table_fulls ) ;
2005-04-16 15:20:36 -07:00
goto out_entries ;
2015-08-07 11:10:37 -07:00
}
2005-04-16 15:20:36 -07:00
}
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago, gc kicks in and
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is overkill and contributes to thrashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
do_alloc :
2013-01-24 00:44:23 +00:00
n = kzalloc ( tbl - > entry_size + dev - > neigh_priv_len , GFP_ATOMIC ) ;
2005-04-16 15:20:36 -07:00
if ( ! n )
goto out_entries ;
2013-06-28 02:37:42 -07:00
__skb_queue_head_init ( & n - > arp_queue ) ;
2005-04-16 15:20:36 -07:00
rwlock_init ( & n - > lock ) ;
2010-10-07 10:44:07 +00:00
seqlock_init ( & n - > ha_lock ) ;
2005-04-16 15:20:36 -07:00
n - > updated = n - > used = now ;
n - > nud_state = NUD_NONE ;
n - > output = neigh_blackhole ;
2021-10-11 14:12:35 +02:00
n - > flags = flags ;
2011-07-14 07:53:20 -07:00
seqlock_init ( & n - > hh . hh_lock ) ;
2005-04-16 15:20:36 -07:00
n - > parms = neigh_parms_clone ( & tbl - > parms ) ;
treewide: setup_timer() -> timer_setup()
This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.
Casting from unsigned long:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
setup_timer(&ptr->my_timer, my_callback, ptr);
and forced object casts:
void my_callback(struct something *ptr)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);
become:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
Direct function assignments:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
ptr->my_timer.function = my_callback;
have a temporary cast added, along with converting the args:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;
And finally, callbacks without a data assignment:
void my_callback(unsigned long data)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, 0);
have their argument renamed to verify they're unused during conversion:
void my_callback(struct timer_list *unused)
{
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
The conversion is done with the following Coccinelle script:
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup.cocci
@fix_address_of@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
_E->_timer@_stl.function = _callback;
|
_E->_timer@_stl.function = &_callback;
|
_E->_timer@_stl.function = (_cast_func)_callback;
|
_E->_timer@_stl.function = (_cast_func)&_callback;
|
_E._timer@_stl.function = _callback;
|
_E._timer@_stl.function = &_callback;
|
_E._timer@_stl.function = (_cast_func)_callback;
|
_E._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
timer_setup ( & n - > timer , neigh_timer_handler , 0 ) ;
2005-04-16 15:20:36 -07:00
NEIGH_CACHE_STAT_INC ( tbl , allocs ) ;
n - > tbl = tbl ;
2017-06-30 13:07:55 +03:00
refcount_set ( & n - > refcnt , 1 ) ;
2005-04-16 15:20:36 -07:00
n - > dead = 1 ;
2018-12-10 13:54:07 -08:00
INIT_LIST_HEAD ( & n - > gc_list ) ;
2021-10-11 14:12:38 +02:00
INIT_LIST_HEAD ( & n - > managed_list ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago, gc kicks in and
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is overkill and contributes to thrashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
atomic_inc ( & tbl - > entries ) ;
2005-04-16 15:20:36 -07:00
out :
return n ;
out_entries :
2018-12-11 18:57:25 -07:00
if ( ! exempt_from_gc )
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago, gc kicks in and
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is overkill and contributes to thrashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
atomic_dec ( & tbl - > gc_entries ) ;
2005-04-16 15:20:36 -07:00
goto out ;
}
2011-12-28 15:06:58 -05:00
/*
 * Store a fresh random hash seed in *x.  The lowest bit is forced on, so
 * the stored value is always odd and therefore never zero.
 */
static void neigh_get_hash_rnd(u32 *x)
{
	u32 seed = get_random_u32();

	seed |= 1;
	*x = seed;
}
2011-07-11 01:28:12 -07:00
static struct neigh_hash_table * neigh_hash_alloc ( unsigned int shift )
2005-04-16 15:20:36 -07:00
{
2011-07-11 01:28:12 -07:00
size_t size = ( 1 < < shift ) * sizeof ( struct neighbour * ) ;
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * ret ;
2011-01-19 22:02:47 +00:00
struct neighbour __rcu * * buckets ;
2011-12-28 15:06:58 -05:00
int i ;
2005-04-16 15:20:36 -07:00
2010-10-04 06:15:44 +00:00
ret = kmalloc ( sizeof ( * ret ) , GFP_ATOMIC ) ;
if ( ! ret )
return NULL ;
2019-01-08 12:30:00 +03:00
if ( size < = PAGE_SIZE ) {
2010-10-04 06:15:44 +00:00
buckets = kzalloc ( size , GFP_ATOMIC ) ;
2019-01-08 12:30:00 +03:00
} else {
2011-01-19 22:02:47 +00:00
buckets = ( struct neighbour __rcu * * )
2010-10-04 06:15:44 +00:00
__get_free_pages ( GFP_ATOMIC | __GFP_ZERO ,
get_order ( size ) ) ;
2019-01-14 13:38:43 +03:00
kmemleak_alloc ( buckets , size , 1 , GFP_ATOMIC ) ;
2019-01-08 12:30:00 +03:00
}
2010-10-04 06:15:44 +00:00
if ( ! buckets ) {
kfree ( ret ) ;
return NULL ;
2005-04-16 15:20:36 -07:00
}
2011-01-19 22:02:47 +00:00
ret - > hash_buckets = buckets ;
2011-07-11 01:28:12 -07:00
ret - > hash_shift = shift ;
2011-12-28 15:06:58 -05:00
for ( i = 0 ; i < NEIGH_NUM_HASH_RND ; i + + )
neigh_get_hash_rnd ( & ret - > hash_rnd [ i ] ) ;
2005-04-16 15:20:36 -07:00
return ret ;
}
2010-10-04 06:15:44 +00:00
static void neigh_hash_free_rcu ( struct rcu_head * head )
2005-04-16 15:20:36 -07:00
{
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * nht = container_of ( head ,
struct neigh_hash_table ,
rcu ) ;
2011-07-11 01:28:12 -07:00
size_t size = ( 1 < < nht - > hash_shift ) * sizeof ( struct neighbour * ) ;
2011-01-19 22:02:47 +00:00
struct neighbour __rcu * * buckets = nht - > hash_buckets ;
2005-04-16 15:20:36 -07:00
2019-01-08 12:30:00 +03:00
if ( size < = PAGE_SIZE ) {
2010-10-04 06:15:44 +00:00
kfree ( buckets ) ;
2019-01-08 12:30:00 +03:00
} else {
kmemleak_free ( buckets ) ;
2010-10-04 06:15:44 +00:00
free_pages ( ( unsigned long ) buckets , get_order ( size ) ) ;
2019-01-08 12:30:00 +03:00
}
2010-10-04 06:15:44 +00:00
kfree ( nht ) ;
2005-04-16 15:20:36 -07:00
}
2010-10-04 06:15:44 +00:00
static struct neigh_hash_table * neigh_hash_grow ( struct neigh_table * tbl ,
2011-07-11 01:28:12 -07:00
unsigned long new_shift )
2005-04-16 15:20:36 -07:00
{
2010-10-04 06:15:44 +00:00
unsigned int i , hash ;
struct neigh_hash_table * new_nht , * old_nht ;
2005-04-16 15:20:36 -07:00
NEIGH_CACHE_STAT_INC ( tbl , hash_grows ) ;
2010-10-04 06:15:44 +00:00
old_nht = rcu_dereference_protected ( tbl - > nht ,
lockdep_is_held ( & tbl - > lock ) ) ;
2011-07-11 01:28:12 -07:00
new_nht = neigh_hash_alloc ( new_shift ) ;
2010-10-04 06:15:44 +00:00
if ( ! new_nht )
return old_nht ;
2005-04-16 15:20:36 -07:00
2011-07-11 01:28:12 -07:00
for ( i = 0 ; i < ( 1 < < old_nht - > hash_shift ) ; i + + ) {
2005-04-16 15:20:36 -07:00
struct neighbour * n , * next ;
2010-10-06 17:49:21 -07:00
for ( n = rcu_dereference_protected ( old_nht - > hash_buckets [ i ] ,
lockdep_is_held ( & tbl - > lock ) ) ;
2010-10-04 06:15:44 +00:00
n ! = NULL ;
n = next ) {
hash = tbl - > hash ( n - > primary_key , n - > dev ,
new_nht - > hash_rnd ) ;
2005-04-16 15:20:36 -07:00
2011-07-11 01:28:12 -07:00
hash > > = ( 32 - new_nht - > hash_shift ) ;
2010-10-06 17:49:21 -07:00
next = rcu_dereference_protected ( n - > next ,
lockdep_is_held ( & tbl - > lock ) ) ;
rcu_assign_pointer ( n - > next ,
rcu_dereference_protected (
new_nht - > hash_buckets [ hash ] ,
lockdep_is_held ( & tbl - > lock ) ) ) ;
rcu_assign_pointer ( new_nht - > hash_buckets [ hash ] , n ) ;
2005-04-16 15:20:36 -07:00
}
}
2010-10-04 06:15:44 +00:00
rcu_assign_pointer ( tbl - > nht , new_nht ) ;
call_rcu ( & old_nht - > rcu , neigh_hash_free_rcu ) ;
return new_nht ;
2005-04-16 15:20:36 -07:00
}
struct neighbour * neigh_lookup ( struct neigh_table * tbl , const void * pkey ,
struct net_device * dev )
{
struct neighbour * n ;
2007-02-09 23:24:36 +09:00
2005-04-16 15:20:36 -07:00
NEIGH_CACHE_STAT_INC ( tbl , lookups ) ;
2023-03-21 04:01:14 +00:00
rcu_read_lock ( ) ;
2015-03-03 17:10:44 -06:00
n = __neigh_lookup_noref ( tbl , pkey , dev ) ;
if ( n ) {
2017-06-30 13:07:55 +03:00
if ( ! refcount_inc_not_zero ( & n - > refcnt ) )
2015-03-03 17:10:44 -06:00
n = NULL ;
NEIGH_CACHE_STAT_INC ( tbl , hits ) ;
2005-04-16 15:20:36 -07:00
}
2010-10-06 17:49:21 -07:00
2023-03-21 04:01:14 +00:00
rcu_read_unlock ( ) ;
2005-04-16 15:20:36 -07:00
return n ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_lookup ) ;
2005-04-16 15:20:36 -07:00
2021-10-11 14:12:35 +02:00
/*
 * Allocate a neighbour entry for pkey/dev and link it into tbl's hash table.
 *
 * The entry comes from neigh_alloc().  It is then passed in turn through the
 * table constructor, the device's ndo_neigh_construct hook and the parms
 * neigh_setup hook, each only when it is set.  With tbl->lock write-held the
 * hash table is grown if needed and the target chain is searched; if an
 * entry with the same dev and key is already there, that entry is returned
 * and the freshly allocated one is released.
 *
 * exempt_from_gc: when false, the new entry is queued on tbl->gc_list, and
 *                 every path that releases the new entry decrements
 *                 tbl->gc_entries (presumably balancing an increment done
 *                 in neigh_alloc() -- confirm there).
 * want_ref:       when true, neigh_hold() is called on the returned entry.
 *
 * Returns the new or pre-existing entry, ERR_PTR(-ENOBUFS) if allocation
 * fails, ERR_PTR(error) for a negative result from one of the setup hooks,
 * or ERR_PTR(-EINVAL) if n->parms->dead is set.
 */
static struct neighbour *
___neigh_create ( struct neigh_table * tbl , const void * pkey ,
2021-10-11 14:12:37 +02:00
struct net_device * dev , u32 flags ,
2021-10-11 14:12:35 +02:00
bool exempt_from_gc , bool want_ref )
2005-04-16 15:20:36 -07:00
{
2021-10-11 14:12:35 +02:00
u32 hash_val , key_len = tbl - > key_len ;
struct neighbour * n1 , * rc , * n ;
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * nht ;
2021-10-11 14:12:35 +02:00
int error ;
2005-04-16 15:20:36 -07:00
2021-10-11 14:12:35 +02:00
n = neigh_alloc ( tbl , dev , flags , exempt_from_gc ) ;
2019-05-22 12:22:21 -07:00
/* The tracepoint fires before the NULL check, so it also sees failed allocations. */
trace_neigh_create ( tbl , dev , pkey , n , exempt_from_gc ) ;
2005-04-16 15:20:36 -07:00
if ( ! n ) {
rc = ERR_PTR ( - ENOBUFS ) ;
goto out ;
}
memcpy ( n - > primary_key , pkey , key_len ) ;
n - > dev = dev ;
2022-06-07 21:39:55 -07:00
netdev_hold ( dev , & n - > dev_tracker , GFP_ATOMIC ) ;
2005-04-16 15:20:36 -07:00
/* Protocol specific setup. */
if ( tbl - > constructor & & ( error = tbl - > constructor ( n ) ) < 0 ) {
rc = ERR_PTR ( error ) ;
goto out_neigh_release ;
}
2011-07-25 00:01:38 +00:00
/* Optional per-device construct hook supplied by the driver's netdev_ops. */
if ( dev - > netdev_ops - > ndo_neigh_construct ) {
2016-07-05 11:27:37 +02:00
error = dev - > netdev_ops - > ndo_neigh_construct ( dev , n ) ;
2011-07-25 00:01:38 +00:00
if ( error < 0 ) {
rc = ERR_PTR ( error ) ;
goto out_neigh_release ;
}
}
2011-12-19 15:04:41 -05:00
/* Device specific setup. */
if ( n - > parms - > neigh_setup & &
( error = n - > parms - > neigh_setup ( n ) ) < 0 ) {
rc = ERR_PTR ( error ) ;
goto out_neigh_release ;
}
2013-12-07 19:26:53 +01:00
/* Backdate 'confirmed' by twice BASE_REACHABLE_TIME relative to now. */
n - > confirmed = jiffies - ( NEIGH_VAR ( n - > parms , BASE_REACHABLE_TIME ) < < 1 ) ;
2005-04-16 15:20:36 -07:00
write_lock_bh ( & tbl - > lock ) ;
2010-10-04 06:15:44 +00:00
nht = rcu_dereference_protected ( tbl - > nht ,
lockdep_is_held ( & tbl - > lock ) ) ;
2005-04-16 15:20:36 -07:00
2011-07-11 01:28:12 -07:00
/* Double the table once there are more entries than buckets. */
if ( atomic_read ( & tbl - > entries ) > ( 1 < < nht - > hash_shift ) )
nht = neigh_hash_grow ( tbl , nht - > hash_shift + 1 ) ;
2005-04-16 15:20:36 -07:00
2018-01-14 04:18:50 -08:00
/* Bucket index is the top hash_shift bits of the 32-bit hash. */
hash_val = tbl - > hash ( n - > primary_key , dev , nht - > hash_rnd ) > > ( 32 - nht - > hash_shift ) ;
2005-04-16 15:20:36 -07:00
if ( n - > parms - > dead ) {
rc = ERR_PTR ( - EINVAL ) ;
goto out_tbl_unlock ;
}
2010-10-06 17:49:21 -07:00
/*
 * Search the chain for an entry with the same dev and key.  On a match
 * that entry becomes the result; control then falls through the cleanup
 * labels below, which unlock the table and release the new entry n.
 */
for ( n1 = rcu_dereference_protected ( nht - > hash_buckets [ hash_val ] ,
lockdep_is_held ( & tbl - > lock ) ) ;
n1 ! = NULL ;
n1 = rcu_dereference_protected ( n1 - > next ,
lockdep_is_held ( & tbl - > lock ) ) ) {
2018-01-14 04:18:50 -08:00
if ( dev = = n1 - > dev & & ! memcmp ( n1 - > primary_key , n - > primary_key , key_len ) ) {
2012-07-02 02:02:15 -07:00
if ( want_ref )
neigh_hold ( n1 ) ;
2005-04-16 15:20:36 -07:00
rc = n1 ;
goto out_tbl_unlock ;
}
}
/* No duplicate: clear the dead flag before linking n into the table. */
n - > dead = 0 ;
2018-12-11 18:57:25 -07:00
if ( ! exempt_from_gc )
2018-12-10 13:54:07 -08:00
list_add_tail ( & n - > gc_list , & n - > tbl - > gc_list ) ;
2021-10-11 14:12:38 +02:00
if ( n - > flags & NTF_MANAGED )
list_add_tail ( & n - > managed_list , & n - > tbl - > managed_list ) ;
2012-07-02 02:02:15 -07:00
if ( want_ref )
neigh_hold ( n ) ;
2010-10-06 17:49:21 -07:00
/* Insert at the chain head: set n->next first, then publish n in the bucket. */
rcu_assign_pointer ( n - > next ,
rcu_dereference_protected ( nht - > hash_buckets [ hash_val ] ,
lockdep_is_held ( & tbl - > lock ) ) ) ;
rcu_assign_pointer ( nht - > hash_buckets [ hash_val ] , n ) ;
2005-04-16 15:20:36 -07:00
write_unlock_bh ( & tbl - > lock ) ;
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is created \n " , n ) ;
2005-04-16 15:20:36 -07:00
rc = n ;
out :
return rc ;
/* Cleanup labels fall through: unlock the table, then release the new entry. */
out_tbl_unlock :
write_unlock_bh ( & tbl - > lock ) ;
out_neigh_release :
2019-05-01 18:08:34 -07:00
if ( ! exempt_from_gc )
atomic_dec ( & tbl - > gc_entries ) ;
2005-04-16 15:20:36 -07:00
neigh_release ( n ) ;
goto out ;
}
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago, gc kicks in and
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is overkill and contributes to thrashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
struct neighbour * __neigh_create ( struct neigh_table * tbl , const void * pkey ,
struct net_device * dev , bool want_ref )
{
2021-10-11 14:12:35 +02:00
return ___neigh_create ( tbl , pkey , dev , 0 , false , want_ref ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
}
2012-07-02 02:02:15 -07:00
EXPORT_SYMBOL ( __neigh_create ) ;
2005-04-16 15:20:36 -07:00
2017-09-23 23:03:04 +03:00
static u32 pneigh_hash ( const void * pkey , unsigned int key_len )
2008-03-24 14:48:59 -07:00
{
u32 hash_val = * ( u32 * ) ( pkey + key_len - 4 ) ;
hash_val ^ = ( hash_val > > 16 ) ;
hash_val ^ = hash_val > > 8 ;
hash_val ^ = hash_val > > 4 ;
hash_val & = PNEIGH_HASHMASK ;
2008-03-28 12:46:53 +09:00
return hash_val ;
}
2008-03-24 14:48:59 -07:00
2008-03-28 12:46:53 +09:00
static struct pneigh_entry * __pneigh_lookup_1 ( struct pneigh_entry * n ,
struct net * net ,
const void * pkey ,
2017-09-23 23:03:04 +03:00
unsigned int key_len ,
2008-03-28 12:46:53 +09:00
struct net_device * dev )
{
while ( n ) {
2008-03-24 14:48:59 -07:00
if ( ! memcmp ( n - > key , pkey , key_len ) & &
2008-03-28 12:46:53 +09:00
net_eq ( pneigh_net ( n ) , net ) & &
2008-03-24 14:48:59 -07:00
( n - > dev = = dev | | ! n - > dev ) )
2008-03-28 12:46:53 +09:00
return n ;
n = n - > next ;
2008-03-24 14:48:59 -07:00
}
2008-03-28 12:46:53 +09:00
return NULL ;
}
2008-03-24 14:48:59 -07:00
2008-03-28 12:46:53 +09:00
struct pneigh_entry * __pneigh_lookup ( struct neigh_table * tbl ,
struct net * net , const void * pkey , struct net_device * dev )
{
2017-09-23 23:03:04 +03:00
unsigned int key_len = tbl - > key_len ;
2008-03-28 12:46:53 +09:00
u32 hash_val = pneigh_hash ( pkey , key_len ) ;
return __pneigh_lookup_1 ( tbl - > phash_buckets [ hash_val ] ,
net , pkey , key_len , dev ) ;
2008-03-24 14:48:59 -07:00
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL_GPL ( __pneigh_lookup ) ;
2008-03-24 14:48:59 -07:00
2008-01-24 00:13:18 -08:00
struct pneigh_entry * pneigh_lookup ( struct neigh_table * tbl ,
struct net * net , const void * pkey ,
2005-04-16 15:20:36 -07:00
struct net_device * dev , int creat )
{
struct pneigh_entry * n ;
2017-09-23 23:03:04 +03:00
unsigned int key_len = tbl - > key_len ;
2008-03-28 12:46:53 +09:00
u32 hash_val = pneigh_hash ( pkey , key_len ) ;
2005-04-16 15:20:36 -07:00
read_lock_bh ( & tbl - > lock ) ;
2008-03-28 12:46:53 +09:00
n = __pneigh_lookup_1 ( tbl - > phash_buckets [ hash_val ] ,
net , pkey , key_len , dev ) ;
2005-04-16 15:20:36 -07:00
read_unlock_bh ( & tbl - > lock ) ;
2008-03-28 12:46:53 +09:00
if ( n | | ! creat )
2005-04-16 15:20:36 -07:00
goto out ;
2007-10-15 12:54:15 -07:00
ASSERT_RTNL ( ) ;
2021-12-06 08:53:29 -08:00
n = kzalloc ( sizeof ( * n ) + key_len , GFP_KERNEL ) ;
2005-04-16 15:20:36 -07:00
if ( ! n )
goto out ;
2015-03-11 23:04:08 -05:00
write_pnet ( & n - > net , net ) ;
2005-04-16 15:20:36 -07:00
memcpy ( n - > key , pkey , key_len ) ;
n - > dev = dev ;
2022-06-07 21:39:55 -07:00
netdev_hold ( dev , & n - > dev_tracker , GFP_KERNEL ) ;
2005-04-16 15:20:36 -07:00
if ( tbl - > pconstructor & & tbl - > pconstructor ( n ) ) {
2022-06-07 21:39:55 -07:00
netdev_put ( dev , & n - > dev_tracker ) ;
2005-04-16 15:20:36 -07:00
kfree ( n ) ;
n = NULL ;
goto out ;
}
write_lock_bh ( & tbl - > lock ) ;
n - > next = tbl - > phash_buckets [ hash_val ] ;
tbl - > phash_buckets [ hash_val ] = n ;
write_unlock_bh ( & tbl - > lock ) ;
out :
return n ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( pneigh_lookup ) ;
2005-04-16 15:20:36 -07:00
2008-01-24 00:13:18 -08:00
int pneigh_delete ( struct neigh_table * tbl , struct net * net , const void * pkey ,
2005-04-16 15:20:36 -07:00
struct net_device * dev )
{
struct pneigh_entry * n , * * np ;
2017-09-23 23:03:04 +03:00
unsigned int key_len = tbl - > key_len ;
2008-03-28 12:46:53 +09:00
u32 hash_val = pneigh_hash ( pkey , key_len ) ;
2005-04-16 15:20:36 -07:00
write_lock_bh ( & tbl - > lock ) ;
for ( np = & tbl - > phash_buckets [ hash_val ] ; ( n = * np ) ! = NULL ;
np = & n - > next ) {
2008-01-24 00:13:18 -08:00
if ( ! memcmp ( n - > key , pkey , key_len ) & & n - > dev = = dev & &
2008-03-26 03:57:35 +09:00
net_eq ( pneigh_net ( n ) , net ) ) {
2005-04-16 15:20:36 -07:00
* np = n - > next ;
write_unlock_bh ( & tbl - > lock ) ;
if ( tbl - > pdestructor )
tbl - > pdestructor ( n ) ;
2022-06-07 21:39:55 -07:00
netdev_put ( n - > dev , & n - > dev_tracker ) ;
2005-04-16 15:20:36 -07:00
kfree ( n ) ;
return 0 ;
}
}
write_unlock_bh ( & tbl - > lock ) ;
return - ENOENT ;
}
2018-04-12 10:46:55 +02:00
static int pneigh_ifdown_and_unlock ( struct neigh_table * tbl ,
struct net_device * dev )
2005-04-16 15:20:36 -07:00
{
2018-04-12 10:46:55 +02:00
struct pneigh_entry * n , * * np , * freelist = NULL ;
2005-04-16 15:20:36 -07:00
u32 h ;
for ( h = 0 ; h < = PNEIGH_HASHMASK ; h + + ) {
np = & tbl - > phash_buckets [ h ] ;
while ( ( n = * np ) ! = NULL ) {
if ( ! dev | | n - > dev = = dev ) {
* np = n - > next ;
2018-04-12 10:46:55 +02:00
n - > next = freelist ;
freelist = n ;
2005-04-16 15:20:36 -07:00
continue ;
}
np = & n - > next ;
}
}
2018-04-12 10:46:55 +02:00
write_unlock_bh ( & tbl - > lock ) ;
while ( ( n = freelist ) ) {
freelist = n - > next ;
n - > next = NULL ;
if ( tbl - > pdestructor )
tbl - > pdestructor ( n ) ;
2022-06-07 21:39:55 -07:00
netdev_put ( n - > dev , & n - > dev_tracker ) ;
2018-04-12 10:46:55 +02:00
kfree ( n ) ;
}
2005-04-16 15:20:36 -07:00
return - ENOENT ;
}
2008-01-24 00:30:58 -08:00
static void neigh_parms_destroy ( struct neigh_parms * parms ) ;
static inline void neigh_parms_put ( struct neigh_parms * parms )
{
2017-06-30 13:07:56 +03:00
if ( refcount_dec_and_test ( & parms - > refcnt ) )
2008-01-24 00:30:58 -08:00
neigh_parms_destroy ( parms ) ;
}
2005-04-16 15:20:36 -07:00
/*
* neighbour must already be out of the table ;
*
*/
void neigh_destroy ( struct neighbour * neigh )
{
2011-07-25 00:01:38 +00:00
struct net_device * dev = neigh - > dev ;
2005-04-16 15:20:36 -07:00
NEIGH_CACHE_STAT_INC ( neigh - > tbl , destroys ) ;
if ( ! neigh - > dead ) {
2012-05-16 19:58:40 +00:00
pr_warn ( " Destroying alive neighbour %p \n " , neigh ) ;
2005-04-16 15:20:36 -07:00
dump_stack ( ) ;
return ;
}
if ( neigh_del_timer ( neigh ) )
2012-05-16 19:58:40 +00:00
pr_warn ( " Impossible event \n " ) ;
2005-04-16 15:20:36 -07:00
2013-06-28 02:37:42 -07:00
write_lock_bh ( & neigh - > lock ) ;
__skb_queue_purge ( & neigh - > arp_queue ) ;
write_unlock_bh ( & neigh - > lock ) ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
neigh - > arp_queue_len_bytes = 0 ;
2005-04-16 15:20:36 -07:00
2011-12-19 15:04:41 -05:00
if ( dev - > netdev_ops - > ndo_neigh_destroy )
2016-07-05 11:27:37 +02:00
dev - > netdev_ops - > ndo_neigh_destroy ( dev , neigh ) ;
2011-12-19 15:04:41 -05:00
2022-06-07 21:39:55 -07:00
netdev_put ( dev , & neigh - > dev_tracker ) ;
2005-04-16 15:20:36 -07:00
neigh_parms_put ( neigh - > parms ) ;
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is destroyed \n " , neigh ) ;
2005-04-16 15:20:36 -07:00
atomic_dec ( & neigh - > tbl - > entries ) ;
2011-07-25 00:01:22 +00:00
kfree_rcu ( neigh , rcu ) ;
2005-04-16 15:20:36 -07:00
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_destroy ) ;
2005-04-16 15:20:36 -07:00
/* Neighbour state is suspicious;
disable fast path .
Called with write_locked neigh .
*/
static void neigh_suspect ( struct neighbour * neigh )
{
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is suspected \n " , neigh ) ;
2005-04-16 15:20:36 -07:00
2023-09-21 09:27:13 +00:00
WRITE_ONCE ( neigh - > output , neigh - > ops - > output ) ;
2005-04-16 15:20:36 -07:00
}
/* Neighbour state is OK;
enable fast path .
Called with write_locked neigh .
*/
static void neigh_connect ( struct neighbour * neigh )
{
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is connected \n " , neigh ) ;
2005-04-16 15:20:36 -07:00
2023-09-21 09:27:13 +00:00
WRITE_ONCE ( neigh - > output , neigh - > ops - > connected_output ) ;
2005-04-16 15:20:36 -07:00
}
2009-07-30 03:15:07 +00:00
/*
 * Periodic garbage-collection work item for one neighbour table.
 *
 * work is the work member embedded in tbl->gc_work; the owning table is
 * recovered from it with container_of().
 *
 * Each run, with tbl->lock write-held:
 *  - at most once per 300 * HZ jiffies, reachable_time is recomputed for
 *    every neigh_parms on tbl->parms_list;
 *  - if the table holds at least gc_thresh1 entries, every hash bucket is
 *    walked and entries whose refcount is exactly 1 and that are either
 *    NUD_FAILED or outside the GC_STALETIME window counted from n->used
 *    are unlinked, marked dead and released;
 *  - the work re-queues itself on system_power_efficient_wq with a delay
 *    of BASE_REACHABLE_TIME / 2.
 *
 * The table lock is dropped around cond_resched() between buckets, so the
 * hash table pointer is reloaded after each reacquisition.
 */
static void neigh_periodic_work ( struct work_struct * work )
2005-04-16 15:20:36 -07:00
{
2009-07-30 03:15:07 +00:00
struct neigh_table * tbl = container_of ( work , struct neigh_table , gc_work . work ) ;
2010-10-06 17:49:21 -07:00
struct neighbour * n ;
struct neighbour __rcu * * np ;
2009-07-30 03:15:07 +00:00
unsigned int i ;
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * nht ;
2005-04-16 15:20:36 -07:00
NEIGH_CACHE_STAT_INC ( tbl , periodic_gc_runs ) ;
2009-07-30 03:15:07 +00:00
write_lock_bh ( & tbl - > lock ) ;
2010-10-04 06:15:44 +00:00
nht = rcu_dereference_protected ( tbl - > nht ,
lockdep_is_held ( & tbl - > lock ) ) ;
2005-04-16 15:20:36 -07:00
/*
* periodically recompute ReachableTime from random function
*/
2009-07-30 03:15:07 +00:00
if ( time_after ( jiffies , tbl - > last_rand + 300 * HZ ) ) {
2005-04-16 15:20:36 -07:00
struct neigh_parms * p ;
2009-07-30 03:15:07 +00:00
tbl - > last_rand = jiffies ;
2014-10-29 19:29:31 +01:00
list_for_each_entry ( p , & tbl - > parms_list , list )
2005-04-16 15:20:36 -07:00
p - > reachable_time =
2013-12-07 19:26:53 +01:00
neigh_rand_reach_time ( NEIGH_VAR ( p , BASE_REACHABLE_TIME ) ) ;
2005-04-16 15:20:36 -07:00
}
/* Below gc_thresh1 entries the bucket scan is skipped entirely. */
2014-02-27 17:14:41 +08:00
if ( atomic_read ( & tbl - > entries ) < tbl - > gc_thresh1 )
goto out ;
2011-07-11 01:28:12 -07:00
for ( i = 0 ; i < ( 1 < < nht - > hash_shift ) ; i + + ) {
2010-10-04 06:15:44 +00:00
np = & nht - > hash_buckets [ i ] ;
2005-04-16 15:20:36 -07:00
2010-10-06 17:49:21 -07:00
while ( ( n = rcu_dereference_protected ( * np ,
lockdep_is_held ( & tbl - > lock ) ) ) ! = NULL ) {
2009-07-30 03:15:07 +00:00
unsigned int state ;
2005-04-16 15:20:36 -07:00
2009-07-30 03:15:07 +00:00
write_lock ( & n - > lock ) ;
2005-04-16 15:20:36 -07:00
2009-07-30 03:15:07 +00:00
state = n - > nud_state ;
/*
 * Entries in NUD_PERMANENT or any NUD_IN_TIMER state, and entries
 * flagged NTF_EXT_LEARNED, are never reaped here.
 */
2018-04-24 13:49:34 -07:00
if ( ( state & ( NUD_PERMANENT | NUD_IN_TIMER ) ) | |
( n - > flags & NTF_EXT_LEARNED ) ) {
2009-07-30 03:15:07 +00:00
write_unlock ( & n - > lock ) ;
goto next_elt ;
}
2005-04-16 15:20:36 -07:00
neigh: make sure used and confirmed times are valid
Entries can linger in cache without timer for days, thanks to
the gc_thresh1 limit. As result, without traffic, the confirmed
time can be outdated and to appear to be in the future. Later,
on traffic, NUD_STALE entries can switch to NUD_DELAY and start
the timer which can see the invalid confirmed time and wrongly
switch to NUD_REACHABLE state instead of NUD_PROBE. As result,
timer is set many days in the future. This is more visible on
32-bit platforms, with higher HZ value.
Why this is a problem? While we expect unused entries to expire,
such entries stay in REACHABLE state for too long, locked in
cache. They are not expired normally, only when cache is full.
Problem and the wrong state change reported by Zhang Changzhong:
172.16.1.18 dev bond0 lladdr 0a:0e:0f:01:12:01 ref 1 used 350521/15994171/350520 probes 4 REACHABLE
350520 seconds have elapsed since this entry was last updated, but it is
still in the REACHABLE state (base_reachable_time_ms is 30000),
preventing lladdr from being updated through probe.
Fix it by ensuring timer is started with valid used/confirmed
times. Considering the valid time range is LONG_MAX jiffies,
we try not to go too much in the past while we are in
DELAY/PROBE state. There are also places that need
used/updated times to be validated while timer is not running.
Reported-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
/*
 * Pull the used stamp forward to the confirmed stamp when confirmed is
 * the later of the two and does not lie after the current jiffies.
 */
2023-02-02 17:25:51 +02:00
if ( time_before ( n - > used , n - > confirmed ) & &
time_is_before_eq_jiffies ( n - > confirmed ) )
2009-07-30 03:15:07 +00:00
n - > used = n - > confirmed ;
2005-04-16 15:20:36 -07:00
/*
 * Reap only when the refcount is exactly 1 and the entry is either
 * NUD_FAILED or jiffies falls outside the GC_STALETIME window that
 * starts at n->used.
 */
2017-06-30 13:07:55 +03:00
if ( refcount_read ( & n - > refcnt ) = = 1 & &
2009-07-30 03:15:07 +00:00
( state = = NUD_FAILED | |
neigh: make sure used and confirmed times are valid
Entries can linger in cache without timer for days, thanks to
the gc_thresh1 limit. As result, without traffic, the confirmed
time can be outdated and to appear to be in the future. Later,
on traffic, NUD_STALE entries can switch to NUD_DELAY and start
the timer which can see the invalid confirmed time and wrongly
switch to NUD_REACHABLE state instead of NUD_PROBE. As result,
timer is set many days in the future. This is more visible on
32-bit platforms, with higher HZ value.
Why this is a problem? While we expect unused entries to expire,
such entries stay in REACHABLE state for too long, locked in
cache. They are not expired normally, only when cache is full.
Problem and the wrong state change reported by Zhang Changzhong:
172.16.1.18 dev bond0 lladdr 0a:0e:0f:01:12:01 ref 1 used 350521/15994171/350520 probes 4 REACHABLE
350520 seconds have elapsed since this entry was last updated, but it is
still in the REACHABLE state (base_reachable_time_ms is 30000),
preventing lladdr from being updated through probe.
Fix it by ensuring timer is started with valid used/confirmed
times. Considering the valid time range is LONG_MAX jiffies,
we try not to go too much in the past while we are in
DELAY/PROBE state. There are also places that need
used/updated times to be validated while timer is not running.
Reported-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-02-02 17:25:51 +02:00
! time_in_range_open ( jiffies , n - > used ,
n - > used + NEIGH_VAR ( n - > parms , GC_STALETIME ) ) ) ) {
/*
 * Unlink n from the bucket chain. np still addresses the same slot,
 * which now holds the successor of n, so the loop continues from there.
 */
2023-09-21 08:46:26 +00:00
rcu_assign_pointer ( * np ,
rcu_dereference_protected ( n - > next ,
lockdep_is_held ( & tbl - > lock ) ) ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago gc kicks in
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
neigh_mark_dead ( n ) ;
2009-07-30 03:15:07 +00:00
write_unlock ( & n - > lock ) ;
neigh_cleanup_and_release ( n ) ;
continue ;
}
2005-04-16 15:20:36 -07:00
write_unlock ( & n - > lock ) ;
/* Entry kept: advance the slot pointer past it. */
next_elt :
2009-07-30 03:15:07 +00:00
np = & n - > next ;
}
/*
* It ' s fine to release lock here , even if hash table
* grows while we are preempted .
*/
write_unlock_bh ( & tbl - > lock ) ;
cond_resched ( ) ;
write_lock_bh ( & tbl - > lock ) ;
/* The lock was dropped above, so reload the hash table pointer. */
2012-02-21 16:04:13 -05:00
nht = rcu_dereference_protected ( tbl - > nht ,
lockdep_is_held ( & tbl - > lock ) ) ;
2005-04-16 15:20:36 -07:00
}
2013-01-22 05:20:05 +00:00
out :
2013-12-07 19:26:53 +01:00
/* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
* ARP entry timeouts range from 1 / 2 BASE_REACHABLE_TIME to 3 / 2
* BASE_REACHABLE_TIME .
2005-04-16 15:20:36 -07:00
*/
2014-01-22 12:23:33 +05:30
queue_delayed_work ( system_power_efficient_wq , & tbl - > gc_work ,
2013-12-07 19:26:53 +01:00
NEIGH_VAR ( & tbl - > parms , BASE_REACHABLE_TIME ) > > 1 ) ;
2009-07-30 03:15:07 +00:00
write_unlock_bh ( & tbl - > lock ) ;
2005-04-16 15:20:36 -07:00
}
static __inline__ int neigh_max_probes ( struct neighbour * n )
{
struct neigh_parms * p = n - > parms ;
2015-03-19 22:41:46 +09:00
return NEIGH_VAR ( p , UCAST_PROBES ) + NEIGH_VAR ( p , APP_PROBES ) +
( n - > nud_state & NUD_PROBE ? NEIGH_VAR ( p , MCAST_REPROBES ) :
NEIGH_VAR ( p , MCAST_PROBES ) ) ;
2005-04-16 15:20:36 -07:00
}
2009-06-11 04:16:28 -07:00
static void neigh_invalidate ( struct neighbour * neigh )
2010-03-09 19:40:54 +00:00
__releases ( neigh - > lock )
__acquires ( neigh - > lock )
2009-06-11 04:16:28 -07:00
{
struct sk_buff * skb ;
NEIGH_CACHE_STAT_INC ( neigh - > tbl , res_failed ) ;
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is failed \n " , neigh ) ;
2009-06-11 04:16:28 -07:00
neigh - > updated = jiffies ;
/* It is very thin place. report_unreachable is very complicated
routine . Particularly , it can hit the same neighbour entry !
So that , we try to be accurate and avoid dead loop . - - ANK
*/
while ( neigh - > nud_state = = NUD_FAILED & &
( skb = __skb_dequeue ( & neigh - > arp_queue ) ) ! = NULL ) {
write_unlock ( & neigh - > lock ) ;
neigh - > ops - > error_report ( neigh , skb ) ;
write_lock ( & neigh - > lock ) ;
}
2013-06-28 02:37:42 -07:00
__skb_queue_purge ( & neigh - > arp_queue ) ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
neigh - > arp_queue_len_bytes = 0 ;
2009-06-11 04:16:28 -07:00
}
2011-08-09 08:15:58 +00:00
/*
 * Send one solicitation for @neigh.
 *
 * A clone of the newest packet on arp_queue (if any) is handed to the
 * optional ops->solicit() hook, the probe counter is incremented and the
 * clone is consumed.
 *
 * Releases neigh->lock (see the __releases annotation); the lock is
 * dropped before solicit() is invoked.
 */
static void neigh_probe(struct neighbour *neigh)
	__releases(neigh->lock)
{
	struct sk_buff *tail = skb_peek_tail(&neigh->arp_queue);
	/* keep skb alive even if arp_queue overflows */
	struct sk_buff *probe_skb = tail ? skb_clone(tail, GFP_ATOMIC) : NULL;

	write_unlock(&neigh->lock);

	if (neigh->ops->solicit)
		neigh->ops->solicit(neigh, probe_skb);

	atomic_inc(&neigh->probes);
	consume_skb(probe_skb);
}
2005-04-16 15:20:36 -07:00
/* Called when a timer expires for a neighbour entry. */
treewide: setup_timer() -> timer_setup()
This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.
Casting from unsigned long:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
setup_timer(&ptr->my_timer, my_callback, ptr);
and forced object casts:
void my_callback(struct something *ptr)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);
become:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
Direct function assignments:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
ptr->my_timer.function = my_callback;
have a temporary cast added, along with converting the args:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;
And finally, callbacks without a data assignment:
void my_callback(unsigned long data)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, 0);
have their argument renamed to verify they're unused during conversion:
void my_callback(struct timer_list *unused)
{
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
The conversion is done with the following Coccinelle script:
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup.cocci
@fix_address_of@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
_E->_timer@_stl.function = _callback;
|
_E->_timer@_stl.function = &_callback;
|
_E->_timer@_stl.function = (_cast_func)_callback;
|
_E->_timer@_stl.function = (_cast_func)&_callback;
|
_E._timer@_stl.function = _callback;
|
_E._timer@_stl.function = &_callback;
|
_E._timer@_stl.function = (_cast_func)_callback;
|
_E._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
static void neigh_timer_handler ( struct timer_list * t )
2005-04-16 15:20:36 -07:00
{
unsigned long now , next ;
treewide: setup_timer() -> timer_setup()
This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.
Casting from unsigned long:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
setup_timer(&ptr->my_timer, my_callback, ptr);
and forced object casts:
void my_callback(struct something *ptr)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);
become:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
Direct function assignments:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
ptr->my_timer.function = my_callback;
have a temporary cast added, along with converting the args:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;
And finally, callbacks without a data assignment:
void my_callback(unsigned long data)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, 0);
have their argument renamed to verify they're unused during conversion:
void my_callback(struct timer_list *unused)
{
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
The conversion is done with the following Coccinelle script:
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup.cocci
@fix_address_of@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
_E->_timer@_stl.function = _callback;
|
_E->_timer@_stl.function = &_callback;
|
_E->_timer@_stl.function = (_cast_func)_callback;
|
_E->_timer@_stl.function = (_cast_func)&_callback;
|
_E._timer@_stl.function = _callback;
|
_E._timer@_stl.function = &_callback;
|
_E._timer@_stl.function = (_cast_func)_callback;
|
_E._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
struct neighbour * neigh = from_timer ( neigh , t , timer ) ;
2012-04-15 05:58:06 +00:00
unsigned int state ;
2005-04-16 15:20:36 -07:00
int notify = 0 ;
write_lock ( & neigh - > lock ) ;
state = neigh - > nud_state ;
now = jiffies ;
next = now + HZ ;
2011-11-01 17:45:55 -04:00
if ( ! ( state & NUD_IN_TIMER ) )
2005-04-16 15:20:36 -07:00
goto out ;
if ( state & NUD_REACHABLE ) {
2007-02-09 23:24:36 +09:00
if ( time_before_eq ( now ,
2005-04-16 15:20:36 -07:00
neigh - > confirmed + neigh - > parms - > reachable_time ) ) {
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is still alive \n " , neigh ) ;
2005-04-16 15:20:36 -07:00
next = neigh - > confirmed + neigh - > parms - > reachable_time ;
} else if ( time_before_eq ( now ,
2013-12-07 19:26:53 +01:00
neigh - > used +
NEIGH_VAR ( neigh - > parms , DELAY_PROBE_TIME ) ) ) {
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is delayed \n " , neigh ) ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_DELAY ) ;
2006-03-20 16:52:52 -08:00
neigh - > updated = jiffies ;
2005-04-16 15:20:36 -07:00
neigh_suspect ( neigh ) ;
2013-12-07 19:26:53 +01:00
next = now + NEIGH_VAR ( neigh - > parms , DELAY_PROBE_TIME ) ;
2005-04-16 15:20:36 -07:00
} else {
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is suspected \n " , neigh ) ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_STALE ) ;
2006-03-20 16:52:52 -08:00
neigh - > updated = jiffies ;
2005-04-16 15:20:36 -07:00
neigh_suspect ( neigh ) ;
2006-07-30 20:43:36 -07:00
notify = 1 ;
2005-04-16 15:20:36 -07:00
}
} else if ( state & NUD_DELAY ) {
2007-02-09 23:24:36 +09:00
if ( time_before_eq ( now ,
2013-12-07 19:26:53 +01:00
neigh - > confirmed +
NEIGH_VAR ( neigh - > parms , DELAY_PROBE_TIME ) ) ) {
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is now reachable \n " , neigh ) ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_REACHABLE ) ;
2006-03-20 16:52:52 -08:00
neigh - > updated = jiffies ;
2005-04-16 15:20:36 -07:00
neigh_connect ( neigh ) ;
2006-07-30 20:43:36 -07:00
notify = 1 ;
2005-04-16 15:20:36 -07:00
next = neigh - > confirmed + neigh - > parms - > reachable_time ;
} else {
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is probed \n " , neigh ) ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_PROBE ) ;
2006-03-20 16:52:52 -08:00
neigh - > updated = jiffies ;
2005-04-16 15:20:36 -07:00
atomic_set ( & neigh - > probes , 0 ) ;
2015-05-18 19:44:41 +09:00
notify = 1 ;
2020-04-01 14:46:20 +08:00
next = now + max ( NEIGH_VAR ( neigh - > parms , RETRANS_TIME ) ,
HZ / 100 ) ;
2005-04-16 15:20:36 -07:00
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
2020-04-01 14:46:20 +08:00
next = now + max ( NEIGH_VAR ( neigh - > parms , RETRANS_TIME ) , HZ / 100 ) ;
2005-04-16 15:20:36 -07:00
}
if ( ( neigh - > nud_state & ( NUD_INCOMPLETE | NUD_PROBE ) ) & &
atomic_read ( & neigh - > probes ) > = neigh_max_probes ( neigh ) ) {
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_FAILED ) ;
2005-04-16 15:20:36 -07:00
notify = 1 ;
2009-06-11 04:16:28 -07:00
neigh_invalidate ( neigh ) ;
2014-02-27 17:03:03 +08:00
goto out ;
2005-04-16 15:20:36 -07:00
}
if ( neigh - > nud_state & NUD_IN_TIMER ) {
2020-05-28 15:15:13 +08:00
if ( time_before ( next , jiffies + HZ / 100 ) )
next = jiffies + HZ / 100 ;
2005-10-23 16:37:48 +10:00
if ( ! mod_timer ( & neigh - > timer , next ) )
neigh_hold ( neigh ) ;
2005-04-16 15:20:36 -07:00
}
if ( neigh - > nud_state & ( NUD_INCOMPLETE | NUD_PROBE ) ) {
2011-08-09 08:15:58 +00:00
neigh_probe ( neigh ) ;
2008-02-17 18:39:54 -08:00
} else {
2008-02-11 21:45:44 -08:00
out :
2008-02-17 18:39:54 -08:00
write_unlock ( & neigh - > lock ) ;
}
2007-08-08 23:12:56 -07:00
2006-07-30 20:43:36 -07:00
if ( notify )
2017-03-19 22:01:28 -07:00
neigh_update_notify ( neigh , 0 ) ;
2005-04-16 15:20:36 -07:00
2019-02-14 09:15:11 -08:00
trace_neigh_timer_handler ( neigh , 0 ) ;
2005-04-16 15:20:36 -07:00
neigh_release ( neigh ) ;
}
2022-02-01 20:39:42 +01:00
/*
 * Start or continue resolution of @neigh on behalf of a sender and,
 * while the entry is NUD_INCOMPLETE, park @skb on neigh->arp_queue.
 *
 * @neigh:        entry to act on; neigh->lock is taken for writing with
 *                bottom halves disabled for the duration of the call.
 * @skb:          packet waiting for resolution; may be NULL (it is only
 *                NULL-checked on the queueing path).
 * @immediate_ok: when a fresh resolution is started, true requests a call
 *                to neigh_probe() before returning; false instead arms the
 *                timer one jiffy out.
 *
 * Returns 0 when the state is in NUD_CONNECTED | NUD_DELAY | NUD_PROBE,
 * when a NUD_STALE entry was moved to NUD_DELAY, or when the entry is dead
 * but NUD_STALE; @skb is not touched in those cases.
 * Returns 1 when the entry ends up NUD_INCOMPLETE (@skb, if any, has been
 * queued) or when @skb was freed because the entry is dead or was moved to
 * NUD_FAILED.
 */
int __neigh_event_send ( struct neighbour * neigh , struct sk_buff * skb ,
const bool immediate_ok )
2005-04-16 15:20:36 -07:00
{
int rc ;
2011-08-09 08:15:58 +00:00
bool immediate_probe = false ;
2005-04-16 15:20:36 -07:00
write_lock_bh ( & neigh - > lock ) ;
rc = 0 ;
/* Nothing to start in these states ; leave with rc == 0 . */
if ( neigh - > nud_state & ( NUD_CONNECTED | NUD_DELAY | NUD_PROBE ) )
goto out_unlock_bh ;
2015-06-16 22:56:39 +03:00
if ( neigh - > dead )
goto out_dead ;
2005-04-16 15:20:36 -07:00
if ( ! ( neigh - > nud_state & ( NUD_STALE | NUD_INCOMPLETE ) ) ) {
2013-12-07 19:26:53 +01:00
/* A resolution is only started when MCAST_PROBES + APP_PROBES is non-zero . */
if ( NEIGH_VAR ( neigh - > parms , MCAST_PROBES ) +
NEIGH_VAR ( neigh - > parms , APP_PROBES ) ) {
2011-08-09 08:15:58 +00:00
unsigned long next , now = jiffies ;
2013-12-07 19:26:53 +01:00
atomic_set ( & neigh - > probes ,
NEIGH_VAR ( neigh - > parms , UCAST_PROBES ) ) ;
2019-07-14 23:36:11 +02:00
neigh_del_timer ( neigh ) ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_INCOMPLETE ) ;
2011-08-09 08:15:58 +00:00
neigh - > updated = now ;
2022-02-01 20:39:42 +01:00
/*
 * Without immediate_ok the timer is armed one jiffy out . Otherwise
 * neigh_probe ( ) is called at out_unlock_bh and the timer is armed
 * max ( RETRANS_TIME , HZ / 100 ) out .
 */
if ( ! immediate_ok ) {
next = now + 1 ;
} else {
immediate_probe = true ;
next = now + max ( NEIGH_VAR ( neigh - > parms ,
RETRANS_TIME ) ,
HZ / 100 ) ;
}
2011-08-09 08:15:58 +00:00
neigh_add_timer ( neigh , next ) ;
2005-04-16 15:20:36 -07:00
} else {
/* No probes configured : mark the entry failed and drop the packet . */
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_FAILED ) ;
2006-03-20 16:52:52 -08:00
neigh - > updated = jiffies ;
2005-04-16 15:20:36 -07:00
write_unlock_bh ( & neigh - > lock ) ;
2022-02-26 12:18:30 +08:00
kfree_skb_reason ( skb , SKB_DROP_REASON_NEIGH_FAILED ) ;
2005-04-16 15:20:36 -07:00
return 1 ;
}
} else if ( neigh - > nud_state & NUD_STALE ) {
2013-04-15 15:17:19 +00:00
neigh_dbg ( 2 , " neigh %p is delayed \n " , neigh ) ;
2019-07-14 23:36:11 +02:00
neigh_del_timer ( neigh ) ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_DELAY ) ;
2006-03-20 16:52:52 -08:00
neigh - > updated = jiffies ;
2013-12-07 19:26:53 +01:00
neigh_add_timer ( neigh , jiffies +
NEIGH_VAR ( neigh - > parms , DELAY_PROBE_TIME ) ) ;
2005-04-16 15:20:36 -07:00
}
if ( neigh - > nud_state = = NUD_INCOMPLETE ) {
if ( skb ) {
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
/*
 * Make room for @skb : drop packets from the head of arp_queue until
 * @skb fits within QUEUE_LEN_BYTES or the queue is empty .
 */
while ( neigh - > arp_queue_len_bytes + skb - > truesize >
2013-12-07 19:26:53 +01:00
NEIGH_VAR ( neigh - > parms , QUEUE_LEN_BYTES ) ) {
2005-04-16 15:20:36 -07:00
struct sk_buff * buff ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
2008-09-23 01:11:18 -07:00
buff = __skb_dequeue ( & neigh - > arp_queue ) ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
if ( ! buff )
break ;
neigh - > arp_queue_len_bytes - = buff - > truesize ;
2022-02-26 12:18:30 +08:00
kfree_skb_reason ( buff , SKB_DROP_REASON_NEIGH_QUEUEFULL ) ;
2008-07-16 20:50:49 -07:00
NEIGH_CACHE_STAT_INC ( neigh - > tbl , unres_discards ) ;
2005-04-16 15:20:36 -07:00
}
2010-05-27 16:09:39 -07:00
skb_dst_force ( skb ) ;
2005-04-16 15:20:36 -07:00
__skb_queue_tail ( & neigh - > arp_queue , skb ) ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
neigh - > arp_queue_len_bytes + = skb - > truesize ;
2005-04-16 15:20:36 -07:00
}
rc = 1 ;
}
out_unlock_bh :
2011-08-09 08:15:58 +00:00
/*
 * NOTE(review): on the immediate_probe path no write_unlock ( ) follows
 * neigh_probe ( ) , so neigh_probe ( ) presumably releases neigh->lock
 * itself - confirm against its definition . Bottom halves are re-enabled
 * here on both paths .
 */
if ( immediate_probe )
neigh_probe ( neigh ) ;
else
write_unlock ( & neigh - > lock ) ;
local_bh_enable ( ) ;
2019-02-14 09:15:11 -08:00
trace_neigh_event_send_done ( neigh , rc ) ;
2005-04-16 15:20:36 -07:00
return rc ;
2015-06-16 22:56:39 +03:00
out_dead :
/* A dead entry that is NUD_STALE is left alone ( rc == 0 ) ; otherwise @skb is dropped . */
if ( neigh - > nud_state & NUD_STALE )
goto out_unlock_bh ;
write_unlock_bh ( & neigh - > lock ) ;
2022-02-26 12:18:30 +08:00
kfree_skb_reason ( skb , SKB_DROP_REASON_NEIGH_DEAD ) ;
2019-02-14 09:15:11 -08:00
trace_neigh_event_send_dead ( neigh , 1 ) ;
2015-06-16 22:56:39 +03:00
return 1 ;
2005-04-16 15:20:36 -07:00
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( __neigh_event_send ) ;
2005-04-16 15:20:36 -07:00
2011-07-14 07:53:20 -07:00
/*
 * Refresh the hardware-header cache embedded in @neigh ( neigh->hh ) .
 *
 * If the device provides header_ops with a cache_update callback and the
 * cached header is populated ( non-zero hh_len ) , the callback is invoked
 * with the entry's device and neigh->ha while holding hh->hh_lock as a
 * seqlock writer with bottom halves disabled . Otherwise nothing is done .
 */
static void neigh_update_hhs ( struct neighbour * neigh )
2005-04-16 15:20:36 -07:00
{
struct hh_cache * hh ;
2007-10-09 01:40:57 -07:00
void ( * update ) ( struct hh_cache * , const struct net_device * , const unsigned char * )
2010-07-14 18:02:16 -07:00
= NULL ;
if ( neigh - > dev - > header_ops )
update = neigh - > dev - > header_ops - > cache_update ;
2005-04-16 15:20:36 -07:00
if ( update ) {
2011-07-14 07:53:20 -07:00
hh = & neigh - > hh ;
2019-11-07 18:29:11 -08:00
/* hh_len is sampled with READ_ONCE ( ) before the seqlock is taken ; zero means there is nothing cached to update . */
if ( READ_ONCE ( hh - > hh_len ) ) {
2006-12-07 15:08:17 -08:00
write_seqlock_bh ( & hh - > hh_lock ) ;
2005-04-16 15:20:36 -07:00
update ( hh , neigh - > dev , neigh - > ha ) ;
2006-12-07 15:08:17 -08:00
write_sequnlock_bh ( & hh - > hh_lock ) ;
2005-04-16 15:20:36 -07:00
}
}
}
/* Generic update routine.
- - lladdr is new lladdr or NULL , if it is not supplied .
- - new is new state .
- - flags
NEIGH_UPDATE_F_OVERRIDE allows overriding the existing lladdr ,
if it is different .
NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing " connected "
2007-02-09 23:24:36 +09:00
lladdr instead of overriding it
2005-04-16 15:20:36 -07:00
if it is different .
NEIGH_UPDATE_F_ADMIN means that the change is administrative .
net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
Currently, it is not possible to migrate a neighbor entry between NUD_PERMANENT
state and NTF_USE flag with a dynamic NUD state from a user space control plane.
Similarly, it is not possible to add/remove NTF_EXT_LEARNED flag from an existing
neighbor entry in combination with NTF_USE flag.
This is due to the latter directly calling into neigh_event_send() without any
meta data updates as happening in __neigh_update(). Thus, to enable this use
case, extend the latter with a NEIGH_UPDATE_F_USE flag where we break the
NUD_PERMANENT state in particular so that a later neigh_event_send() is able
to re-resolve a neighbor entry.
Before fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
As can be seen, despite the admin-triggered replace, the entry remains in the
NUD_PERMANENT state.
After fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
After the fix, the admin-triggered replace switches to a dynamic state from
the NTF_USE flag which triggered a new neighbor resolution. Likewise, we can
transition back from there, if needed, into NUD_PERMANENT.
Similar before/after behavior can be observed for below transitions:
Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[..]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-11 14:12:36 +02:00
NEIGH_UPDATE_F_USE means that the entry is user triggered .
2021-10-11 14:12:38 +02:00
NEIGH_UPDATE_F_MANAGED means that the entry will be auto - refreshed .
2007-02-09 23:24:36 +09:00
NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows overriding the existing
2005-04-16 15:20:36 -07:00
NTF_ROUTER flag .
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
a router .
Caller MUST hold reference count on the entry .
*/
2018-12-05 20:02:29 -08:00
/*
 * __neigh_update - apply a state and/or link-layer address update to one
 * neighbour entry.
 *
 * neigh:     entry to update.
 * lladdr:    proposed link-layer address, or NULL to keep the cached one.
 * new:       requested NUD state.
 * flags:     NEIGH_UPDATE_F_* modifiers.
 * nlmsg_pid: passed through to the trace point and to neigh_update_notify().
 * extack:    optional sink for error text; neigh_update() passes NULL.
 *
 * The decision logic runs under write_lock_bh(&neigh->lock). The lock is
 * dropped temporarily while packets parked on arp_queue are handed to the
 * output hook, and it is released for good before the gc list, the managed
 * list and the notification are handled after the out label.
 *
 * With NEIGH_UPDATE_F_USE or NEIGH_UPDATE_F_MANAGED set, the only state
 * change is that NUD_PERMANENT is cleared from the current state.
 *
 * Returns 0 when the update was applied or deliberately ignored, -EPERM
 * when the entry is dead or a non-admin caller targets a NUD_NOARP or
 * NUD_PERMANENT entry, and -EINVAL when no address was supplied and the
 * entry holds no valid one.
 */
static int __neigh_update ( struct neighbour * neigh , const u8 * lladdr ,
u8 new , u32 flags , u32 nlmsg_pid ,
struct netlink_ext_ack * extack )
2005-04-16 15:20:36 -07:00
{
2021-10-11 14:12:38 +02:00
bool gc_update = false , managed_update = false ;
2005-04-16 15:20:36 -07:00
int update_isrouter = 0 ;
2021-10-11 14:12:38 +02:00
struct net_device * dev ;
int err , notify = 0 ;
u8 old ;
2005-04-16 15:20:36 -07:00
2019-02-14 09:15:11 -08:00
trace_neigh_update ( neigh , lladdr , new , flags , nlmsg_pid ) ;
2005-04-16 15:20:36 -07:00
/* State is read and changed under neigh->lock. */
write_lock_bh ( & neigh - > lock ) ;
dev = neigh - > dev ;
old = neigh - > nud_state ;
/* -EPERM is the result of the two early rejections below. */
err = - EPERM ;
2018-12-05 20:02:29 -08:00
if ( neigh - > dead ) {
NL_SET_ERR_MSG ( extack , " Neighbor entry is now dead " ) ;
neighbour: Prevent a dead entry from updating gc_list
Following race condition was detected:
<CPU A, t0> - neigh_flush_dev() is under execution and calls
neigh_mark_dead(n) marking the neighbour entry 'n' as dead.
<CPU B, t1> - Executing: __netif_receive_skb() ->
__netif_receive_skb_core() -> arp_rcv() -> arp_process().arp_process()
calls __neigh_lookup() which takes a reference on neighbour entry 'n'.
<CPU A, t2> - Moves further along neigh_flush_dev() and calls
neigh_cleanup_and_release(n), but since reference count increased in t2,
'n' couldn't be destroyed.
<CPU B, t3> - Moves further along, arp_process() and calls
neigh_update()-> __neigh_update() -> neigh_update_gc_list(), which adds
the neighbour entry back in gc_list(neigh_mark_dead(), removed it
earlier in t0 from gc_list)
<CPU B, t4> - arp_process() finally calls neigh_release(n), destroying
the neighbour entry.
This leads to 'n' still being part of gc_list, but the actual
neighbour structure has been freed.
The situation can be prevented from happening if we disallow a dead
entry to have any possibility of updating gc_list. This is what the
patch intends to achieve.
Fixes: 9c29a2f55ec0 ("neighbor: Fix locking order for gc_list changes")
Signed-off-by: Chinmay Agarwal <chinagar@codeaurora.org>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20210127165453.GA20514@chinagar-linux.qualcomm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-27 22:24:54 +05:30
new = old ;
2015-06-16 22:56:39 +03:00
goto out ;
2018-12-05 20:02:29 -08:00
}
neighbour: Prevent a dead entry from updating gc_list
Following race condition was detected:
<CPU A, t0> - neigh_flush_dev() is under execution and calls
neigh_mark_dead(n) marking the neighbour entry 'n' as dead.
<CPU B, t1> - Executing: __netif_receive_skb() ->
__netif_receive_skb_core() -> arp_rcv() -> arp_process().arp_process()
calls __neigh_lookup() which takes a reference on neighbour entry 'n'.
<CPU A, t2> - Moves further along neigh_flush_dev() and calls
neigh_cleanup_and_release(n), but since reference count increased in t2,
'n' couldn't be destroyed.
<CPU B, t3> - Moves further along, arp_process() and calls
neigh_update()-> __neigh_update() -> neigh_update_gc_list(), which adds
the neighbour entry back in gc_list(neigh_mark_dead(), removed it
earlier in t0 from gc_list)
<CPU B, t4> - arp_process() finally calls neigh_release(n), destroying
the neighbour entry.
This leads to 'n' still being part of gc_list, but the actual
neighbour structure has been freed.
The situation can be prevented from happening if we disallow a dead
entry to have any possibility of updating gc_list. This is what the
patch intends to achieve.
Fixes: 9c29a2f55ec0 ("neighbor: Fix locking order for gc_list changes")
Signed-off-by: Chinmay Agarwal <chinagar@codeaurora.org>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20210127165453.GA20514@chinagar-linux.qualcomm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-27 22:24:54 +05:30
/* Without NEIGH_UPDATE_F_ADMIN, NOARP and PERMANENT entries are left alone. */
if ( ! ( flags & NEIGH_UPDATE_F_ADMIN ) & &
( old & ( NUD_NOARP | NUD_PERMANENT ) ) )
goto out ;
2005-04-16 15:20:36 -07:00
2021-10-11 14:12:38 +02:00
neigh_update_flags ( neigh , flags , & notify , & gc_update , & managed_update ) ;
if ( flags & ( NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED ) ) {
net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
Currently, it is not possible to migrate a neighbor entry between NUD_PERMANENT
state and NTF_USE flag with a dynamic NUD state from a user space control plane.
Similarly, it is not possible to add/remove NTF_EXT_LEARNED flag from an existing
neighbor entry in combination with NTF_USE flag.
This is due to the latter directly calling into neigh_event_send() without any
meta data updates as happening in __neigh_update(). Thus, to enable this use
case, extend the latter with a NEIGH_UPDATE_F_USE flag where we break the
NUD_PERMANENT state in particular so that a latter neigh_event_send() is able
to re-resolve a neighbor entry.
Before fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
As can be seen, despite the admin-triggered replace, the entry remains in the
NUD_PERMANENT state.
After fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
After the fix, the admin-triggered replace switches to a dynamic state from
the NTF_USE flag which triggered a new neighbor resolution. Likewise, we can
transition back from there, if needed, into NUD_PERMANENT.
Similar before/after behavior can be observed for below transitions:
Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[..]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-11 14:12:36 +02:00
new = old & ~ NUD_PERMANENT ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , new ) ;
net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
Currently, it is not possible to migrate a neighbor entry between NUD_PERMANENT
state and NTF_USE flag with a dynamic NUD state from a user space control plane.
Similarly, it is not possible to add/remove NTF_EXT_LEARNED flag from an existing
neighbor entry in combination with NTF_USE flag.
This is due to the latter directly calling into neigh_event_send() without any
meta data updates as happening in __neigh_update(). Thus, to enable this use
case, extend the latter with a NEIGH_UPDATE_F_USE flag where we break the
NUD_PERMANENT state in particular so that a latter neigh_event_send() is able
to re-resolve a neighbor entry.
Before fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
As can be seen, despite the admin-triggered replace, the entry remains in the
NUD_PERMANENT state.
After fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
After the fix, the admin-triggered replace switches to a dynamic state from
the NTF_USE flag which triggered a new neighbor resolution. Likewise, we can
transition back from there, if needed, into NUD_PERMANENT.
Similar before/after behavior can be observed for below transitions:
Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[..]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-11 14:12:36 +02:00
err = 0 ;
goto out ;
}
2018-04-24 13:49:34 -07:00
2005-04-16 15:20:36 -07:00
/* Requested state is not in NUD_VALID: no address handling is needed. */
if ( ! ( new & NUD_VALID ) ) {
neigh_del_timer ( neigh ) ;
if ( old & NUD_CONNECTED )
neigh_suspect ( neigh ) ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , new ) ;
2005-04-16 15:20:36 -07:00
err = 0 ;
notify = old & NUD_VALID ;
2018-10-20 18:09:31 -07:00
if ( ( old & ( NUD_INCOMPLETE | NUD_PROBE ) ) & &
2009-06-11 04:16:28 -07:00
( new & NUD_FAILED ) ) {
neigh_invalidate ( neigh ) ;
notify = 1 ;
}
2005-04-16 15:20:36 -07:00
goto out ;
}
/* Compare new lladdr with cached one */
if ( ! dev - > addr_len ) {
/* First case: device needs no address. */
lladdr = neigh - > ha ;
} else if ( lladdr ) {
/* The second case: if something is already cached
and a new address is proposed :
- compare new & old
- if they are different , check override flag
*/
2007-02-09 23:24:36 +09:00
if ( ( old & NUD_VALID ) & &
2005-04-16 15:20:36 -07:00
! memcmp ( lladdr , neigh - > ha , dev - > addr_len ) )
lladdr = neigh - > ha ;
} else {
/* No address is supplied; if we know something,
use it , otherwise discard the request .
*/
err = - EINVAL ;
2018-12-05 20:02:29 -08:00
if ( ! ( old & NUD_VALID ) ) {
NL_SET_ERR_MSG ( extack , " No link layer address given " ) ;
2005-04-16 15:20:36 -07:00
goto out ;
2018-12-05 20:02:29 -08:00
}
2005-04-16 15:20:36 -07:00
lladdr = neigh - > ha ;
}
2018-09-13 11:12:03 -07:00
/* Update confirmed timestamp for neighbour entry after we
* received ARP packet even if it doesn ' t change IP to MAC binding .
*/
if ( new & NUD_CONNECTED )
neigh - > confirmed = jiffies ;
2005-04-16 15:20:36 -07:00
/* If entry was valid and address is not changed,
do not change entry state , if new one is STALE .
*/
err = 0 ;
update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER ;
if ( old & NUD_VALID ) {
/* A different address was offered without NEIGH_UPDATE_F_OVERRIDE. */
if ( lladdr ! = neigh - > ha & & ! ( flags & NEIGH_UPDATE_F_OVERRIDE ) ) {
update_isrouter = 0 ;
if ( ( flags & NEIGH_UPDATE_F_WEAK_OVERRIDE ) & &
( old & NUD_CONNECTED ) ) {
lladdr = neigh - > ha ;
new = NUD_STALE ;
} else
goto out ;
} else {
2016-07-27 09:56:50 +03:00
if ( lladdr = = neigh - > ha & & new = = NUD_STALE & &
! ( flags & NEIGH_UPDATE_F_ADMIN ) )
2005-04-16 15:20:36 -07:00
new = old ;
}
}
2018-09-13 11:12:03 -07:00
/* Update timestamp only once we know we will make a change to the
neighbour: update neigh timestamps iff update is effective
It's a common practice to send gratuitous ARPs after moving an
IP address to another device to speed up healing of a service. To
fulfill service availability constraints, the timing of network peers
updating their caches to point to a new location of an IP address can be
particularly important.
Sometimes neigh_update calls won't touch neither lladdr nor state, for
example if an update arrives in locktime interval. The neigh->updated
value is tested by the protocol specific neigh code, which in turn
will influence whether NEIGH_UPDATE_F_OVERRIDE gets set in the
call to neigh_update() or not. As a result, we may effectively ignore
the update request, bailing out of touching the neigh entry, except that
we still bump its timestamps inside neigh_update.
This may be a problem for updates arriving in quick succession. For
example, consider the following scenario:
A service is moved to another device with its IP address. The new device
sends three gratuitous ARP requests into the network with ~1 seconds
interval between them. Just before the first request arrives to one of
network peer nodes, its neigh entry for the IP address transitions from
STALE to DELAY. This transition, among other things, updates
neigh->updated. Once the kernel receives the first gratuitous ARP, it
ignores it because its arrival time is inside the locktime interval. The
kernel still bumps neigh->updated. Then the second gratuitous ARP
request arrives, and it's also ignored because it's still in the (new)
locktime interval. Same happens for the third request. The node
eventually heals itself (after delay_first_probe_time seconds since the
initial transition to DELAY state), but it just wasted some time and
require a new ARP request/reply round trip. This unfortunate behaviour
both puts more load on the network, as well as reduces service
availability.
This patch changes neigh_update so that it bumps neigh->updated (as well
as neigh->confirmed) only once we are sure that either lladdr or entry
state will change). In the scenario described above, it means that the
second gratuitous ARP request will actually update the entry lladdr.
Ideally, we would update the neigh entry on the very first gratuitous
ARP request. The locktime mechanism is designed to ignore ARP updates in
a short timeframe after a previous ARP update was honoured by the kernel
layer. This would require tracking timestamps for state transitions
separately from timestamps when actual updates are received. This would
probably involve changes in neighbour struct. Therefore, the patch
doesn't tackle the issue of the first gratuitous APR ignored, leaving
it for a follow-up.
Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-16 08:44:24 -07:00
* neighbour entry . Otherwise we risk to move the locktime window with
* noop updates and ignore relevant ARP updates .
*/
2018-09-13 11:12:03 -07:00
if ( new ! = old | | lladdr ! = neigh - > ha )
neighbour: update neigh timestamps iff update is effective
It's a common practice to send gratuitous ARPs after moving an
IP address to another device to speed up healing of a service. To
fulfill service availability constraints, the timing of network peers
updating their caches to point to a new location of an IP address can be
particularly important.
Sometimes neigh_update calls won't touch neither lladdr nor state, for
example if an update arrives in locktime interval. The neigh->updated
value is tested by the protocol specific neigh code, which in turn
will influence whether NEIGH_UPDATE_F_OVERRIDE gets set in the
call to neigh_update() or not. As a result, we may effectively ignore
the update request, bailing out of touching the neigh entry, except that
we still bump its timestamps inside neigh_update.
This may be a problem for updates arriving in quick succession. For
example, consider the following scenario:
A service is moved to another device with its IP address. The new device
sends three gratuitous ARP requests into the network with ~1 seconds
interval between them. Just before the first request arrives to one of
network peer nodes, its neigh entry for the IP address transitions from
STALE to DELAY. This transition, among other things, updates
neigh->updated. Once the kernel receives the first gratuitous ARP, it
ignores it because its arrival time is inside the locktime interval. The
kernel still bumps neigh->updated. Then the second gratuitous ARP
request arrives, and it's also ignored because it's still in the (new)
locktime interval. Same happens for the third request. The node
eventually heals itself (after delay_first_probe_time seconds since the
initial transition to DELAY state), but it just wasted some time and
require a new ARP request/reply round trip. This unfortunate behaviour
both puts more load on the network, as well as reduces service
availability.
This patch changes neigh_update so that it bumps neigh->updated (as well
as neigh->confirmed) only once we are sure that either lladdr or entry
state will change). In the scenario described above, it means that the
second gratuitous ARP request will actually update the entry lladdr.
Ideally, we would update the neigh entry on the very first gratuitous
ARP request. The locktime mechanism is designed to ignore ARP updates in
a short timeframe after a previous ARP update was honoured by the kernel
layer. This would require tracking timestamps for state transitions
separately from timestamps when actual updates are received. This would
probably involve changes in neighbour struct. Therefore, the patch
doesn't tackle the issue of the first gratuitous APR ignored, leaving
it for a follow-up.
Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-16 08:44:24 -07:00
neigh - > updated = jiffies ;
2005-04-16 15:20:36 -07:00
if ( new ! = old ) {
neigh_del_timer ( neigh ) ;
2015-05-18 19:44:41 +09:00
if ( new & NUD_PROBE )
atomic_set ( & neigh - > probes , 0 ) ;
2007-12-20 15:49:05 -08:00
if ( new & NUD_IN_TIMER )
2007-02-09 23:24:36 +09:00
neigh_add_timer ( neigh , ( jiffies +
( ( new & NUD_REACHABLE ) ?
2005-09-27 12:07:44 -07:00
neigh - > parms - > reachable_time :
0 ) ) ) ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , new ) ;
2013-12-15 13:39:56 -08:00
notify = 1 ;
2005-04-16 15:20:36 -07:00
}
if ( lladdr ! = neigh - > ha ) {
2010-10-07 10:44:07 +00:00
write_seqlock ( & neigh - > ha_lock ) ;
2005-04-16 15:20:36 -07:00
memcpy ( & neigh - > ha , lladdr , dev - > addr_len ) ;
2010-10-07 10:44:07 +00:00
write_sequnlock ( & neigh - > ha_lock ) ;
2005-04-16 15:20:36 -07:00
neigh_update_hhs ( neigh ) ;
if ( ! ( new & NUD_CONNECTED ) )
neigh - > confirmed = jiffies -
2013-12-07 19:26:53 +01:00
( NEIGH_VAR ( neigh - > parms , BASE_REACHABLE_TIME ) < < 1 ) ;
2005-04-16 15:20:36 -07:00
notify = 1 ;
}
if ( new = = old )
goto out ;
if ( new & NUD_CONNECTED )
neigh_connect ( neigh ) ;
else
neigh_suspect ( neigh ) ;
if ( ! ( old & NUD_VALID ) ) {
struct sk_buff * skb ;
/* Again: avoid dead loop if something went wrong */
while ( neigh - > nud_state & NUD_VALID & &
( skb = __skb_dequeue ( & neigh - > arp_queue ) ) ! = NULL ) {
2011-07-17 23:09:49 -07:00
struct dst_entry * dst = skb_dst ( skb ) ;
struct neighbour * n2 , * n1 = neigh ;
2005-04-16 15:20:36 -07:00
write_unlock_bh ( & neigh - > lock ) ;
2011-10-17 22:32:42 +00:00
rcu_read_lock ( ) ;
2012-07-02 22:15:37 -07:00
/* Why not just use 'neigh' as-is? The problem is that
* things such as shaper , eql , and sch_teql can end up
* using alternative , different , neigh objects to output
* the packet in the output path . So what we need to do
* here is re - lookup the top - level neigh in the path so
* we can reinject the packet there .
*/
n2 = NULL ;
2021-03-19 14:33:37 -04:00
if ( dst & & dst - > obsolete ! = DST_OBSOLETE_DEAD ) {
2012-07-02 22:15:37 -07:00
n2 = dst_neigh_lookup_skb ( dst , skb ) ;
if ( n2 )
n1 = n2 ;
}
2023-09-21 09:27:13 +00:00
READ_ONCE ( n1 - > output ) ( n1 , skb ) ;
2012-07-02 22:15:37 -07:00
if ( n2 )
neigh_release ( n2 ) ;
2011-10-17 22:32:42 +00:00
rcu_read_unlock ( ) ;
2005-04-16 15:20:36 -07:00
write_lock_bh ( & neigh - > lock ) ;
}
2013-06-28 02:37:42 -07:00
__skb_queue_purge ( & neigh - > arp_queue ) ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
neigh - > arp_queue_len_bytes = 0 ;
2005-04-16 15:20:36 -07:00
}
out :
2018-09-22 21:26:20 -07:00
if ( update_isrouter )
neigh_update_is_router ( neigh , flags , & notify ) ;
2005-04-16 15:20:36 -07:00
write_unlock_bh ( & neigh - > lock ) ;
2021-10-11 14:12:38 +02:00
if ( ( ( new ^ old ) & NUD_PERMANENT ) | | gc_update )
2018-12-11 18:57:21 -07:00
neigh_update_gc_list ( neigh ) ;
2021-10-11 14:12:38 +02:00
if ( managed_update )
neigh_update_managed_list ( neigh ) ;
2006-07-30 20:43:36 -07:00
if ( notify )
2017-03-19 22:01:28 -07:00
neigh_update_notify ( neigh , nlmsg_pid ) ;
2019-02-14 09:15:11 -08:00
trace_neigh_update_done ( neigh , err ) ;
2005-04-16 15:20:36 -07:00
return err ;
}
2018-12-05 20:02:29 -08:00
/* Exported front end of __neigh_update(): identical arguments, but no
 * extended-ack sink is supplied (NULL is passed for extack).
 */
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
		 u32 flags, u32 nlmsg_pid)
{
	int rc;

	rc = __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL);
	return rc;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_update ) ;
2005-04-16 15:20:36 -07:00
2013-12-11 13:48:20 +01:00
/* Update the neigh to listen temporarily for probe responses, even if it is
* in a NUD_FAILED state . The caller has to hold neigh - > lock for writing .
*/
void __neigh_set_probe_once ( struct neighbour * neigh )
{
2015-06-16 22:56:39 +03:00
/* Dead entries are left completely untouched. */
if ( neigh - > dead )
return ;
2013-12-11 13:48:20 +01:00
/* The updated timestamp is refreshed even if the state check below bails out. */
neigh - > updated = jiffies ;
/* Only an entry in NUD_FAILED is re-armed; any other state is kept as is. */
if ( ! ( neigh - > nud_state & NUD_FAILED ) )
return ;
2023-03-13 20:17:31 +00:00
WRITE_ONCE ( neigh - > nud_state , NUD_INCOMPLETE ) ;
2014-05-09 13:16:48 +08:00
/* NOTE(review): the probe counter starts at its maximum, presumably so the
 * next timer expiry is treated as the final probe - confirm in the timer handler.
 */
atomic_set ( & neigh - > probes , neigh_max_probes ( neigh ) ) ;
2013-12-11 13:48:20 +01:00
/* Arm the timer RETRANS_TIME from now, but never sooner than HZ / 100. */
neigh_add_timer ( neigh ,
2020-04-01 14:46:20 +08:00
jiffies + max ( NEIGH_VAR ( neigh - > parms , RETRANS_TIME ) ,
HZ / 100 ) ) ;
2013-12-11 13:48:20 +01:00
}
EXPORT_SYMBOL ( __neigh_set_probe_once ) ;
2005-04-16 15:20:36 -07:00
/*
 * Look up the neighbour entry for saddr on dev in tbl and, when one is
 * returned, push it to NUD_STALE with the supplied link-layer address.
 *
 * Returns whatever __neigh_lookup() returned, which may be NULL. The result
 * of neigh_update() is not checked.
 * NOTE(review): reference ownership of the returned entry is decided by
 * __neigh_lookup(), which is not visible here - confirm before relying on it.
 */
struct neighbour * neigh_event_ns ( struct neigh_table * tbl ,
u8 * lladdr , void * saddr ,
struct net_device * dev )
{
/* NOTE(review): the last argument is nonzero when an address was supplied or
 * the device uses no addresses; presumably it is a create-if-missing flag.
 */
struct neighbour * neigh = __neigh_lookup ( tbl , saddr , dev ,
lladdr | | ! dev - > addr_len ) ;
if ( neigh )
2007-02-09 23:24:36 +09:00
neigh_update ( neigh , lladdr , NUD_STALE ,
2017-03-19 22:01:28 -07:00
NEIGH_UPDATE_F_OVERRIDE , 0 ) ;
2005-04-16 15:20:36 -07:00
return neigh ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_event_ns ) ;
2005-04-16 15:20:36 -07:00
2010-10-11 09:16:57 -07:00
/* Takes n->lock for writing itself. NOTE(review): callers presumably must not already hold n->lock. */
2015-03-02 00:13:22 -06:00
/*
 * Invoke the device header_ops->cache hook on the hh_cache embedded in the
 * neighbour (n->hh), at most once. The function acquires n->lock for
 * writing on its own and releases it before returning.
 */
static void neigh_hh_init ( struct neighbour * n )
2005-04-16 15:20:36 -07:00
{
2015-03-02 00:13:22 -06:00
struct net_device * dev = n - > dev ;
__be16 prot = n - > tbl - > protocol ;
2011-07-14 07:53:20 -07:00
struct hh_cache * hh = & n - > hh ;
2010-10-07 10:44:07 +00:00
write_lock_bh ( & n - > lock ) ;
2010-10-11 09:16:57 -07:00
2011-07-14 07:53:20 -07:00
/* Only one thread can come in here and initialize the
* hh_cache entry .
*/
2011-07-16 17:45:02 -07:00
/* hh_len is re-checked under the lock because neigh_resolve_output()
 * tests it without holding the lock before calling in here.
 */
if ( ! hh - > hh_len )
dev - > header_ops - > cache ( n , hh , prot ) ;
2010-10-11 09:16:57 -07:00
2010-10-07 10:44:07 +00:00
write_unlock_bh ( & n - > lock ) ;
2005-04-16 15:20:36 -07:00
}
/* Slow and careful. */
2011-07-17 13:34:11 -07:00
/*
 * Output path that first runs the skb through neigh_event_send().
 *
 * When neigh_event_send() returns zero, the link-layer header is built from
 * neigh->ha and the skb is passed to dev_queue_xmit(), whose result is
 * returned. When it returns nonzero, nothing more is done with the skb here
 * and 0 is returned.
 * NOTE(review): what happened to the skb in that case is decided inside
 * neigh_event_send() - confirm there.
 * If dev_hard_header() reports an error, the skb is freed and -EINVAL is
 * returned.
 */
int neigh_resolve_output ( struct neighbour * neigh , struct sk_buff * skb )
2005-04-16 15:20:36 -07:00
{
int rc = 0 ;
if ( ! neigh_event_send ( neigh , skb ) ) {
int err ;
struct net_device * dev = neigh - > dev ;
2010-10-07 10:44:07 +00:00
unsigned int seq ;
2010-10-11 09:16:57 -07:00
2019-11-07 18:29:11 -08:00
/* Lockless pre-check; neigh_hh_init() repeats the hh_len test under n->lock. */
if ( dev - > header_ops - > cache & & ! READ_ONCE ( neigh - > hh . hh_len ) )
2015-03-02 00:13:22 -06:00
neigh_hh_init ( neigh ) ;
2010-10-11 09:16:57 -07:00
2010-10-07 10:44:07 +00:00
/* Rebuild the header until neigh->ha was stable for the whole attempt
 * (ha_lock is a seqlock); the pull at the top of each pass moves skb data
 * back to the network header offset first.
 */
do {
2012-10-05 19:10:15 +00:00
__skb_pull ( skb , skb_network_offset ( skb ) ) ;
2010-10-07 10:44:07 +00:00
seq = read_seqbegin ( & neigh - > ha_lock ) ;
err = dev_hard_header ( skb , dev , ntohs ( skb - > protocol ) ,
neigh - > ha , NULL , skb - > len ) ;
} while ( read_seqretry ( & neigh - > ha_lock , seq ) ) ;
2010-10-11 09:16:57 -07:00
2005-04-16 15:20:36 -07:00
if ( err > = 0 )
2011-07-16 18:06:24 -07:00
rc = dev_queue_xmit ( skb ) ;
2005-04-16 15:20:36 -07:00
else
goto out_kfree_skb ;
}
out :
return rc ;
/* Header construction failed: drop the skb and report -EINVAL. */
out_kfree_skb :
rc = - EINVAL ;
kfree_skb ( skb ) ;
goto out ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_resolve_output ) ;
2005-04-16 15:20:36 -07:00
/* As fast as possible without hh cache */
2011-07-17 13:34:11 -07:00
/*
 * Build the link-layer header from neigh->ha and hand the skb to
 * dev_queue_xmit(), without calling neigh_event_send() first.
 *
 * Returns the dev_queue_xmit() result, or -EINVAL after freeing the skb
 * when dev_hard_header() reports an error.
 */
int neigh_connected_output ( struct neighbour * neigh , struct sk_buff * skb )
2005-04-16 15:20:36 -07:00
{
struct net_device * dev = neigh - > dev ;
2010-10-07 10:44:07 +00:00
unsigned int seq ;
2011-07-17 13:34:11 -07:00
int err ;
2005-04-16 15:20:36 -07:00
2010-10-07 10:44:07 +00:00
/* Retry while the ha_lock seqlock reports that neigh->ha changed during the
 * attempt; each pass first pulls skb data back to the network header offset.
 */
do {
2012-10-05 19:10:15 +00:00
__skb_pull ( skb , skb_network_offset ( skb ) ) ;
2010-10-07 10:44:07 +00:00
seq = read_seqbegin ( & neigh - > ha_lock ) ;
err = dev_hard_header ( skb , dev , ntohs ( skb - > protocol ) ,
neigh - > ha , NULL , skb - > len ) ;
} while ( read_seqretry ( & neigh - > ha_lock , seq ) ) ;
2005-04-16 15:20:36 -07:00
if ( err > = 0 )
2011-07-16 18:06:24 -07:00
err = dev_queue_xmit ( skb ) ;
2005-04-16 15:20:36 -07:00
else {
/* Any negative header result is reported to the caller as -EINVAL. */
err = - EINVAL ;
kfree_skb ( skb ) ;
}
return err ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_connected_output ) ;
2005-04-16 15:20:36 -07:00
2011-07-17 13:34:11 -07:00
/* Hand the skb straight to dev_queue_xmit() and return its result; the
 * neigh argument is not consulted.
 */
int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
{
	int rc = dev_queue_xmit(skb);

	return rc;
}
EXPORT_SYMBOL ( neigh_direct_output ) ;
2021-10-11 14:12:38 +02:00
/*
 * Delayed-work handler for a neighbour table: calls neigh_event_send_probe()
 * on every entry of tbl->managed_list and then queues itself again.
 * Both steps run under write_lock_bh(&tbl->lock).
 */
static void neigh_managed_work ( struct work_struct * work )
{
/* Recover the owning table from the work item embedded in managed_work. */
struct neigh_table * tbl = container_of ( work , struct neigh_table ,
managed_work . work ) ;
struct neighbour * neigh ;
write_lock_bh ( & tbl - > lock ) ;
list_for_each_entry ( neigh , & tbl - > managed_list , managed_list )
2022-02-01 20:39:42 +01:00
neigh_event_send_probe ( neigh , NULL , false ) ;
2021-10-11 14:12:38 +02:00
/* NOTE(review): queue_delayed_work() takes its delay in jiffies; the _MS
 * variable is presumably stored already converted - confirm.
 */
queue_delayed_work ( system_power_efficient_wq , & tbl - > managed_work ,
2022-06-29 08:48:32 +00:00
NEIGH_VAR ( & tbl - > parms , INTERVAL_PROBE_TIME_MS ) ) ;
2021-10-11 14:12:38 +02:00
write_unlock_bh ( & tbl - > lock ) ;
}
treewide: setup_timer() -> timer_setup()
This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.
Casting from unsigned long:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
setup_timer(&ptr->my_timer, my_callback, ptr);
and forced object casts:
void my_callback(struct something *ptr)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);
become:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
Direct function assignments:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
ptr->my_timer.function = my_callback;
have a temporary cast added, along with converting the args:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;
And finally, callbacks without a data assignment:
void my_callback(unsigned long data)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, 0);
have their argument renamed to verify they're unused during conversion:
void my_callback(struct timer_list *unused)
{
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
The conversion is done with the following Coccinelle script:
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup.cocci
@fix_address_of@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
_E->_timer@_stl.function = _callback;
|
_E->_timer@_stl.function = &_callback;
|
_E->_timer@_stl.function = (_cast_func)_callback;
|
_E->_timer@_stl.function = (_cast_func)&_callback;
|
_E._timer@_stl.function = _callback;
|
_E._timer@_stl.function = &_callback;
|
_E._timer@_stl.function = (_cast_func)_callback;
|
_E._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
/*
 * neigh_proxy_process - timer callback for the proxy_timer of a neigh_table.
 * @t: the proxy_timer embedded in the neigh_table being serviced
 *
 * Walks tbl->proxy_queue with the queue lock held. Every skb whose
 * NEIGH_CB(skb)->sched_next time has been reached is unlinked from the
 * queue and then either passed to tbl->proxy_redo() (when that hook is set
 * and the skb device is running) or freed. For skbs that are not yet due
 * the smallest remaining delay is tracked, and the timer is re-armed for
 * that delay once the walk is finished.
 */
static void neigh_proxy_process ( struct timer_list * t )
2005-04-16 15:20:36 -07:00
{
treewide: setup_timer() -> timer_setup()
This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.
Casting from unsigned long:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
setup_timer(&ptr->my_timer, my_callback, ptr);
and forced object casts:
void my_callback(struct something *ptr)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);
become:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
Direct function assignments:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
ptr->my_timer.function = my_callback;
have a temporary cast added, along with converting the args:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;
And finally, callbacks without a data assignment:
void my_callback(unsigned long data)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, 0);
have their argument renamed to verify they're unused during conversion:
void my_callback(struct timer_list *unused)
{
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
The conversion is done with the following Coccinelle script:
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup.cocci
@fix_address_of@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
_E->_timer@_stl.function = _callback;
|
_E->_timer@_stl.function = &_callback;
|
_E->_timer@_stl.function = (_cast_func)_callback;
|
_E->_timer@_stl.function = (_cast_func)&_callback;
|
_E._timer@_stl.function = _callback;
|
_E._timer@_stl.function = &_callback;
|
_E._timer@_stl.function = (_cast_func)_callback;
|
_E._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
/* Recover the owning neigh_table from its embedded proxy_timer. */
struct neigh_table * tbl = from_timer ( tbl , t , proxy_timer ) ;
2005-04-16 15:20:36 -07:00
/* Smallest remaining delay among queued skbs that are not yet due; 0 means none seen. */
long sched_next = 0 ;
unsigned long now = jiffies ;
2008-09-23 01:11:18 -07:00
struct sk_buff * skb , * n ;
2005-04-16 15:20:36 -07:00
spin_lock ( & tbl - > proxy_queue . lock ) ;
2008-09-23 01:11:18 -07:00
/* The _safe walk is used because due skbs are unlinked inside the loop. */
skb_queue_walk_safe ( & tbl - > proxy_queue , skb , n ) {
/* Signed difference: a value <= 0 means the scheduled time has been reached. */
long tdif = NEIGH_CB ( skb ) - > sched_next - now ;
2005-04-16 15:20:36 -07:00
if ( tdif < = 0 ) {
2008-09-23 01:11:18 -07:00
struct net_device * dev = skb - > dev ;
2011-08-22 19:32:42 +00:00
net: neigh: decrement the family specific qlen
Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit
per-device") introduced the length counter qlen in struct neigh_parms.
There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and
while the family specific qlen is incremented in pneigh_enqueue(), the
mentioned commit decrements always the IPv4/ARP specific qlen,
regardless of the currently processed family, in pneigh_queue_purge()
and neigh_proxy_process().
As a result, with IPv6/ND, the family specific qlen is only incremented
(and never decremented) until it exceeds PROXY_QLEN, and then, according
to the check in pneigh_enqueue(), neighbor solicitations are not
answered anymore. As an example, this is noted when using the
subnet-router anycast address to access a Linux router. After a certain
amount of time (in the observed case, qlen exceeded PROXY_QLEN after two
days), the Linux router stops answering neighbor solicitations for its
subnet-router anycast address and effectively becomes unreachable.
Another result with IPv6/ND is that the IPv4/ARP specific qlen is
decremented more often than incremented. This leads to negative qlen
values, as a signed integer has been used for the length counter qlen,
and potentially to an integer overflow.
Fix this by introducing the helper function neigh_parms_qlen_dec(),
which decrements the family specific qlen. Thereby, make use of the
existing helper function neigh_get_dev_parms_rcu(), whose definition
therefore needs to be placed earlier in neighbour.c. Take the family
member from struct neigh_table to determine the currently processed
family and appropriately call neigh_parms_qlen_dec() from
pneigh_queue_purge() and neigh_proxy_process().
Additionally, use an unsigned integer for the length counter qlen.
Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device")
Signed-off-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-11-15 23:09:41 +01:00
/* Decrement the queue length counter for this device and the address family of this table. */
neigh_parms_qlen_dec ( dev , tbl - > family ) ;
2008-09-23 01:11:18 -07:00
__skb_unlink ( skb , & tbl - > proxy_queue ) ;
2022-08-11 18:20:12 +03:00
2011-08-22 19:32:42 +00:00
/* Replay the skb through proxy_redo only while the device is running; otherwise drop it. */
if ( tbl - > proxy_redo & & netif_running ( dev ) ) {
rcu_read_lock ( ) ;
2008-09-23 01:11:18 -07:00
tbl - > proxy_redo ( skb ) ;
2011-08-22 19:32:42 +00:00
rcu_read_unlock ( ) ;
} else {
2008-09-23 01:11:18 -07:00
kfree_skb ( skb ) ;
2011-08-22 19:32:42 +00:00
}
2005-04-16 15:20:36 -07:00
/* Drop the device reference; pneigh_enqueue() takes one with dev_hold() when queueing. */
dev_put ( dev ) ;
} else if ( ! sched_next | | tdif < sched_next )
/* Not due yet: remember the earliest remaining delay. */
sched_next = tdif ;
}
/* Cancel any pending timer, then re-arm only if some skb is still waiting. */
del_timer ( & tbl - > proxy_timer ) ;
if ( sched_next )
mod_timer ( & tbl - > proxy_timer , jiffies + sched_next ) ;
spin_unlock ( & tbl - > proxy_queue . lock ) ;
}
2023-01-30 12:14:28 -05:00
/*
 * Compute the absolute time (in jiffies) at which a proxied packet
 * governed by @p should be processed: the current jiffies value plus a
 * random offset below the PROXY_DELAY setting of @p.
 */
static unsigned long neigh_proxy_delay(struct neigh_parms *p)
{
	unsigned long max_delay = NEIGH_VAR(p, PROXY_DELAY);

	/*
	 * Calling get_random_u32_below() with a zero bound is undefined
	 * behavior, so a zero PROXY_DELAY simply means "now".
	 */
	if (!max_delay)
		return jiffies;

	return jiffies + get_random_u32_below(max_delay);
}
2005-04-16 15:20:36 -07:00
/*
 * pneigh_enqueue - queue an skb on the proxy queue of a table for delayed handling.
 * @tbl: neighbour table whose proxy_queue and proxy_timer are used
 * @p:   parms supplying PROXY_DELAY and PROXY_QLEN and holding the qlen counter
 * @skb: buffer to queue; it is freed here when the queue limit is exceeded
 *
 * The skb is stamped with the time returned by neigh_proxy_delay(p) and
 * flagged LOCALLY_ENQUEUED. With the queue lock held, its dst is dropped,
 * a reference on skb->dev is taken, the skb is appended to
 * tbl->proxy_queue and p->qlen is incremented. The proxy timer is then
 * armed for the earlier of its previous expiry and the new scheduled time.
 */
void pneigh_enqueue ( struct neigh_table * tbl , struct neigh_parms * p ,
struct sk_buff * skb )
{
2023-01-30 12:14:28 -05:00
unsigned long sched_next = neigh_proxy_delay ( p ) ;
2005-04-16 15:20:36 -07:00
2022-08-11 18:20:12 +03:00
/* Queue limit exceeded for these parms: drop the packet instead of queueing it. */
if ( p - > qlen > NEIGH_VAR ( p , PROXY_QLEN ) ) {
2005-04-16 15:20:36 -07:00
kfree_skb ( skb ) ;
return ;
}
2005-08-14 17:24:31 -07:00
NEIGH_CB ( skb ) - > sched_next = sched_next ;
NEIGH_CB ( skb ) - > flags | = LOCALLY_ENQUEUED ;
2005-04-16 15:20:36 -07:00
spin_lock ( & tbl - > proxy_queue . lock ) ;
/* If a timer was pending and expires sooner than this skb, keep that earlier expiry. */
if ( del_timer ( & tbl - > proxy_timer ) ) {
if ( time_before ( tbl - > proxy_timer . expires , sched_next ) )
sched_next = tbl - > proxy_timer . expires ;
}
2009-06-02 05:19:30 +00:00
skb_dst_drop ( skb ) ;
2005-04-16 15:20:36 -07:00
/* Hold the device while the skb sits on the queue; neigh_proxy_process() calls dev_put() after handling it. */
dev_hold ( skb - > dev ) ;
__skb_queue_tail ( & tbl - > proxy_queue , skb ) ;
2022-08-11 18:20:12 +03:00
p - > qlen + + ;
2005-04-16 15:20:36 -07:00
mod_timer ( & tbl - > proxy_timer , sched_next ) ;
spin_unlock ( & tbl - > proxy_queue . lock ) ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( pneigh_enqueue ) ;
2005-04-16 15:20:36 -07:00
2009-07-13 11:17:49 -07:00
/*
 * lookup_neigh_parms - find the neigh_parms entry of a table for a net and ifindex.
 * @tbl:     table whose parms_list is searched
 * @net:     network namespace to match
 * @ifindex: interface index to match, or 0 for the entry without a device
 *
 * An entry that has a device matches when its ifindex equals @ifindex and
 * its namespace equals @net. An entry without a device matches only when
 * @ifindex is 0 and @net is init_net.
 *
 * Returns the first matching entry, or NULL when none matches.
 */
static inline struct neigh_parms * lookup_neigh_parms ( struct neigh_table * tbl ,
2008-01-24 00:13:18 -08:00
struct net * net , int ifindex )
{
struct neigh_parms * p ;
2014-10-29 19:29:31 +01:00
list_for_each_entry ( p , & tbl - > parms_list , list ) {
2008-03-26 03:57:35 +09:00
/* Device-bound entry: both the interface index and the namespace must match. */
if ( ( p - > dev & & p - > dev - > ifindex = = ifindex & & net_eq ( neigh_parms_net ( p ) , net ) ) | |
2013-06-20 10:01:33 +08:00
/* Device-less entry: only returned for ifindex 0 in init_net. */
( ! p - > dev & & ! ifindex & & net_eq ( net , & init_net ) ) )
2008-01-24 00:13:18 -08:00
return p ;
}
return NULL ;
}
2005-04-16 15:20:36 -07:00
/*
 * neigh_parms_alloc - create per-device neigh_parms for a table.
 * @dev: device the new parms are bound to
 * @tbl: table whose default parms (tbl->parms) are used as the template
 *
 * Duplicates tbl->parms, resets the per-instance fields (refcnt,
 * reachable_time, qlen, dev, net, sysctl_table), takes a tracked reference
 * on @dev and gives the driver a chance to adjust the parms through
 * ndo_neigh_setup. On success the entry is linked into the parms list of
 * the table under the table write lock.
 *
 * Returns the new parms, or NULL when the allocation fails or
 * ndo_neigh_setup returns nonzero.
 */
struct neigh_parms * neigh_parms_alloc ( struct net_device * dev ,
struct neigh_table * tbl )
{
2013-06-20 10:01:32 +08:00
struct neigh_parms * p ;
2008-11-20 20:14:53 -08:00
struct net * net = dev_net ( dev ) ;
const struct net_device_ops * ops = dev - > netdev_ops ;
2008-01-24 00:13:18 -08:00
2013-06-20 10:01:32 +08:00
/* Start from a copy of the table defaults. */
p = kmemdup ( & tbl - > parms , sizeof ( * p ) , GFP_KERNEL ) ;
2005-04-16 15:20:36 -07:00
if ( p ) {
p - > tbl = tbl ;
2017-06-30 13:07:56 +03:00
refcount_set ( & p - > refcnt , 1 ) ;
2005-04-16 15:20:36 -07:00
p - > reachable_time =
2013-12-07 19:26:53 +01:00
neigh_rand_reach_time ( NEIGH_VAR ( p , BASE_REACHABLE_TIME ) ) ;
2022-08-11 18:20:12 +03:00
p - > qlen = 0 ;
2022-06-07 21:39:55 -07:00
/* Tracked device reference; released on the error path below or in neigh_parms_release(). */
netdev_hold ( dev , & p - > dev_tracker , GFP_KERNEL ) ;
2013-08-02 19:07:38 +02:00
p - > dev = dev ;
2015-03-11 23:04:08 -05:00
write_pnet ( & p - > net , net ) ;
2013-08-02 19:07:38 +02:00
p - > sysctl_table = NULL ;
2005-06-18 22:50:55 -07:00
2008-11-20 20:14:53 -08:00
/* Optional driver hook; a nonzero return aborts the allocation. */
if ( ops - > ndo_neigh_setup & & ops - > ndo_neigh_setup ( dev , p ) ) {
2022-06-07 21:39:55 -07:00
netdev_put ( dev , & p - > dev_tracker ) ;
2008-01-14 22:59:59 -08:00
kfree ( p ) ;
return NULL ;
2005-04-16 15:20:36 -07:00
}
2008-01-14 22:59:59 -08:00
2005-04-16 15:20:36 -07:00
write_lock_bh ( & tbl - > lock ) ;
2014-10-29 19:29:31 +01:00
/* Link the new entry into the list next to the default parms of the table. */
list_add ( & p - > list , & tbl - > parms . list ) ;
2005-04-16 15:20:36 -07:00
write_unlock_bh ( & tbl - > lock ) ;
2013-12-07 19:26:56 +01:00
/* NOTE(review): presumably clears the per-field data_state bits inherited from the template copy - confirm against the helper. */
neigh_parms_data_state_cleanall ( p ) ;
2005-04-16 15:20:36 -07:00
}
return p ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_parms_alloc ) ;
2005-04-16 15:20:36 -07:00
/*
 * RCU callback: map the rcu_head back to the neigh_parms that embeds it
 * and drop a reference on those parms via neigh_parms_put().
 */
static void neigh_rcu_free_parms(struct rcu_head *head)
{
	neigh_parms_put(container_of(head, struct neigh_parms, rcu_head));
}
/*
 * neigh_parms_release - unlink per-device parms from a table and schedule their release.
 * @tbl:   table the parms belong to
 * @parms: parms to release; NULL and the default parms of the table are ignored
 *
 * With the table write lock held, the entry is removed from the parms list
 * and marked dead. The device reference is then dropped, and the
 * neigh_parms_put() call is deferred through call_rcu() to
 * neigh_rcu_free_parms().
 */
void neigh_parms_release ( struct neigh_table * tbl , struct neigh_parms * parms )
{
/* Nothing to do for NULL, and the embedded default parms are never released here. */
if ( ! parms | | parms = = & tbl - > parms )
return ;
write_lock_bh ( & tbl - > lock ) ;
2014-10-29 19:29:31 +01:00
list_del ( & parms - > list ) ;
parms - > dead = 1 ;
2005-04-16 15:20:36 -07:00
write_unlock_bh ( & tbl - > lock ) ;
2022-06-07 21:39:55 -07:00
netdev_put ( parms - > dev , & parms - > dev_tracker ) ;
2014-10-29 19:29:31 +01:00
/* Defer the reference drop to an RCU callback. */
call_rcu ( & parms - > rcu_head , neigh_rcu_free_parms ) ;
2005-04-16 15:20:36 -07:00
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_parms_release ) ;
2005-04-16 15:20:36 -07:00
2008-01-24 00:30:58 -08:00
/* Free the memory of a neigh_parms structure with kfree(). */
static void neigh_parms_destroy ( struct neigh_parms * parms )
2005-04-16 15:20:36 -07:00
{
kfree ( parms ) ;
}
2007-04-17 12:45:31 -07:00
static struct lock_class_key neigh_table_proxy_queue_class ;
2014-11-10 15:59:36 -08:00
static struct neigh_table * neigh_tables [ NEIGH_NR_TABLES ] __read_mostly ;
void neigh_table_init ( int index , struct neigh_table * tbl )
2005-04-16 15:20:36 -07:00
{
unsigned long now = jiffies ;
unsigned long phsize ;
2014-10-29 19:29:31 +01:00
INIT_LIST_HEAD ( & tbl - > parms_list ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago, gc kicks in and
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is overkill and contributes to thrashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
INIT_LIST_HEAD ( & tbl - > gc_list ) ;
2021-10-11 14:12:38 +02:00
INIT_LIST_HEAD ( & tbl - > managed_list ) ;
2014-10-29 19:29:31 +01:00
list_add ( & tbl - > parms . list , & tbl - > parms_list ) ;
2008-11-12 00:54:54 -08:00
write_pnet ( & tbl - > parms . net , & init_net ) ;
2017-06-30 13:07:56 +03:00
refcount_set ( & tbl - > parms . refcnt , 1 ) ;
2005-04-16 15:20:36 -07:00
tbl - > parms . reachable_time =
2013-12-07 19:26:53 +01:00
neigh_rand_reach_time ( NEIGH_VAR ( & tbl - > parms , BASE_REACHABLE_TIME ) ) ;
2022-08-11 18:20:12 +03:00
tbl - > parms . qlen = 0 ;
2005-04-16 15:20:36 -07:00
tbl - > stats = alloc_percpu ( struct neigh_statistics ) ;
if ( ! tbl - > stats )
panic ( " cannot create neighbour cache statistics " ) ;
2007-02-09 23:24:36 +09:00
2005-04-16 15:20:36 -07:00
# ifdef CONFIG_PROC_FS
2018-04-15 10:16:41 +02:00
if ( ! proc_create_seq_data ( tbl - > id , 0 , init_net . proc_net_stat ,
& neigh_stat_seq_ops , tbl ) )
2005-04-16 15:20:36 -07:00
panic ( " cannot create neighbour proc dir entry " ) ;
# endif
2011-07-11 01:28:12 -07:00
RCU_INIT_POINTER ( tbl - > nht , neigh_hash_alloc ( 3 ) ) ;
2005-04-16 15:20:36 -07:00
phsize = ( PNEIGH_HASHMASK + 1 ) * sizeof ( struct pneigh_entry * ) ;
2006-04-07 14:52:59 -07:00
tbl - > phash_buckets = kzalloc ( phsize , GFP_KERNEL ) ;
2005-04-16 15:20:36 -07:00
2010-10-04 06:15:44 +00:00
if ( ! tbl - > nht | | ! tbl - > phash_buckets )
2005-04-16 15:20:36 -07:00
panic ( " cannot allocate neighbour cache hashes " ) ;
2013-01-24 00:44:23 +00:00
if ( ! tbl - > entry_size )
tbl - > entry_size = ALIGN ( offsetof ( struct neighbour , primary_key ) +
tbl - > key_len , NEIGH_PRIV_ALIGN ) ;
else
WARN_ON ( tbl - > entry_size % NEIGH_PRIV_ALIGN ) ;
2005-04-16 15:20:36 -07:00
rwlock_init ( & tbl - > lock ) ;
2021-10-11 14:12:38 +02:00
2012-08-21 13:18:23 -07:00
INIT_DEFERRABLE_WORK ( & tbl - > gc_work , neigh_periodic_work ) ;
2014-01-22 12:23:33 +05:30
queue_delayed_work ( system_power_efficient_wq , & tbl - > gc_work ,
tbl - > parms . reachable_time ) ;
2021-10-11 14:12:38 +02:00
INIT_DEFERRABLE_WORK ( & tbl - > managed_work , neigh_managed_work ) ;
queue_delayed_work ( system_power_efficient_wq , & tbl - > managed_work , 0 ) ;
treewide: setup_timer() -> timer_setup()
This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.
Casting from unsigned long:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
setup_timer(&ptr->my_timer, my_callback, ptr);
and forced object casts:
void my_callback(struct something *ptr)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);
become:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
Direct function assignments:
void my_callback(unsigned long data)
{
struct something *ptr = (struct something *)data;
...
}
...
ptr->my_timer.function = my_callback;
have a temporary cast added, along with converting the args:
void my_callback(struct timer_list *t)
{
struct something *ptr = from_timer(ptr, t, my_timer);
...
}
...
ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;
And finally, callbacks without a data assignment:
void my_callback(unsigned long data)
{
...
}
...
setup_timer(&ptr->my_timer, my_callback, 0);
have their argument renamed to verify they're unused during conversion:
void my_callback(struct timer_list *unused)
{
...
}
...
timer_setup(&ptr->my_timer, my_callback, 0);
The conversion is done with the following Coccinelle script:
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup.cocci
@fix_address_of@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
_E->_timer@_stl.function = _callback;
|
_E->_timer@_stl.function = &_callback;
|
_E->_timer@_stl.function = (_cast_func)_callback;
|
_E->_timer@_stl.function = (_cast_func)&_callback;
|
_E._timer@_stl.function = _callback;
|
_E._timer@_stl.function = &_callback;
|
_E._timer@_stl.function = (_cast_func)_callback;
|
_E._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-16 14:43:17 -07:00
timer_setup ( & tbl - > proxy_timer , neigh_proxy_process , 0 ) ;
2007-04-17 12:45:31 -07:00
skb_queue_head_init_class ( & tbl - > proxy_queue ,
& neigh_table_proxy_queue_class ) ;
2005-04-16 15:20:36 -07:00
tbl - > last_flush = now ;
tbl - > last_rand = now + tbl - > parms . reachable_time * 20 ;
[NEIGH]: Fix IP-over-ATM and ARP interaction.
The classical IP over ATM code maintains its own IPv4 <-> <ATM stuff>
ARP table, using the standard neighbour-table code. The
neigh_table_init function adds this neighbour table to a linked list
of all neighbor tables, which is used by the functions neigh_delete(),
neigh_add() and neightbl_set(), all called by the netlink code.
Once the ATM neighbour table is added to the list, there are two
tables with family == AF_INET there, and ARP entries sent via netlink
go into the first table with matching family. This is indeterminate
and often wrong.
To see the bug, on a kernel with CLIP enabled, create a standard IPv4
ARP entry by pinging an unused address on a local subnet. Then attempt
to complete that entry by doing
ip neigh replace <ip address> lladdr <some mac address> nud reachable
Looking at the ARP tables by using
ip neigh show
will reveal two ARP entries for the same address. One of these can be
found in /proc/net/arp, and the other in /proc/net/atm/arp.
This patch adds a new function, neigh_table_init_no_netlink() which
does everything the neigh_table_init() does, except add the table to
the netlink all-arp-tables chain. In addition neigh_table_init() has a
check that all tables on the chain have a distinct address family.
The init call in clip.c is changed to call
neigh_table_init_no_netlink().
Since ATM ARP tables are rather more complicated than can currently be
handled by the available rtattrs in the netlink protocol, no
functionality is lost by this patch, and non-ATM ARP manipulation via
netlink is rescued. A more complete solution would involve a rtattr
for ATM ARP entries and some way for the netlink code to give
neigh_add and friends more information than just address family with
which to find the correct ARP table.
[ I've changed the assertion checking in neigh_table_init() to not
use BUG_ON() while holding neigh_tbl_lock. Instead we remember that
we found an existing tbl with the same family, and after dropping
the lock we'll give a diagnostic kernel log message and a stack dump.
-DaveM ]
Signed-off-by: Simon Kelley <simon@thekelleys.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-05-12 14:56:08 -07:00
2014-11-10 15:59:36 -08:00
neigh_tables [ index ] = tbl ;
2005-04-16 15:20:36 -07:00
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_table_init ) ;
2005-04-16 15:20:36 -07:00
2014-11-10 15:59:36 -08:00
/*
 * neigh_table_clear - tear down a neighbour table.
 * @index: slot in neigh_tables[] that is reset to NULL
 * @tbl:   table whose work items, timer, queues and memory are released
 *
 * Unpublishes the table from neigh_tables[], stops its deferred work and
 * proxy timer, purges the proxy queue, calls neigh_ifdown(tbl, NULL),
 * logs a critical message if tbl->entries is still non-zero, and then
 * releases the hash table, the proxy hash buckets, the proc statistics
 * entry and the per-cpu statistics.
 *
 * Always returns 0.
 */
int neigh_table_clear ( int index , struct neigh_table * tbl )
2005-04-16 15:20:36 -07:00
{
2014-11-10 15:59:36 -08:00
/* Unpublish first so neigh_find_table() no longer returns this table. */
neigh_tables [ index ] = NULL ;
2005-04-16 15:20:36 -07:00
/* It is not clean... Fix it to unload IPv6 module safely */
2021-11-22 16:01:51 +01:00
/* Stop the deferred work items that neigh_table_init() queued. */
cancel_delayed_work_sync ( & tbl - > managed_work ) ;
2010-10-19 06:04:42 +00:00
cancel_delayed_work_sync ( & tbl - > gc_work ) ;
2005-04-16 15:20:36 -07:00
del_timer_sync ( & tbl - > proxy_timer ) ;
net: neigh: decrement the family specific qlen
Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit
per-device") introduced the length counter qlen in struct neigh_parms.
There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and
while the family specific qlen is incremented in pneigh_enqueue(), the
mentioned commit decrements always the IPv4/ARP specific qlen,
regardless of the currently processed family, in pneigh_queue_purge()
and neigh_proxy_process().
As a result, with IPv6/ND, the family specific qlen is only incremented
(and never decremented) until it exceeds PROXY_QLEN, and then, according
to the check in pneigh_enqueue(), neighbor solicitations are not
answered anymore. As an example, this is noted when using the
subnet-router anycast address to access a Linux router. After a certain
amount of time (in the observed case, qlen exceeded PROXY_QLEN after two
days), the Linux router stops answering neighbor solicitations for its
subnet-router anycast address and effectively becomes unreachable.
Another result with IPv6/ND is that the IPv4/ARP specific qlen is
decremented more often than incremented. This leads to negative qlen
values, as a signed integer has been used for the length counter qlen,
and potentially to an integer overflow.
Fix this by introducing the helper function neigh_parms_qlen_dec(),
which decrements the family specific qlen. Thereby, make use of the
existing helper function neigh_get_dev_parms_rcu(), whose definition
therefore needs to be placed earlier in neighbour.c. Take the family
member from struct neigh_table to determine the currently processed
family and appropriately call neigh_parms_qlen_dec() from
pneigh_queue_purge() and neigh_proxy_process().
Additionally, use an unsigned integer for the length counter qlen.
Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device")
Signed-off-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-11-15 23:09:41 +01:00
/* The table's own family is passed along with the queue to purge. */
pneigh_queue_purge ( & tbl - > proxy_queue , NULL , tbl - > family ) ;
2005-04-16 15:20:36 -07:00
/* NOTE(review): a NULL device presumably means "all devices" - confirm. */
neigh_ifdown ( tbl , NULL ) ;
/* Anything still counted in tbl->entries at this point is reported. */
if ( atomic_read ( & tbl - > entries ) )
2012-05-16 19:58:40 +00:00
pr_crit ( " neighbour leakage \n " ) ;
2005-04-16 15:20:36 -07:00
2011-01-19 22:02:47 +00:00
/*
 * The hash table is handed to call_rcu() with neigh_hash_free_rcu as the
 * callback rather than being freed inline. The protection condition given
 * to rcu_dereference_protected() is the constant 1.
 */
call_rcu ( & rcu_dereference_protected ( tbl - > nht , 1 ) - > rcu ,
neigh_hash_free_rcu ) ;
2010-10-04 06:15:44 +00:00
tbl - > nht = NULL ;
2005-04-16 15:20:36 -07:00
kfree ( tbl - > phash_buckets ) ;
tbl - > phash_buckets = NULL ;
2007-11-05 21:28:13 -08:00
/* Counterpart of the proc_create_seq_data() call in neigh_table_init(). */
remove_proc_entry ( tbl - > id , init_net . proc_net_stat ) ;
2006-09-01 01:34:10 -07:00
free_percpu ( tbl - > stats ) ;
tbl - > stats = NULL ;
2005-04-16 15:20:36 -07:00
return 0 ;
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_table_clear ) ;
2005-04-16 15:20:36 -07:00
2014-11-10 15:59:36 -08:00
/*
 * Map an address family onto its slot in neigh_tables[].
 *
 * AF_INET selects the NEIGH_ARP_TABLE slot and AF_INET6 the NEIGH_ND_TABLE
 * slot; every other family yields NULL. The slot content is returned
 * unchanged, so NULL is also possible for a supported family whose slot
 * is currently empty.
 */
static struct neigh_table *neigh_find_table(int family)
{
	if (family == AF_INET)
		return neigh_tables[NEIGH_ARP_TABLE];

	if (family == AF_INET6)
		return neigh_tables[NEIGH_ND_TABLE];

	return NULL;
}
2018-12-19 12:51:38 -08:00
/*
 * Netlink attribute policy for the NDA_* neighbour attributes, indexed by
 * attribute type (NDA_MAX + 1 slots). neigh_add() passes this table to
 * nlmsg_parse_deprecated() when parsing a request.
 */
const struct nla_policy nda_policy [ NDA_MAX + 1 ] = {
2020-05-21 22:26:14 -07:00
/* NDA_NH_ID is recorded as the strict_start_type for this policy. */
[ NDA_UNSPEC ] = { . strict_start_type = NDA_NH_ID } ,
2018-12-19 12:51:38 -08:00
/* Both address attributes are binary with len set to MAX_ADDR_LEN. */
[ NDA_DST ] = { . type = NLA_BINARY , . len = MAX_ADDR_LEN } ,
[ NDA_LLADDR ] = { . type = NLA_BINARY , . len = MAX_ADDR_LEN } ,
[ NDA_CACHEINFO ] = { . len = sizeof ( struct nda_cacheinfo ) } ,
[ NDA_PROBES ] = { . type = NLA_U32 } ,
[ NDA_VLAN ] = { . type = NLA_U16 } ,
[ NDA_PORT ] = { . type = NLA_U16 } ,
[ NDA_VNI ] = { . type = NLA_U32 } ,
[ NDA_IFINDEX ] = { . type = NLA_U32 } ,
[ NDA_MASTER ] = { . type = NLA_U32 } ,
2018-12-19 20:02:36 -08:00
[ NDA_PROTOCOL ] = { . type = NLA_U8 } ,
2020-05-21 22:26:14 -07:00
[ NDA_NH_ID ] = { . type = NLA_U32 } ,
2021-10-13 15:21:39 +02:00
/* u32 flag word declared with NTF_EXT_MASK as its mask. */
[ NDA_FLAGS_EXT ] = NLA_POLICY_MASK ( NLA_U32 , NTF_EXT_MASK ) ,
2020-06-23 23:47:16 +03:00
[ NDA_FDB_EXT_ATTRS ] = { . type = NLA_NESTED } ,
2018-12-19 12:51:38 -08:00
} ;
2017-04-16 09:48:24 -07:00
/*
 * neigh_delete - netlink handler that removes a neighbour or proxy entry.
 * @skb:    request buffer; supplies the socket's netns and the sender portid
 * @nlh:    netlink message header; the payload starts with a struct ndmsg
 * @extack: extended ack used to report human-readable error messages
 *
 * Returns 0 or a negative errno:
 *   -EINVAL       message shorter than struct ndmsg, NDA_DST missing or
 *                 shorter than tbl->key_len, or no device given for a
 *                 non-proxy entry
 *   -ENODEV       ndm_ifindex is set but no such device exists in the netns
 *   -EAFNOSUPPORT no neighbour table for ndm_family
 *   -ENOENT       no matching neighbour entry
 * For NTF_PROXY requests the result of pneigh_delete() is returned,
 * otherwise the result of __neigh_update().
 */
static int neigh_delete ( struct sk_buff * skb , struct nlmsghdr * nlh ,
struct netlink_ext_ack * extack )
2005-04-16 15:20:36 -07:00
{
2008-03-26 02:26:21 +09:00
struct net * net = sock_net ( skb - > sk ) ;
2006-08-07 17:53:08 -07:00
struct ndmsg * ndm ;
struct nlattr * dst_attr ;
2005-04-16 15:20:36 -07:00
struct neigh_table * tbl ;
2014-11-10 15:59:36 -08:00
struct neighbour * neigh ;
2005-04-16 15:20:36 -07:00
struct net_device * dev = NULL ;
2006-08-07 17:53:08 -07:00
/* Default result: every bare "goto out" below reports -EINVAL. */
int err = - EINVAL ;
2005-04-16 15:20:36 -07:00
2010-10-04 04:27:36 +00:00
ASSERT_RTNL ( ) ;
2006-08-07 17:53:08 -07:00
/* The payload must be large enough to hold the fixed ndmsg header. */
if ( nlmsg_len ( nlh ) < sizeof ( * ndm ) )
2005-04-16 15:20:36 -07:00
goto out ;
2006-08-07 17:53:08 -07:00
/* Only NDA_DST is looked up; no full attribute parse is done here. */
dst_attr = nlmsg_find_attr ( nlh , sizeof ( * ndm ) , NDA_DST ) ;
2018-12-05 20:02:29 -08:00
if ( ! dst_attr ) {
NL_SET_ERR_MSG ( extack , " Network address not specified " ) ;
2006-08-07 17:53:08 -07:00
goto out ;
2018-12-05 20:02:29 -08:00
}
2006-08-07 17:53:08 -07:00
ndm = nlmsg_data ( nlh ) ;
/* A zero ifindex leaves dev NULL; that is only accepted for proxy entries. */
if ( ndm - > ndm_ifindex ) {
2010-10-04 04:27:36 +00:00
dev = __dev_get_by_index ( net , ndm - > ndm_ifindex ) ;
2006-08-07 17:53:08 -07:00
if ( dev = = NULL ) {
err = - ENODEV ;
goto out ;
}
}
2014-11-10 15:59:36 -08:00
tbl = neigh_find_table ( ndm - > ndm_family ) ;
if ( tbl = = NULL )
return - EAFNOSUPPORT ;
2005-04-16 15:20:36 -07:00
2018-12-05 20:02:29 -08:00
/* The address must carry at least key_len bytes for this table. */
if ( nla_len ( dst_attr ) < ( int ) tbl - > key_len ) {
NL_SET_ERR_MSG ( extack , " Invalid network address " ) ;
2014-11-10 15:59:36 -08:00
goto out ;
2018-12-05 20:02:29 -08:00
}
2005-04-16 15:20:36 -07:00
2014-11-10 15:59:36 -08:00
/* Proxy entries are handled by pneigh_delete(); dev may be NULL here. */
if ( ndm - > ndm_flags & NTF_PROXY ) {
err = pneigh_delete ( tbl , net , nla_data ( dst_attr ) , dev ) ;
goto out ;
}
2005-04-16 15:20:36 -07:00
2014-11-10 15:59:36 -08:00
if ( dev = = NULL )
goto out ;
2006-08-07 17:53:08 -07:00
2014-11-10 15:59:36 -08:00
neigh = neigh_lookup ( tbl , nla_data ( dst_attr ) , dev ) ;
if ( neigh = = NULL ) {
err = - ENOENT ;
2010-10-04 04:27:36 +00:00
goto out ;
2005-04-16 15:20:36 -07:00
}
2014-11-10 15:59:36 -08:00
2018-12-05 20:02:29 -08:00
/* Administratively force the entry into NUD_FAILED with no lladdr. */
err = __neigh_update ( neigh , NULL , NUD_FAILED ,
NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN ,
NETLINK_CB ( skb ) . portid , extack ) ;
2017-06-02 09:01:49 -07:00
/*
 * NOTE(review): the reference from neigh_lookup() is dropped before
 * neigh_remove_one() is called, both under the table write lock.
 * Presumably neigh_remove_one() only unlinks entries that have no other
 * holders - confirm before reordering these calls.
 */
write_lock_bh ( & tbl - > lock ) ;
2014-11-10 15:59:36 -08:00
neigh_release ( neigh ) ;
2017-06-02 09:01:49 -07:00
neigh_remove_one ( neigh , tbl ) ;
write_unlock_bh ( & tbl - > lock ) ;
2006-08-07 17:53:08 -07:00
2005-04-16 15:20:36 -07:00
out :
return err ;
}
2017-04-16 09:48:24 -07:00
static int neigh_add ( struct sk_buff * skb , struct nlmsghdr * nlh ,
struct netlink_ext_ack * extack )
2005-04-16 15:20:36 -07:00
{
2018-09-22 21:26:19 -07:00
int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE |
2021-10-11 14:12:37 +02:00
NEIGH_UPDATE_F_OVERRIDE_ISROUTER ;
2008-03-26 02:26:21 +09:00
struct net * net = sock_net ( skb - > sk ) ;
2006-08-07 17:55:40 -07:00
struct ndmsg * ndm ;
struct nlattr * tb [ NDA_MAX + 1 ] ;
2005-04-16 15:20:36 -07:00
struct neigh_table * tbl ;
struct net_device * dev = NULL ;
2014-11-10 15:59:36 -08:00
struct neighbour * neigh ;
void * dst , * lladdr ;
2018-12-15 14:09:06 -08:00
u8 protocol = 0 ;
2021-10-11 14:12:37 +02:00
u32 ndm_flags ;
2006-08-07 17:55:40 -07:00
int err ;
2005-04-16 15:20:36 -07:00
2010-10-04 04:27:36 +00:00
ASSERT_RTNL ( ) ;
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 14:07:28 +02:00
err = nlmsg_parse_deprecated ( nlh , sizeof ( * ndm ) , tb , NDA_MAX ,
nda_policy , extack ) ;
2006-08-07 17:55:40 -07:00
if ( err < 0 )
2005-04-16 15:20:36 -07:00
goto out ;
2006-08-07 17:55:40 -07:00
err = - EINVAL ;
2018-12-05 20:02:29 -08:00
if ( ! tb [ NDA_DST ] ) {
NL_SET_ERR_MSG ( extack , " Network address not specified " ) ;
2006-08-07 17:55:40 -07:00
goto out ;
2018-12-05 20:02:29 -08:00
}
2006-08-07 17:55:40 -07:00
ndm = nlmsg_data ( nlh ) ;
2021-10-11 14:12:37 +02:00
ndm_flags = ndm - > ndm_flags ;
if ( tb [ NDA_FLAGS_EXT ] ) {
u32 ext = nla_get_u32 ( tb [ NDA_FLAGS_EXT ] ) ;
2021-10-13 15:21:38 +02:00
BUILD_BUG_ON ( sizeof ( neigh - > flags ) * BITS_PER_BYTE <
( sizeof ( ndm - > ndm_flags ) * BITS_PER_BYTE +
hweight32 ( NTF_EXT_MASK ) ) ) ;
2021-10-11 14:12:37 +02:00
ndm_flags | = ( ext < < NTF_EXT_SHIFT ) ;
}
2006-08-07 17:55:40 -07:00
if ( ndm - > ndm_ifindex ) {
2010-10-04 04:27:36 +00:00
dev = __dev_get_by_index ( net , ndm - > ndm_ifindex ) ;
2006-08-07 17:55:40 -07:00
if ( dev = = NULL ) {
err = - ENODEV ;
goto out ;
}
2018-12-05 20:02:29 -08:00
if ( tb [ NDA_LLADDR ] & & nla_len ( tb [ NDA_LLADDR ] ) < dev - > addr_len ) {
NL_SET_ERR_MSG ( extack , " Invalid link address " ) ;
2010-10-04 04:27:36 +00:00
goto out ;
2018-12-05 20:02:29 -08:00
}
2006-08-07 17:55:40 -07:00
}
2014-11-10 15:59:36 -08:00
tbl = neigh_find_table ( ndm - > ndm_family ) ;
if ( tbl = = NULL )
return - EAFNOSUPPORT ;
2005-04-16 15:20:36 -07:00
2018-12-05 20:02:29 -08:00
if ( nla_len ( tb [ NDA_DST ] ) < ( int ) tbl - > key_len ) {
NL_SET_ERR_MSG ( extack , " Invalid network address " ) ;
2014-11-10 15:59:36 -08:00
goto out ;
2018-12-05 20:02:29 -08:00
}
2014-11-10 15:59:36 -08:00
dst = nla_data ( tb [ NDA_DST ] ) ;
lladdr = tb [ NDA_LLADDR ] ? nla_data ( tb [ NDA_LLADDR ] ) : NULL ;
2005-04-16 15:20:36 -07:00
2018-12-19 20:02:36 -08:00
if ( tb [ NDA_PROTOCOL ] )
2018-12-15 14:09:06 -08:00
protocol = nla_get_u8 ( tb [ NDA_PROTOCOL ] ) ;
2021-10-11 14:12:37 +02:00
if ( ndm_flags & NTF_PROXY ) {
2014-11-10 15:59:36 -08:00
struct pneigh_entry * pn ;
2021-10-11 14:12:38 +02:00
if ( ndm_flags & NTF_MANAGED ) {
NL_SET_ERR_MSG ( extack , " Invalid NTF_* flag combination " ) ;
goto out ;
}
2014-11-10 15:59:36 -08:00
err = - ENOBUFS ;
pn = pneigh_lookup ( tbl , net , dst , dev , 1 ) ;
if ( pn ) {
2021-10-11 14:12:37 +02:00
pn - > flags = ndm_flags ;
2018-12-15 14:09:06 -08:00
if ( protocol )
pn - > protocol = protocol ;
2014-11-10 15:59:36 -08:00
err = 0 ;
}
goto out ;
}
2005-04-16 15:20:36 -07:00
2018-12-05 20:02:29 -08:00
if ( ! dev ) {
NL_SET_ERR_MSG ( extack , " Device not specified " ) ;
2014-11-10 15:59:36 -08:00
goto out ;
2018-12-05 20:02:29 -08:00
}
2006-09-22 14:43:19 -07:00
2019-04-16 17:31:43 -07:00
if ( tbl - > allow_add & & ! tbl - > allow_add ( dev , extack ) ) {
err = - EINVAL ;
goto out ;
}
2014-11-10 15:59:36 -08:00
neigh = neigh_lookup ( tbl , dst , dev ) ;
if ( neigh = = NULL ) {
2021-10-13 15:21:40 +02:00
bool ndm_permanent = ndm - > ndm_state & NUD_PERMANENT ;
bool exempt_from_gc = ndm_permanent | |
ndm_flags & NTF_EXT_LEARNED ;
2018-12-11 18:57:25 -07:00
2014-11-10 15:59:36 -08:00
if ( ! ( nlh - > nlmsg_flags & NLM_F_CREATE ) ) {
err = - ENOENT ;
2010-10-04 04:27:36 +00:00
goto out ;
2005-04-16 15:20:36 -07:00
}
2021-10-13 15:21:40 +02:00
if ( ndm_permanent & & ( ndm_flags & NTF_MANAGED ) ) {
NL_SET_ERR_MSG ( extack , " Invalid NTF_* flag for permanent entry " ) ;
err = - EINVAL ;
goto out ;
}
2005-04-16 15:20:36 -07:00
2021-10-11 14:12:35 +02:00
neigh = ___neigh_create ( tbl , dst , dev ,
2021-10-11 14:12:38 +02:00
ndm_flags &
( NTF_EXT_LEARNED | NTF_MANAGED ) ,
2021-10-11 14:12:35 +02:00
exempt_from_gc , true ) ;
2014-11-10 15:59:36 -08:00
if ( IS_ERR ( neigh ) ) {
err = PTR_ERR ( neigh ) ;
goto out ;
}
} else {
if ( nlh - > nlmsg_flags & NLM_F_EXCL ) {
err = - EEXIST ;
neigh_release ( neigh ) ;
2010-10-04 04:27:36 +00:00
goto out ;
2006-08-07 17:55:40 -07:00
}
2005-04-16 15:20:36 -07:00
2014-11-10 15:59:36 -08:00
if ( ! ( nlh - > nlmsg_flags & NLM_F_REPLACE ) )
2018-09-22 21:26:19 -07:00
flags & = ~ ( NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_OVERRIDE_ISROUTER ) ;
2005-04-16 15:20:36 -07:00
}
2020-05-01 21:34:18 -04:00
if ( protocol )
neigh - > protocol = protocol ;
2021-10-11 14:12:37 +02:00
if ( ndm_flags & NTF_EXT_LEARNED )
2018-04-24 13:49:34 -07:00
flags | = NEIGH_UPDATE_F_EXT_LEARNED ;
2021-10-11 14:12:37 +02:00
if ( ndm_flags & NTF_ROUTER )
2018-09-22 21:26:19 -07:00
flags | = NEIGH_UPDATE_F_ISROUTER ;
2021-10-11 14:12:38 +02:00
if ( ndm_flags & NTF_MANAGED )
flags | = NEIGH_UPDATE_F_MANAGED ;
2021-10-11 14:12:37 +02:00
if ( ndm_flags & NTF_USE )
net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
Currently, it is not possible to migrate a neighbor entry between NUD_PERMANENT
state and NTF_USE flag with a dynamic NUD state from a user space control plane.
Similarly, it is not possible to add/remove NTF_EXT_LEARNED flag from an existing
neighbor entry in combination with NTF_USE flag.
This is due to the latter directly calling into neigh_event_send() without any
of the metadata updates that happen in __neigh_update(). Thus, to enable this use
case, extend __neigh_update() with a NEIGH_UPDATE_F_USE flag where we break the
NUD_PERMANENT state in particular so that a later neigh_event_send() is able
to re-resolve a neighbor entry.
Before fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
As can be seen, despite the admin-triggered replace, the entry remains in the
NUD_PERMANENT state.
After fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
After the fix, the admin-triggered replace switches to a dynamic state from
the NTF_USE flag which triggered a new neighbor resolution. Likewise, we can
transition back from there, if needed, into NUD_PERMANENT.
Similar before/after behavior can be observed for below transitions:
Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[..]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-11 14:12:36 +02:00
flags | = NEIGH_UPDATE_F_USE ;
2018-09-22 21:26:19 -07:00
net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
Currently, it is not possible to migrate a neighbor entry between NUD_PERMANENT
state and NTF_USE flag with a dynamic NUD state from a user space control plane.
Similarly, it is not possible to add/remove NTF_EXT_LEARNED flag from an existing
neighbor entry in combination with NTF_USE flag.
This is due to the latter directly calling into neigh_event_send() without any
of the metadata updates that happen in __neigh_update(). Thus, to enable this use
case, extend __neigh_update() with a NEIGH_UPDATE_F_USE flag where we break the
NUD_PERMANENT state in particular so that a later neigh_event_send() is able
to re-resolve a neighbor entry.
Before fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
As can be seen, despite the admin-triggered replace, the entry remains in the
NUD_PERMANENT state.
After fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
After the fix, the admin-triggered replace switches to a dynamic state from
the NTF_USE flag which triggered a new neighbor resolution. Likewise, we can
transition back from there, if needed, into NUD_PERMANENT.
Similar before/after behavior can be observed for below transitions:
Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[..]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-11 14:12:36 +02:00
err = __neigh_update ( neigh , lladdr , ndm - > ndm_state , flags ,
NETLINK_CB ( skb ) . portid , extack ) ;
2021-10-11 14:12:38 +02:00
if ( ! err & & ndm_flags & ( NTF_USE | NTF_MANAGED ) ) {
2014-11-10 15:59:36 -08:00
neigh_event_send ( neigh , NULL ) ;
err = 0 ;
net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
Currently, it is not possible to migrate a neighbor entry between NUD_PERMANENT
state and NTF_USE flag with a dynamic NUD state from a user space control plane.
Similarly, it is not possible to add/remove NTF_EXT_LEARNED flag from an existing
neighbor entry in combination with NTF_USE flag.
This is due to the latter directly calling into neigh_event_send() without any
of the metadata updates that happen in __neigh_update(). Thus, to enable this use
case, extend __neigh_update() with a NEIGH_UPDATE_F_USE flag where we break the
NUD_PERMANENT state in particular so that a later neigh_event_send() is able
to re-resolve a neighbor entry.
Before fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
As can be seen, despite the admin-triggered replace, the entry remains in the
NUD_PERMANENT state.
After fix, NUD_PERMANENT -> NUD_* & NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT
[...]
After the fix, the admin-triggered replace switches to a dynamic state from
the NTF_USE flag which triggered a new neighbor resolution. Likewise, we can
transition back from there, if needed, into NUD_PERMANENT.
Similar before/after behavior can be observed for below transitions:
Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE:
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE
[...]
# ./ip/ip n replace 192.168.178.30 dev enp5s0 use
# ./ip/ip n
192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE
[..]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-11 14:12:36 +02:00
}
2014-11-10 15:59:36 -08:00
neigh_release ( neigh ) ;
2005-04-16 15:20:36 -07:00
out :
return err ;
}
2005-06-18 22:50:55 -07:00
static int neightbl_fill_parms ( struct sk_buff * skb , struct neigh_parms * parms )
{
2006-08-07 18:00:18 -07:00
struct nlattr * nest ;
2019-04-26 11:13:06 +02:00
nest = nla_nest_start_noflag ( skb , NDTA_PARMS ) ;
2006-08-07 18:00:18 -07:00
if ( nest = = NULL )
return - ENOBUFS ;
2005-06-18 22:50:55 -07:00
2012-04-01 20:06:28 -04:00
if ( ( parms - > dev & &
nla_put_u32 ( skb , NDTPA_IFINDEX , parms - > dev - > ifindex ) ) | |
2017-06-30 13:07:56 +03:00
nla_put_u32 ( skb , NDTPA_REFCNT , refcount_read ( & parms - > refcnt ) ) | |
2013-12-07 19:26:53 +01:00
nla_put_u32 ( skb , NDTPA_QUEUE_LENBYTES ,
NEIGH_VAR ( parms , QUEUE_LEN_BYTES ) ) | |
2012-04-01 20:06:28 -04:00
/* approximative value for deprecated QUEUE_LEN (in packets) */
nla_put_u32 ( skb , NDTPA_QUEUE_LEN ,
2013-12-07 19:26:53 +01:00
NEIGH_VAR ( parms , QUEUE_LEN_BYTES ) / SKB_TRUESIZE ( ETH_FRAME_LEN ) ) | |
nla_put_u32 ( skb , NDTPA_PROXY_QLEN , NEIGH_VAR ( parms , PROXY_QLEN ) ) | |
nla_put_u32 ( skb , NDTPA_APP_PROBES , NEIGH_VAR ( parms , APP_PROBES ) ) | |
nla_put_u32 ( skb , NDTPA_UCAST_PROBES ,
NEIGH_VAR ( parms , UCAST_PROBES ) ) | |
nla_put_u32 ( skb , NDTPA_MCAST_PROBES ,
NEIGH_VAR ( parms , MCAST_PROBES ) ) | |
2015-03-19 22:41:46 +09:00
nla_put_u32 ( skb , NDTPA_MCAST_REPROBES ,
NEIGH_VAR ( parms , MCAST_REPROBES ) ) | |
2016-04-22 17:31:21 +02:00
nla_put_msecs ( skb , NDTPA_REACHABLE_TIME , parms - > reachable_time ,
NDTPA_PAD ) | |
2012-04-01 20:06:28 -04:00
nla_put_msecs ( skb , NDTPA_BASE_REACHABLE_TIME ,
2016-04-22 17:31:21 +02:00
NEIGH_VAR ( parms , BASE_REACHABLE_TIME ) , NDTPA_PAD ) | |
2013-12-07 19:26:53 +01:00
nla_put_msecs ( skb , NDTPA_GC_STALETIME ,
2016-04-22 17:31:21 +02:00
NEIGH_VAR ( parms , GC_STALETIME ) , NDTPA_PAD ) | |
2012-04-01 20:06:28 -04:00
nla_put_msecs ( skb , NDTPA_DELAY_PROBE_TIME ,
2016-04-22 17:31:21 +02:00
NEIGH_VAR ( parms , DELAY_PROBE_TIME ) , NDTPA_PAD ) | |
2013-12-07 19:26:53 +01:00
nla_put_msecs ( skb , NDTPA_RETRANS_TIME ,
2016-04-22 17:31:21 +02:00
NEIGH_VAR ( parms , RETRANS_TIME ) , NDTPA_PAD ) | |
2013-12-07 19:26:53 +01:00
nla_put_msecs ( skb , NDTPA_ANYCAST_DELAY ,
2016-04-22 17:31:21 +02:00
NEIGH_VAR ( parms , ANYCAST_DELAY ) , NDTPA_PAD ) | |
2013-12-07 19:26:53 +01:00
nla_put_msecs ( skb , NDTPA_PROXY_DELAY ,
2016-04-22 17:31:21 +02:00
NEIGH_VAR ( parms , PROXY_DELAY ) , NDTPA_PAD ) | |
2013-12-07 19:26:53 +01:00
nla_put_msecs ( skb , NDTPA_LOCKTIME ,
2022-06-29 08:48:32 +00:00
NEIGH_VAR ( parms , LOCKTIME ) , NDTPA_PAD ) | |
nla_put_msecs ( skb , NDTPA_INTERVAL_PROBE_TIME_MS ,
NEIGH_VAR ( parms , INTERVAL_PROBE_TIME_MS ) , NDTPA_PAD ) )
2012-04-01 20:06:28 -04:00
goto nla_put_failure ;
2006-08-07 18:00:18 -07:00
return nla_nest_end ( skb , nest ) ;
2005-06-18 22:50:55 -07:00
2006-08-07 18:00:18 -07:00
nla_put_failure :
2008-06-03 16:36:54 -07:00
nla_nest_cancel ( skb , nest ) ;
return - EMSGSIZE ;
2005-06-18 22:50:55 -07:00
}
2006-08-07 18:00:18 -07:00
/*
 * Build one netlink message of @type describing the whole table @tbl:
 * name, GC interval and thresholds, an NDTA_CONFIG blob, the statistics
 * summed over all possible CPUs, and the table's default parameters.
 *
 * The attributes are filled in with tbl->lock read-held.
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header cannot be added,
 * or when any attribute fails, in which case the message is cancelled.
 */
static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
			      u32 pid, u32 seq, int type, int flags)
{
	struct neigh_hash_table *hash;
	struct neigh_statistics *percpu;
	struct ndt_config cfg;
	struct ndt_stats totals;
	struct nlmsghdr *nlh;
	struct ndtmsg *hdr;
	unsigned long now;
	long since_flush, since_rand;
	bool filled = false;
	int cpu;

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*hdr), flags);
	if (!nlh)
		return -EMSGSIZE;

	hdr = nlmsg_data(nlh);

	read_lock_bh(&tbl->lock);
	hdr->ndtm_family = tbl->family;
	hdr->ndtm_pad1 = 0;
	hdr->ndtm_pad2 = 0;

	if (nla_put_string(skb, NDTA_NAME, tbl->id))
		goto unlock;
	if (nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD))
		goto unlock;
	if (nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1))
		goto unlock;
	if (nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2))
		goto unlock;
	if (nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
		goto unlock;

	/* Table configuration; the "last" fields are ages relative to now. */
	now = jiffies;
	since_flush = now - tbl->last_flush;
	since_rand = now - tbl->last_rand;
	cfg = (struct ndt_config) {
		.ndtc_key_len		= tbl->key_len,
		.ndtc_entry_size	= tbl->entry_size,
		.ndtc_entries		= atomic_read(&tbl->entries),
		.ndtc_last_flush	= jiffies_to_msecs(since_flush),
		.ndtc_last_rand		= jiffies_to_msecs(since_rand),
		.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
	};

	/* The hash table pointer is RCU-protected. */
	rcu_read_lock();
	hash = rcu_dereference(tbl->nht);
	cfg.ndtc_hash_rnd = hash->hash_rnd[0];
	cfg.ndtc_hash_mask = ((1 << hash->hash_shift) - 1);
	rcu_read_unlock();

	if (nla_put(skb, NDTA_CONFIG, sizeof(cfg), &cfg))
		goto unlock;

	/* Sum the per-CPU counters into a single statistics record. */
	memset(&totals, 0, sizeof(totals));
	for_each_possible_cpu(cpu) {
		percpu = per_cpu_ptr(tbl->stats, cpu);

		totals.ndts_allocs		+= percpu->allocs;
		totals.ndts_destroys		+= percpu->destroys;
		totals.ndts_hash_grows		+= percpu->hash_grows;
		totals.ndts_res_failed		+= percpu->res_failed;
		totals.ndts_lookups		+= percpu->lookups;
		totals.ndts_hits		+= percpu->hits;
		totals.ndts_rcv_probes_mcast	+= percpu->rcv_probes_mcast;
		totals.ndts_rcv_probes_ucast	+= percpu->rcv_probes_ucast;
		totals.ndts_periodic_gc_runs	+= percpu->periodic_gc_runs;
		totals.ndts_forced_gc_runs	+= percpu->forced_gc_runs;
		totals.ndts_table_fulls		+= percpu->table_fulls;
	}

	if (nla_put_64bit(skb, NDTA_STATS, sizeof(totals), &totals, NDTA_PAD))
		goto unlock;

	/* The table's own default parameters must not have a device set. */
	BUG_ON(tbl->parms.dev);
	if (neightbl_fill_parms(skb, &tbl->parms) < 0)
		goto unlock;

	filled = true;

unlock:
	read_unlock_bh(&tbl->lock);
	if (!filled) {
		nlmsg_cancel(skb, nlh);
		return -EMSGSIZE;
	}

	nlmsg_end(skb, nlh);
	return 0;
}
2006-08-07 18:00:18 -07:00
/*
 * Build one netlink message of @type that carries only the name of @tbl
 * and the parameter set @parms.
 *
 * The attributes are filled in with tbl->lock read-held.
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header cannot be added,
 * or when an attribute fails, in which case the message is cancelled.
 */
static int neightbl_fill_param_info(struct sk_buff *skb,
				    struct neigh_table *tbl,
				    struct neigh_parms *parms,
				    u32 pid, u32 seq, int type,
				    unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct ndtmsg *hdr;
	bool filled;

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*hdr), flags);
	if (!nlh)
		return -EMSGSIZE;

	hdr = nlmsg_data(nlh);

	read_lock_bh(&tbl->lock);
	hdr->ndtm_family = tbl->family;
	hdr->ndtm_pad1 = 0;
	hdr->ndtm_pad2 = 0;

	/* The parameters are only added once the name has gone in. */
	filled = !(nla_put_string(skb, NDTA_NAME, tbl->id) < 0) &&
		 !(neightbl_fill_parms(skb, parms) < 0);
	read_unlock_bh(&tbl->lock);

	if (!filled) {
		nlmsg_cancel(skb, nlh);
		return -EMSGSIZE;
	}

	nlmsg_end(skb, nlh);
	return 0;
}
2007-02-09 23:24:36 +09:00
2007-06-05 12:38:30 -07:00
static const struct nla_policy nl_neightbl_policy [ NDTA_MAX + 1 ] = {
2006-08-07 17:58:53 -07:00
[ NDTA_NAME ] = { . type = NLA_STRING } ,
[ NDTA_THRESH1 ] = { . type = NLA_U32 } ,
[ NDTA_THRESH2 ] = { . type = NLA_U32 } ,
[ NDTA_THRESH3 ] = { . type = NLA_U32 } ,
[ NDTA_GC_INTERVAL ] = { . type = NLA_U64 } ,
[ NDTA_PARMS ] = { . type = NLA_NESTED } ,
} ;
2007-06-05 12:38:30 -07:00
static const struct nla_policy nl_ntbl_parm_policy [ NDTPA_MAX + 1 ] = {
2006-08-07 17:58:53 -07:00
[ NDTPA_IFINDEX ] = { . type = NLA_U32 } ,
[ NDTPA_QUEUE_LEN ] = { . type = NLA_U32 } ,
[ NDTPA_PROXY_QLEN ] = { . type = NLA_U32 } ,
[ NDTPA_APP_PROBES ] = { . type = NLA_U32 } ,
[ NDTPA_UCAST_PROBES ] = { . type = NLA_U32 } ,
[ NDTPA_MCAST_PROBES ] = { . type = NLA_U32 } ,
2015-03-19 22:41:46 +09:00
[ NDTPA_MCAST_REPROBES ] = { . type = NLA_U32 } ,
2006-08-07 17:58:53 -07:00
[ NDTPA_BASE_REACHABLE_TIME ] = { . type = NLA_U64 } ,
[ NDTPA_GC_STALETIME ] = { . type = NLA_U64 } ,
[ NDTPA_DELAY_PROBE_TIME ] = { . type = NLA_U64 } ,
[ NDTPA_RETRANS_TIME ] = { . type = NLA_U64 } ,
[ NDTPA_ANYCAST_DELAY ] = { . type = NLA_U64 } ,
[ NDTPA_PROXY_DELAY ] = { . type = NLA_U64 } ,
[ NDTPA_LOCKTIME ] = { . type = NLA_U64 } ,
2022-06-29 08:48:32 +00:00
[ NDTPA_INTERVAL_PROBE_TIME_MS ] = { . type = NLA_U64 , . min = 1 } ,
2006-08-07 17:58:53 -07:00
} ;
2017-04-16 09:48:24 -07:00
static int neightbl_set ( struct sk_buff * skb , struct nlmsghdr * nlh ,
struct netlink_ext_ack * extack )
2005-06-18 22:50:55 -07:00
{
2008-03-26 02:26:21 +09:00
struct net * net = sock_net ( skb - > sk ) ;
2005-06-18 22:50:55 -07:00
struct neigh_table * tbl ;
2006-08-07 17:58:53 -07:00
struct ndtmsg * ndtmsg ;
struct nlattr * tb [ NDTA_MAX + 1 ] ;
2014-11-10 15:59:36 -08:00
bool found = false ;
int err , tidx ;
2005-06-18 22:50:55 -07:00
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 14:07:28 +02:00
err = nlmsg_parse_deprecated ( nlh , sizeof ( * ndtmsg ) , tb , NDTA_MAX ,
nl_neightbl_policy , extack ) ;
2006-08-07 17:58:53 -07:00
if ( err < 0 )
goto errout ;
2005-06-18 22:50:55 -07:00
2006-08-07 17:58:53 -07:00
if ( tb [ NDTA_NAME ] = = NULL ) {
err = - EINVAL ;
goto errout ;
}
ndtmsg = nlmsg_data ( nlh ) ;
2014-11-10 15:59:36 -08:00
for ( tidx = 0 ; tidx < NEIGH_NR_TABLES ; tidx + + ) {
tbl = neigh_tables [ tidx ] ;
if ( ! tbl )
continue ;
2005-06-18 22:50:55 -07:00
if ( ndtmsg - > ndtm_family & & tbl - > family ! = ndtmsg - > ndtm_family )
continue ;
2014-11-10 15:59:36 -08:00
if ( nla_strcmp ( tb [ NDTA_NAME ] , tbl - > id ) = = 0 ) {
found = true ;
2005-06-18 22:50:55 -07:00
break ;
2014-11-10 15:59:36 -08:00
}
2005-06-18 22:50:55 -07:00
}
2014-11-10 15:59:36 -08:00
if ( ! found )
return - ENOENT ;
2005-06-18 22:50:55 -07:00
2007-02-09 23:24:36 +09:00
/*
2005-06-18 22:50:55 -07:00
* We acquire tbl - > lock to be nice to the periodic timers and
* make sure they always see a consistent set of values .
*/
write_lock_bh ( & tbl - > lock ) ;
2006-08-07 17:58:53 -07:00
if ( tb [ NDTA_PARMS ] ) {
struct nlattr * tbp [ NDTPA_MAX + 1 ] ;
2005-06-18 22:50:55 -07:00
struct neigh_parms * p ;
2006-08-07 17:58:53 -07:00
int i , ifindex = 0 ;
2005-06-18 22:50:55 -07:00
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 14:07:28 +02:00
err = nla_parse_nested_deprecated ( tbp , NDTPA_MAX ,
tb [ NDTA_PARMS ] ,
nl_ntbl_parm_policy , extack ) ;
2006-08-07 17:58:53 -07:00
if ( err < 0 )
goto errout_tbl_lock ;
2005-06-18 22:50:55 -07:00
2006-08-07 17:58:53 -07:00
if ( tbp [ NDTPA_IFINDEX ] )
ifindex = nla_get_u32 ( tbp [ NDTPA_IFINDEX ] ) ;
2005-06-18 22:50:55 -07:00
2009-07-13 11:17:49 -07:00
p = lookup_neigh_parms ( tbl , net , ifindex ) ;
2005-06-18 22:50:55 -07:00
if ( p = = NULL ) {
err = - ENOENT ;
2006-08-07 17:58:53 -07:00
goto errout_tbl_lock ;
2005-06-18 22:50:55 -07:00
}
2006-08-07 17:58:53 -07:00
for ( i = 1 ; i < = NDTPA_MAX ; i + + ) {
if ( tbp [ i ] = = NULL )
continue ;
2005-06-18 22:50:55 -07:00
2006-08-07 17:58:53 -07:00
switch ( i ) {
case NDTPA_QUEUE_LEN :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , QUEUE_LEN_BYTES ,
nla_get_u32 ( tbp [ i ] ) *
SKB_TRUESIZE ( ETH_FRAME_LEN ) ) ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
break ;
case NDTPA_QUEUE_LENBYTES :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , QUEUE_LEN_BYTES ,
nla_get_u32 ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_PROXY_QLEN :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , PROXY_QLEN ,
nla_get_u32 ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_APP_PROBES :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , APP_PROBES ,
nla_get_u32 ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_UCAST_PROBES :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , UCAST_PROBES ,
nla_get_u32 ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_MCAST_PROBES :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , MCAST_PROBES ,
nla_get_u32 ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
2015-03-19 22:41:46 +09:00
case NDTPA_MCAST_REPROBES :
NEIGH_VAR_SET ( p , MCAST_REPROBES ,
nla_get_u32 ( tbp [ i ] ) ) ;
break ;
2006-08-07 17:58:53 -07:00
case NDTPA_BASE_REACHABLE_TIME :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , BASE_REACHABLE_TIME ,
nla_get_msecs ( tbp [ i ] ) ) ;
2015-01-14 04:22:39 +01:00
/* update reachable_time as well, otherwise, the change will
* only be effective after the next time neigh_periodic_work
* decides to recompute it ( can be multiple minutes )
*/
p - > reachable_time =
neigh_rand_reach_time ( NEIGH_VAR ( p , BASE_REACHABLE_TIME ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_GC_STALETIME :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , GC_STALETIME ,
nla_get_msecs ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_DELAY_PROBE_TIME :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , DELAY_PROBE_TIME ,
nla_get_msecs ( tbp [ i ] ) ) ;
2016-07-05 11:27:42 +02:00
call_netevent_notifiers ( NETEVENT_DELAY_PROBE_TIME_UPDATE , p ) ;
2006-08-07 17:58:53 -07:00
break ;
2022-06-29 08:48:32 +00:00
case NDTPA_INTERVAL_PROBE_TIME_MS :
NEIGH_VAR_SET ( p , INTERVAL_PROBE_TIME_MS ,
nla_get_msecs ( tbp [ i ] ) ) ;
break ;
2006-08-07 17:58:53 -07:00
case NDTPA_RETRANS_TIME :
2013-12-07 19:26:53 +01:00
NEIGH_VAR_SET ( p , RETRANS_TIME ,
nla_get_msecs ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_ANYCAST_DELAY :
2014-01-14 15:46:07 +01:00
NEIGH_VAR_SET ( p , ANYCAST_DELAY ,
nla_get_msecs ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_PROXY_DELAY :
2014-01-14 15:46:07 +01:00
NEIGH_VAR_SET ( p , PROXY_DELAY ,
nla_get_msecs ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
case NDTPA_LOCKTIME :
2014-01-14 15:46:07 +01:00
NEIGH_VAR_SET ( p , LOCKTIME ,
nla_get_msecs ( tbp [ i ] ) ) ;
2006-08-07 17:58:53 -07:00
break ;
}
}
}
2005-06-18 22:50:55 -07:00
2013-06-20 10:01:34 +08:00
err = - ENOENT ;
if ( ( tb [ NDTA_THRESH1 ] | | tb [ NDTA_THRESH2 ] | |
tb [ NDTA_THRESH3 ] | | tb [ NDTA_GC_INTERVAL ] ) & &
! net_eq ( net , & init_net ) )
goto errout_tbl_lock ;
2006-08-07 17:58:53 -07:00
if ( tb [ NDTA_THRESH1 ] )
tbl - > gc_thresh1 = nla_get_u32 ( tb [ NDTA_THRESH1 ] ) ;
2005-06-18 22:50:55 -07:00
2006-08-07 17:58:53 -07:00
if ( tb [ NDTA_THRESH2 ] )
tbl - > gc_thresh2 = nla_get_u32 ( tb [ NDTA_THRESH2 ] ) ;
2005-06-18 22:50:55 -07:00
2006-08-07 17:58:53 -07:00
if ( tb [ NDTA_THRESH3 ] )
tbl - > gc_thresh3 = nla_get_u32 ( tb [ NDTA_THRESH3 ] ) ;
2005-06-18 22:50:55 -07:00
2006-08-07 17:58:53 -07:00
if ( tb [ NDTA_GC_INTERVAL ] )
tbl - > gc_interval = nla_get_msecs ( tb [ NDTA_GC_INTERVAL ] ) ;
2005-06-18 22:50:55 -07:00
err = 0 ;
2006-08-07 17:58:53 -07:00
errout_tbl_lock :
2005-06-18 22:50:55 -07:00
write_unlock_bh ( & tbl - > lock ) ;
2006-08-07 17:58:53 -07:00
errout :
2005-06-18 22:50:55 -07:00
return err ;
}
2018-10-07 20:16:37 -07:00
static int neightbl_valid_dump_info ( const struct nlmsghdr * nlh ,
struct netlink_ext_ack * extack )
{
struct ndtmsg * ndtm ;
if ( nlh - > nlmsg_len < nlmsg_msg_size ( sizeof ( * ndtm ) ) ) {
NL_SET_ERR_MSG ( extack , " Invalid header for neighbor table dump request " ) ;
return - EINVAL ;
}
ndtm = nlmsg_data ( nlh ) ;
if ( ndtm - > ndtm_pad1 | | ndtm - > ndtm_pad2 ) {
NL_SET_ERR_MSG ( extack , " Invalid values in header for neighbor table dump request " ) ;
return - EINVAL ;
}
if ( nlmsg_attrlen ( nlh , sizeof ( * ndtm ) ) ) {
NL_SET_ERR_MSG ( extack , " Invalid data after header in neighbor table dump request " ) ;
return - EINVAL ;
}
return 0 ;
}
2007-03-22 11:50:06 -07:00
static int neightbl_dump_info ( struct sk_buff * skb , struct netlink_callback * cb )
2005-06-18 22:50:55 -07:00
{
2018-10-07 20:16:37 -07:00
const struct nlmsghdr * nlh = cb - > nlh ;
2008-03-26 02:26:21 +09:00
struct net * net = sock_net ( skb - > sk ) ;
2006-08-07 18:00:18 -07:00
int family , tidx , nidx = 0 ;
int tbl_skip = cb - > args [ 0 ] ;
int neigh_skip = cb - > args [ 1 ] ;
2005-06-18 22:50:55 -07:00
struct neigh_table * tbl ;
2018-10-07 20:16:37 -07:00
if ( cb - > strict_check ) {
int err = neightbl_valid_dump_info ( nlh , cb - > extack ) ;
if ( err < 0 )
return err ;
}
family = ( ( struct rtgenmsg * ) nlmsg_data ( nlh ) ) - > rtgen_family ;
2005-06-18 22:50:55 -07:00
2014-11-10 15:59:36 -08:00
for ( tidx = 0 ; tidx < NEIGH_NR_TABLES ; tidx + + ) {
2005-06-18 22:50:55 -07:00
struct neigh_parms * p ;
2014-11-10 15:59:36 -08:00
tbl = neigh_tables [ tidx ] ;
if ( ! tbl )
continue ;
2006-08-07 18:00:18 -07:00
if ( tidx < tbl_skip | | ( family & & tbl - > family ! = family ) )
2005-06-18 22:50:55 -07:00
continue ;
2012-09-07 20:12:54 +00:00
if ( neightbl_fill_info ( skb , tbl , NETLINK_CB ( cb - > skb ) . portid ,
2018-10-07 20:16:37 -07:00
nlh - > nlmsg_seq , RTM_NEWNEIGHTBL ,
2015-01-18 23:36:08 -05:00
NLM_F_MULTI ) < 0 )
2005-06-18 22:50:55 -07:00
break ;
2014-10-29 19:29:31 +01:00
nidx = 0 ;
p = list_next_entry ( & tbl - > parms , list ) ;
list_for_each_entry_from ( p , & tbl - > parms_list , list ) {
2008-03-26 03:57:35 +09:00
if ( ! net_eq ( neigh_parms_net ( p ) , net ) )
2008-01-24 00:13:18 -08:00
continue ;
2009-02-06 00:52:04 -08:00
if ( nidx < neigh_skip )
goto next ;
2005-06-18 22:50:55 -07:00
2006-08-07 18:00:18 -07:00
if ( neightbl_fill_param_info ( skb , tbl , p ,
2012-09-07 20:12:54 +00:00
NETLINK_CB ( cb - > skb ) . portid ,
2018-10-07 20:16:37 -07:00
nlh - > nlmsg_seq ,
2006-08-07 18:00:18 -07:00
RTM_NEWNEIGHTBL ,
2015-01-18 23:36:08 -05:00
NLM_F_MULTI ) < 0 )
2005-06-18 22:50:55 -07:00
goto out ;
2009-02-06 00:52:04 -08:00
next :
nidx + + ;
2005-06-18 22:50:55 -07:00
}
2006-08-07 18:00:18 -07:00
neigh_skip = 0 ;
2005-06-18 22:50:55 -07:00
}
out :
2006-08-07 18:00:18 -07:00
cb - > args [ 0 ] = tidx ;
cb - > args [ 1 ] = nidx ;
2005-06-18 22:50:55 -07:00
return skb - > len ;
}
2005-04-16 15:20:36 -07:00
2006-08-07 17:56:37 -07:00
/*
 * Append one netlink message describing @neigh to @skb.
 *
 * The message consists of a struct ndmsg header followed by NDA_DST,
 * NDA_LLADDR (only when the entry is in a NUD_VALID state), NDA_PROBES,
 * NDA_CACHEINFO and, when non-zero, NDA_PROTOCOL and NDA_FLAGS_EXT.
 *
 * The state, link-layer address and cache timing fields are sampled while
 * holding neigh->lock for reading.
 *
 * Returns 0 on success. If the header or any attribute cannot be added the
 * partially built message is cancelled and -EMSGSIZE is returned.
 */
static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
			   u32 pid, u32 seq, int type, unsigned int flags)
{
	const unsigned long now = jiffies;
	u32 ext_flags, old_flags;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Low bits travel in the header, the rest in NDA_FLAGS_EXT. */
	ext_flags = neigh->flags >> NTF_EXT_SHIFT;
	old_flags = neigh->flags & NTF_OLD_MASK;

	ndm = nlmsg_data(nlh);
	ndm->ndm_family = neigh->ops->family;
	ndm->ndm_pad1 = 0;
	ndm->ndm_pad2 = 0;
	ndm->ndm_flags = old_flags;
	ndm->ndm_type = neigh->type;
	ndm->ndm_ifindex = neigh->dev->ifindex;

	if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
		goto cancel;

	read_lock_bh(&neigh->lock);

	ndm->ndm_state = neigh->nud_state;
	if (neigh->nud_state & NUD_VALID) {
		char haddr[MAX_ADDR_LEN];

		neigh_ha_snapshot(haddr, neigh, neigh->dev);
		if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0)
			goto unlock_cancel;
	}

	/* Timestamps are reported as ages relative to @now. */
	ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
	ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
	ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
	ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;

	read_unlock_bh(&neigh->lock);

	if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)))
		goto cancel;
	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto cancel;

	if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol))
		goto cancel;
	if (ext_flags && nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags))
		goto cancel;

	nlmsg_end(skb, nlh);
	return 0;

unlock_cancel:
	read_unlock_bh(&neigh->lock);
cancel:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2012-01-26 22:28:58 +00:00
/*
 * Append one netlink message describing proxy entry @pn of table @tbl.
 *
 * The header always carries NTF_PROXY, type RTN_UNICAST and state NUD_NONE;
 * the interface index is 0 when the entry is not bound to a device.
 * NDA_DST is always present, NDA_PROTOCOL and NDA_FLAGS_EXT only when their
 * values are non-zero.
 *
 * Returns 0 on success. If the header or any attribute cannot be added the
 * partially built message is cancelled and -EMSGSIZE is returned.
 */
static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
			    u32 pid, u32 seq, int type, unsigned int flags,
			    struct neigh_table *tbl)
{
	u32 ext_flags, old_flags;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Low bits travel in the header, the rest in NDA_FLAGS_EXT. */
	ext_flags = pn->flags >> NTF_EXT_SHIFT;
	old_flags = pn->flags & NTF_OLD_MASK;

	ndm = nlmsg_data(nlh);
	ndm->ndm_family = tbl->family;
	ndm->ndm_pad1 = 0;
	ndm->ndm_pad2 = 0;
	ndm->ndm_flags = old_flags | NTF_PROXY;
	ndm->ndm_type = RTN_UNICAST;
	if (pn->dev)
		ndm->ndm_ifindex = pn->dev->ifindex;
	else
		ndm->ndm_ifindex = 0;
	ndm->ndm_state = NUD_NONE;

	if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
		goto cancel;

	if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol))
		goto cancel;
	if (ext_flags && nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags))
		goto cancel;

	nlmsg_end(skb, nlh);
	return 0;

cancel:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2017-03-19 22:01:28 -07:00
/*
 * Announce an update of @neigh: first on the netevent notifier chain
 * (NETEVENT_NEIGH_UPDATE), then through __neigh_notify() as RTM_NEWNEIGH.
 *
 * NOTE(review): @nlmsg_pid is handed to __neigh_notify() unchanged and the
 * third argument is a literal 0 - presumably the netlink flags; confirm
 * against __neigh_notify()'s definition.
 */
static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid)
{
	/* In-kernel listeners are told before the netlink notification. */
	call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);

	__neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
}
2005-04-16 15:20:36 -07:00
2015-09-29 09:32:03 -07:00
static bool neigh_master_filtered ( struct net_device * dev , int master_idx )
{
struct net_device * master ;
if ( ! master_idx )
return false ;
2018-10-26 09:33:27 -07:00
master = dev ? netdev_master_upper_dev_get ( dev ) : NULL ;
2021-08-10 09:06:58 +00:00
/* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another
* invalid value for ifindex to denote " no master " .
*/
if ( master_idx = = - 1 )
return ! ! master ;
2015-09-29 09:32:03 -07:00
if ( ! master | | master - > ifindex ! = master_idx )
return true ;
return false ;
}
2015-10-03 11:43:46 -07:00
static bool neigh_ifindex_filtered ( struct net_device * dev , int filter_idx )
{
2018-10-26 09:33:27 -07:00
if ( filter_idx & & ( ! dev | | dev - > ifindex ! = filter_idx ) )
2015-10-03 11:43:46 -07:00
return true ;
return false ;
}
2018-10-03 15:33:12 -07:00
/*
 * Optional filters for a neighbour dump request, filled in by
 * neigh_valid_dump_req() from the request's attributes and consulted by
 * neigh_ifindex_filtered() / neigh_master_filtered() for every entry.
 * A zero field means the corresponding filter is not active.
 */
struct neigh_dump_filter {
/* from NDA_MASTER: ifindex of the required master device; -1 = no master */
int master_idx ;
/* from NDA_IFINDEX: ifindex of the required device */
int dev_idx ;
} ;
2005-04-16 15:20:36 -07:00
static int neigh_dump_table ( struct neigh_table * tbl , struct sk_buff * skb ,
2018-10-03 15:33:12 -07:00
struct netlink_callback * cb ,
struct neigh_dump_filter * filter )
2005-04-16 15:20:36 -07:00
{
2010-10-06 17:49:21 -07:00
struct net * net = sock_net ( skb - > sk ) ;
2005-04-16 15:20:36 -07:00
struct neighbour * n ;
int rc , h , s_h = cb - > args [ 1 ] ;
int idx , s_idx = idx = cb - > args [ 2 ] ;
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * nht ;
2015-09-29 09:32:03 -07:00
unsigned int flags = NLM_F_MULTI ;
2018-10-03 15:33:12 -07:00
if ( filter - > dev_idx | | filter - > master_idx )
flags | = NLM_F_DUMP_FILTERED ;
2005-04-16 15:20:36 -07:00
2023-03-21 04:01:14 +00:00
rcu_read_lock ( ) ;
nht = rcu_dereference ( tbl - > nht ) ;
2010-10-04 06:15:44 +00:00
2012-06-07 04:58:35 +00:00
for ( h = s_h ; h < ( 1 < < nht - > hash_shift ) ; h + + ) {
2005-04-16 15:20:36 -07:00
if ( h > s_h )
s_idx = 0 ;
2023-03-21 04:01:14 +00:00
for ( n = rcu_dereference ( nht - > hash_buckets [ h ] ) , idx = 0 ;
2010-10-06 17:49:21 -07:00
n ! = NULL ;
2023-03-21 04:01:14 +00:00
n = rcu_dereference ( n - > next ) ) {
2016-11-30 11:24:42 +08:00
if ( idx < s_idx | | ! net_eq ( dev_net ( n - > dev ) , net ) )
goto next ;
2018-10-03 15:33:12 -07:00
if ( neigh_ifindex_filtered ( n - > dev , filter - > dev_idx ) | |
neigh_master_filtered ( n - > dev , filter - > master_idx ) )
2009-02-06 00:52:04 -08:00
goto next ;
2012-09-07 20:12:54 +00:00
if ( neigh_fill_info ( skb , n , NETLINK_CB ( cb - > skb ) . portid ,
2005-04-16 15:20:36 -07:00
cb - > nlh - > nlmsg_seq ,
2005-06-18 22:54:12 -07:00
RTM_NEWNEIGH ,
2015-09-29 09:32:03 -07:00
flags ) < 0 ) {
2005-04-16 15:20:36 -07:00
rc = - 1 ;
goto out ;
}
2010-10-06 17:49:21 -07:00
next :
2009-02-06 00:52:04 -08:00
idx + + ;
2005-04-16 15:20:36 -07:00
}
}
rc = skb - > len ;
out :
2023-03-21 04:01:14 +00:00
rcu_read_unlock ( ) ;
2005-04-16 15:20:36 -07:00
cb - > args [ 1 ] = h ;
cb - > args [ 2 ] = idx ;
return rc ;
}
2012-01-26 22:28:58 +00:00
static int pneigh_dump_table ( struct neigh_table * tbl , struct sk_buff * skb ,
2018-10-03 15:33:12 -07:00
struct netlink_callback * cb ,
struct neigh_dump_filter * filter )
2012-01-26 22:28:58 +00:00
{
struct pneigh_entry * n ;
struct net * net = sock_net ( skb - > sk ) ;
int rc , h , s_h = cb - > args [ 3 ] ;
int idx , s_idx = idx = cb - > args [ 4 ] ;
2018-10-03 15:33:12 -07:00
unsigned int flags = NLM_F_MULTI ;
if ( filter - > dev_idx | | filter - > master_idx )
flags | = NLM_F_DUMP_FILTERED ;
2012-01-26 22:28:58 +00:00
read_lock_bh ( & tbl - > lock ) ;
2012-06-07 04:58:35 +00:00
for ( h = s_h ; h < = PNEIGH_HASHMASK ; h + + ) {
2012-01-26 22:28:58 +00:00
if ( h > s_h )
s_idx = 0 ;
for ( n = tbl - > phash_buckets [ h ] , idx = 0 ; n ; n = n - > next ) {
2016-11-30 11:24:42 +08:00
if ( idx < s_idx | | pneigh_net ( n ) ! = net )
2012-01-26 22:28:58 +00:00
goto next ;
2018-10-03 15:33:12 -07:00
if ( neigh_ifindex_filtered ( n - > dev , filter - > dev_idx ) | |
neigh_master_filtered ( n - > dev , filter - > master_idx ) )
goto next ;
2012-09-07 20:12:54 +00:00
if ( pneigh_fill_info ( skb , n , NETLINK_CB ( cb - > skb ) . portid ,
2012-01-26 22:28:58 +00:00
cb - > nlh - > nlmsg_seq ,
2018-10-03 15:33:12 -07:00
RTM_NEWNEIGH , flags , tbl ) < 0 ) {
2012-01-26 22:28:58 +00:00
read_unlock_bh ( & tbl - > lock ) ;
rc = - 1 ;
goto out ;
}
next :
idx + + ;
}
}
read_unlock_bh ( & tbl - > lock ) ;
rc = skb - > len ;
out :
cb - > args [ 3 ] = h ;
cb - > args [ 4 ] = idx ;
return rc ;
}
2018-10-07 20:16:36 -07:00
/*
 * Parse a neighbour dump request and extract its filters.
 *
 * With @strict_check the struct ndmsg header is validated first: it must be
 * fully present, the pad, ifindex, state and type fields must be zero, and
 * NTF_PROXY is the only flag allowed. Attributes are then parsed with
 * nlmsg_parse_deprecated_strict(); without @strict_check the header is not
 * inspected and nlmsg_parse_deprecated() is used.
 *
 * NDA_IFINDEX and NDA_MASTER are stored in @filter. Any other attribute is
 * rejected in strict mode and ignored otherwise.
 *
 * Returns 0 on success, the parser's error, or -EINVAL with a message in
 * @extack.
 */
static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
				bool strict_check,
				struct neigh_dump_filter *filter,
				struct netlink_ext_ack *extack)
{
	struct nlattr *tb[NDA_MAX + 1];
	int type;
	int err;

	if (!strict_check) {
		err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb,
					     NDA_MAX, nda_policy, extack);
	} else {
		const struct ndmsg *ndm;

		if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
			NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request");
			return -EINVAL;
		}

		ndm = nlmsg_data(nlh);
		if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
		    ndm->ndm_state || ndm->ndm_type) {
			NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
			return -EINVAL;
		}

		if (ndm->ndm_flags & ~NTF_PROXY) {
			NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request");
			return -EINVAL;
		}

		err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg),
						    tb, NDA_MAX, nda_policy,
						    extack);
	}

	if (err < 0)
		return err;

	for (type = 0; type <= NDA_MAX; ++type) {
		const struct nlattr *attr = tb[type];

		if (!attr)
			continue;

		/* all new attributes should require strict_check */
		if (type == NDA_IFINDEX) {
			filter->dev_idx = nla_get_u32(attr);
		} else if (type == NDA_MASTER) {
			filter->master_idx = nla_get_u32(attr);
		} else if (strict_check) {
			NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request");
			return -EINVAL;
		}
	}

	return 0;
}
2007-03-22 11:50:06 -07:00
static int neigh_dump_info ( struct sk_buff * skb , struct netlink_callback * cb )
2005-04-16 15:20:36 -07:00
{
2018-10-03 15:33:12 -07:00
const struct nlmsghdr * nlh = cb - > nlh ;
struct neigh_dump_filter filter = { } ;
2005-04-16 15:20:36 -07:00
struct neigh_table * tbl ;
int t , family , s_t ;
2012-01-26 22:28:58 +00:00
int proxy = 0 ;
2012-06-07 04:58:35 +00:00
int err ;
2005-04-16 15:20:36 -07:00
2018-10-03 15:33:12 -07:00
family = ( ( struct rtgenmsg * ) nlmsg_data ( nlh ) ) - > rtgen_family ;
2012-01-26 22:28:58 +00:00
/* check for full ndmsg structure presence, family member is
* the same for both structures
*/
2018-10-03 15:33:12 -07:00
if ( nlmsg_len ( nlh ) > = sizeof ( struct ndmsg ) & &
( ( struct ndmsg * ) nlmsg_data ( nlh ) ) - > ndm_flags = = NTF_PROXY )
2012-01-26 22:28:58 +00:00
proxy = 1 ;
2018-10-07 20:16:36 -07:00
err = neigh_valid_dump_req ( nlh , cb - > strict_check , & filter , cb - > extack ) ;
if ( err < 0 & & cb - > strict_check )
return err ;
2005-04-16 15:20:36 -07:00
s_t = cb - > args [ 0 ] ;
2014-11-10 15:59:36 -08:00
for ( t = 0 ; t < NEIGH_NR_TABLES ; t + + ) {
tbl = neigh_tables [ t ] ;
if ( ! tbl )
continue ;
2005-04-16 15:20:36 -07:00
if ( t < s_t | | ( family & & tbl - > family ! = family ) )
continue ;
if ( t > s_t )
memset ( & cb - > args [ 1 ] , 0 , sizeof ( cb - > args ) -
sizeof ( cb - > args [ 0 ] ) ) ;
2012-01-26 22:28:58 +00:00
if ( proxy )
2018-10-03 15:33:12 -07:00
err = pneigh_dump_table ( tbl , skb , cb , & filter ) ;
2012-01-26 22:28:58 +00:00
else
2018-10-03 15:33:12 -07:00
err = neigh_dump_table ( tbl , skb , cb , & filter ) ;
2012-06-07 04:58:35 +00:00
if ( err < 0 )
break ;
2005-04-16 15:20:36 -07:00
}
cb - > args [ 0 ] = t ;
return skb - > len ;
}
2018-12-19 12:51:38 -08:00
/*
 * Validate a neighbour get request and extract its parameters.
 *
 * The struct ndmsg header must be fully present with zero pad, state and
 * type fields, and NTF_PROXY is the only flag allowed. Attributes are
 * parsed with nlmsg_parse_deprecated_strict(). NDA_DST is the only accepted
 * attribute, and its length has to equal the key length of the table
 * selected by the header's family.
 *
 * Outputs: @ndm_flags and @dev_idx are copied from the header, @tbl is the
 * table for the requested family, and @dst points at the NDA_DST payload
 * inside the request (left untouched when the attribute is absent).
 *
 * Returns 0 on success, the parser's error, -EAFNOSUPPORT for an unknown
 * family, or -EINVAL, with a message in @extack.
 */
static int neigh_valid_get_req(const struct nlmsghdr *nlh,
			       struct neigh_table **tbl,
			       void **dst, int *dev_idx, u8 *ndm_flags,
			       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[NDA_MAX + 1];
	struct ndmsg *ndm;
	int type;
	int err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
		NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request");
		return -EINVAL;
	}

	ndm = nlmsg_data(nlh);
	if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
	    ndm->ndm_type) {
		NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request");
		return -EINVAL;
	}

	if (ndm->ndm_flags & ~NTF_PROXY) {
		NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
					    NDA_MAX, nda_policy, extack);
	if (err < 0)
		return err;

	*ndm_flags = ndm->ndm_flags;
	*dev_idx = ndm->ndm_ifindex;
	*tbl = neigh_find_table(ndm->ndm_family);
	if (!*tbl) {
		NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request");
		return -EAFNOSUPPORT;
	}

	for (type = 0; type <= NDA_MAX; ++type) {
		struct nlattr *attr = tb[type];

		if (!attr)
			continue;

		if (type != NDA_DST) {
			NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request");
			return -EINVAL;
		}

		if (nla_len(attr) != (int)(*tbl)->key_len) {
			NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request");
			return -EINVAL;
		}
		*dst = nla_data(attr);
	}

	return 0;
}
/*
 * Payload size reserved for a message describing one neighbour entry:
 * the aligned struct ndmsg header plus room for each attribute listed
 * below, with the two addresses sized for MAX_ADDR_LEN.
 */
static inline size_t neigh_nlmsg_size(void)
{
	size_t size = NLMSG_ALIGN(sizeof(struct ndmsg));

	size += nla_total_size(MAX_ADDR_LEN);	/* NDA_DST */
	size += nla_total_size(MAX_ADDR_LEN);	/* NDA_LLADDR */
	size += nla_total_size(sizeof(struct nda_cacheinfo));
	size += nla_total_size(4);		/* NDA_PROBES */
	size += nla_total_size(4);		/* NDA_FLAGS_EXT */
	size += nla_total_size(1);		/* NDA_PROTOCOL */

	return size;
}
/*
 * Build an RTM_NEWNEIGH message for @neigh and unicast it to @pid in @net.
 *
 * Returns -ENOBUFS when the message buffer cannot be allocated, the error
 * from neigh_fill_info() when filling fails (the buffer is freed), and
 * otherwise whatever rtnl_unicast() returns.
 */
static int neigh_get_reply(struct net *net, struct neighbour *neigh,
			   u32 pid, u32 seq)
{
	struct sk_buff *reply;
	int ret;

	reply = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL);
	if (reply == NULL)
		return -ENOBUFS;

	ret = neigh_fill_info(reply, neigh, pid, seq, RTM_NEWNEIGH, 0);
	if (ret != 0) {
		kfree_skb(reply);
		return ret;
	}

	return rtnl_unicast(reply, net, pid);
}
static inline size_t pneigh_nlmsg_size ( void )
{
return NLMSG_ALIGN ( sizeof ( struct ndmsg ) )
2018-12-20 16:50:50 +00:00
+ nla_total_size ( MAX_ADDR_LEN ) /* NDA_DST */
2021-10-11 14:12:37 +02:00
+ nla_total_size ( 4 ) /* NDA_FLAGS_EXT */
2018-12-19 12:51:38 -08:00
+ nla_total_size ( 1 ) ; /* NDA_PROTOCOL */
}
/*
 * Build an RTM_NEWNEIGH message for proxy entry @neigh of @tbl and unicast
 * it to @pid in @net.
 *
 * Returns -ENOBUFS when the message buffer cannot be allocated, the error
 * from pneigh_fill_info() when filling fails (the buffer is freed), and
 * otherwise whatever rtnl_unicast() returns.
 */
static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh,
			    u32 pid, u32 seq, struct neigh_table *tbl)
{
	struct sk_buff *reply;
	int ret;

	reply = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL);
	if (reply == NULL)
		return -ENOBUFS;

	ret = pneigh_fill_info(reply, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl);
	if (ret != 0) {
		kfree_skb(reply);
		return ret;
	}

	return rtnl_unicast(reply, net, pid);
}
static int neigh_get ( struct sk_buff * in_skb , struct nlmsghdr * nlh ,
struct netlink_ext_ack * extack )
{
struct net * net = sock_net ( in_skb - > sk ) ;
struct net_device * dev = NULL ;
struct neigh_table * tbl = NULL ;
struct neighbour * neigh ;
void * dst = NULL ;
u8 ndm_flags = 0 ;
int dev_idx = 0 ;
int err ;
err = neigh_valid_get_req ( nlh , & tbl , & dst , & dev_idx , & ndm_flags ,
extack ) ;
if ( err < 0 )
return err ;
if ( dev_idx ) {
dev = __dev_get_by_index ( net , dev_idx ) ;
if ( ! dev ) {
NL_SET_ERR_MSG ( extack , " Unknown device ifindex " ) ;
return - ENODEV ;
}
}
if ( ! dst ) {
NL_SET_ERR_MSG ( extack , " Network address not specified " ) ;
return - EINVAL ;
}
if ( ndm_flags & NTF_PROXY ) {
struct pneigh_entry * pn ;
pn = pneigh_lookup ( tbl , net , dst , dev , 0 ) ;
if ( ! pn ) {
NL_SET_ERR_MSG ( extack , " Proxy neighbour entry not found " ) ;
return - ENOENT ;
}
return pneigh_get_reply ( net , pn , NETLINK_CB ( in_skb ) . portid ,
nlh - > nlmsg_seq , tbl ) ;
}
if ( ! dev ) {
NL_SET_ERR_MSG ( extack , " No device specified " ) ;
return - EINVAL ;
}
neigh = neigh_lookup ( tbl , dst , dev ) ;
if ( ! neigh ) {
NL_SET_ERR_MSG ( extack , " Neighbour entry not found " ) ;
return - ENOENT ;
}
err = neigh_get_reply ( net , neigh , NETLINK_CB ( in_skb ) . portid ,
nlh - > nlmsg_seq ) ;
neigh_release ( neigh ) ;
return err ;
}
2005-04-16 15:20:36 -07:00
void neigh_for_each ( struct neigh_table * tbl , void ( * cb ) ( struct neighbour * , void * ) , void * cookie )
{
int chain ;
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * nht ;
2005-04-16 15:20:36 -07:00
2023-03-21 04:01:14 +00:00
rcu_read_lock ( ) ;
nht = rcu_dereference ( tbl - > nht ) ;
2010-10-04 06:15:44 +00:00
2023-03-21 04:01:14 +00:00
read_lock_bh ( & tbl - > lock ) ; /* avoid resizes */
2011-07-11 01:28:12 -07:00
for ( chain = 0 ; chain < ( 1 < < nht - > hash_shift ) ; chain + + ) {
2005-04-16 15:20:36 -07:00
struct neighbour * n ;
2023-03-21 04:01:14 +00:00
for ( n = rcu_dereference ( nht - > hash_buckets [ chain ] ) ;
2010-10-06 17:49:21 -07:00
n ! = NULL ;
2023-03-21 04:01:14 +00:00
n = rcu_dereference ( n - > next ) )
2005-04-16 15:20:36 -07:00
cb ( n , cookie ) ;
}
2023-03-21 04:01:14 +00:00
read_unlock_bh ( & tbl - > lock ) ;
rcu_read_unlock ( ) ;
2005-04-16 15:20:36 -07:00
}
EXPORT_SYMBOL ( neigh_for_each ) ;
/* The tbl->lock must be held as a writer and BH disabled. */
void __neigh_for_each_release ( struct neigh_table * tbl ,
int ( * cb ) ( struct neighbour * ) )
{
int chain ;
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * nht ;
2005-04-16 15:20:36 -07:00
2010-10-04 06:15:44 +00:00
nht = rcu_dereference_protected ( tbl - > nht ,
lockdep_is_held ( & tbl - > lock ) ) ;
2011-07-11 01:28:12 -07:00
for ( chain = 0 ; chain < ( 1 < < nht - > hash_shift ) ; chain + + ) {
2010-10-06 17:49:21 -07:00
struct neighbour * n ;
struct neighbour __rcu * * np ;
2005-04-16 15:20:36 -07:00
2010-10-04 06:15:44 +00:00
np = & nht - > hash_buckets [ chain ] ;
2010-10-06 17:49:21 -07:00
while ( ( n = rcu_dereference_protected ( * np ,
lockdep_is_held ( & tbl - > lock ) ) ) ! = NULL ) {
2005-04-16 15:20:36 -07:00
int release ;
write_lock ( & n - > lock ) ;
release = cb ( n ) ;
if ( release ) {
2010-10-06 17:49:21 -07:00
rcu_assign_pointer ( * np ,
rcu_dereference_protected ( n - > next ,
lockdep_is_held ( & tbl - > lock ) ) ) ;
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems:
1. The gc algorithm will not evict PERMANENT entries as those entries
are managed by userspace, yet the existing algorithm walks the entire
hash table which means it always considers PERMANENT entries when
looking for entries to evict. In some use cases (e.g., EVPN) there
can be tens of thousands of PERMANENT entries leading to wasted
CPU cycles when gc kicks in. As an example, with 32k permanent
entries, neigh_alloc has been observed taking more than 4 msec per
invocation.
2. Currently, when the number of neighbor entries hits gc_thresh2 and
the last flush for the table was more than 5 seconds ago, gc kicks in and
walks the entire hash table evicting *all* entries not in PERMANENT
or REACHABLE state and not marked as externally learned. There is no
discriminator on when the neigh entry was created or if it just moved
from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).
It is possible for entries to be created or for established neighbor
entries to be moved to STALE (e.g., an external node sends an ARP
request) right before the 5 second window lapses:
-----|---------x|----------|-----
t-5 t t+5
If that happens those entries are evicted during gc causing unnecessary
thrashing on neighbor entries and userspace caches trying to track them.
Further, this contradicts the description of gc_thresh2 which says
"Entries older than 5 seconds will be cleared".
One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
whole point of having separate thresholds.
3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
when gc_thresh2 is exceeded is over kill and contributes to trashing
especially during startup.
This patch addresses these problems as follows:
1. Use of a separate list_head to track entries that can be garbage
collected along with a separate counter. PERMANENT entries are not
added to this list.
The gc_thresh parameters are only compared to the new counter, not the
total entries in the table. The forced_gc function is updated to only
walk this new gc_list looking for entries to evict.
2. Entries are added to the list head at the tail and removed from the
front.
3. Entries are only evicted if they were last updated more than 5 seconds
ago, adhering to the original intent of gc_thresh2.
4. Forced gc is stopped once the number of gc_entries drops below
gc_thresh2.
5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
when allocating a new neighbor for a PERMANENT entry. By extension this
means there are no explicit limits on the number of PERMANENT entries
that can be created, but this is no different than FIB entries or FDB
entries.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-07 12:24:57 -08:00
neigh_mark_dead ( n ) ;
2005-04-16 15:20:36 -07:00
} else
np = & n - > next ;
write_unlock ( & n - > lock ) ;
2007-08-08 23:12:36 -07:00
if ( release )
neigh_cleanup_and_release ( n ) ;
2005-04-16 15:20:36 -07:00
}
}
}
EXPORT_SYMBOL ( __neigh_for_each_release ) ;
2015-03-07 16:25:56 -06:00
int neigh_xmit ( int index , struct net_device * dev ,
2015-03-03 17:11:16 -06:00
const void * addr , struct sk_buff * skb )
{
2015-03-07 16:25:56 -06:00
int err = - EAFNOSUPPORT ;
if ( likely ( index < NEIGH_NR_TABLES ) ) {
2015-03-03 17:11:16 -06:00
struct neigh_table * tbl ;
struct neighbour * neigh ;
2015-03-07 16:25:56 -06:00
tbl = neigh_tables [ index ] ;
2015-03-03 17:11:16 -06:00
if ( ! tbl )
goto out ;
2023-03-21 04:01:14 +00:00
rcu_read_lock ( ) ;
2019-05-01 18:18:42 -07:00
if ( index = = NEIGH_ARP_TABLE ) {
u32 key = * ( ( u32 * ) addr ) ;
neigh = __ipv4_neigh_lookup_noref ( dev , key ) ;
} else {
neigh = __neigh_lookup_noref ( tbl , addr , dev ) ;
}
2015-03-03 17:11:16 -06:00
if ( ! neigh )
neigh = __neigh_create ( tbl , addr , dev , false ) ;
err = PTR_ERR ( neigh ) ;
neigh: Explicitly declare RCU-bh read side critical section in neigh_xmit()
neigh_xmit() expects to be called inside an RCU-bh read side critical
section, and while one of its two current callers gets this right, the
other one doesn't.
More specifically, neigh_xmit() has two callers, mpls_forward() and
mpls_output(), and while both callers call neigh_xmit() under
rcu_read_lock(), this provides sufficient protection for neigh_xmit()
only in the case of mpls_forward(), as that is always called from
softirq context and therefore doesn't need explicit BH protection,
while mpls_output() can be called from process context with softirqs
enabled.
When mpls_output() is called from process context, with softirqs
enabled, we can be preempted by a softirq at any time, and RCU-bh
considers the completion of a softirq as signaling the end of any
pending read-side critical sections, so if we do get a softirq
while we are in the part of neigh_xmit() that expects to be run inside
an RCU-bh read side critical section, we can end up with an unexpected
RCU grace period running right in the middle of that critical section,
making things go boom.
This patch fixes this impedance mismatch in the callee, by making
neigh_xmit() always take rcu_read_{,un}lock_bh() around the code that
expects to be treated as an RCU-bh read side critical section, as this
seems a safer option than fixing it in the callers.
Fixes: 4fd3d7d9e868f ("neigh: Add helper function neigh_xmit")
Signed-off-by: David Barroso <dbarroso@fastly.com>
Signed-off-by: Lennert Buytenhek <lbuytenhek@fastly.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-28 11:16:43 +03:00
if ( IS_ERR ( neigh ) ) {
2023-03-21 04:01:14 +00:00
rcu_read_unlock ( ) ;
2015-03-03 17:11:16 -06:00
goto out_kfree_skb ;
neigh: Explicitly declare RCU-bh read side critical section in neigh_xmit()
neigh_xmit() expects to be called inside an RCU-bh read side critical
section, and while one of its two current callers gets this right, the
other one doesn't.
More specifically, neigh_xmit() has two callers, mpls_forward() and
mpls_output(), and while both callers call neigh_xmit() under
rcu_read_lock(), this provides sufficient protection for neigh_xmit()
only in the case of mpls_forward(), as that is always called from
softirq context and therefore doesn't need explicit BH protection,
while mpls_output() can be called from process context with softirqs
enabled.
When mpls_output() is called from process context, with softirqs
enabled, we can be preempted by a softirq at any time, and RCU-bh
considers the completion of a softirq as signaling the end of any
pending read-side critical sections, so if we do get a softirq
while we are in the part of neigh_xmit() that expects to be run inside
an RCU-bh read side critical section, we can end up with an unexpected
RCU grace period running right in the middle of that critical section,
making things go boom.
This patch fixes this impedance mismatch in the callee, by making
neigh_xmit() always take rcu_read_{,un}lock_bh() around the code that
expects to be treated as an RCU-bh read side critical section, as this
seems a safer option than fixing it in the callers.
Fixes: 4fd3d7d9e868f ("neigh: Add helper function neigh_xmit")
Signed-off-by: David Barroso <dbarroso@fastly.com>
Signed-off-by: Lennert Buytenhek <lbuytenhek@fastly.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-28 11:16:43 +03:00
}
2023-09-21 09:27:13 +00:00
err = READ_ONCE ( neigh - > output ) ( neigh , skb ) ;
2023-03-21 04:01:14 +00:00
rcu_read_unlock ( ) ;
2015-03-03 17:11:16 -06:00
}
2015-03-07 16:25:56 -06:00
else if ( index = = NEIGH_LINK_TABLE ) {
err = dev_hard_header ( skb , dev , ntohs ( skb - > protocol ) ,
addr , NULL , skb - > len ) ;
if ( err < 0 )
goto out_kfree_skb ;
err = dev_queue_xmit ( skb ) ;
}
2015-03-03 17:11:16 -06:00
out :
return err ;
out_kfree_skb :
kfree_skb ( skb ) ;
goto out ;
}
EXPORT_SYMBOL ( neigh_xmit ) ;
2005-04-16 15:20:36 -07:00
# ifdef CONFIG_PROC_FS
static struct neighbour * neigh_get_first ( struct seq_file * seq )
{
struct neigh_seq_state * state = seq - > private ;
2008-03-26 02:36:06 +09:00
struct net * net = seq_file_net ( seq ) ;
2010-10-04 06:15:44 +00:00
struct neigh_hash_table * nht = state - > nht ;
2005-04-16 15:20:36 -07:00
struct neighbour * n = NULL ;
2019-07-26 10:46:11 +01:00
int bucket ;
2005-04-16 15:20:36 -07:00
state - > flags & = ~ NEIGH_SEQ_IS_PNEIGH ;
2011-07-11 01:28:12 -07:00
for ( bucket = 0 ; bucket < ( 1 < < nht - > hash_shift ) ; bucket + + ) {
2023-03-21 04:01:14 +00:00
n = rcu_dereference ( nht - > hash_buckets [ bucket ] ) ;
2005-04-16 15:20:36 -07:00
while ( n ) {
2008-03-26 03:57:35 +09:00
if ( ! net_eq ( dev_net ( n - > dev ) , net ) )
2008-01-24 00:13:18 -08:00
goto next ;
2005-04-16 15:20:36 -07:00
if ( state - > neigh_sub_iter ) {
loff_t fakep = 0 ;
void * v ;
v = state - > neigh_sub_iter ( state , n , & fakep ) ;
if ( ! v )
goto next ;
}
if ( ! ( state - > flags & NEIGH_SEQ_SKIP_NOARP ) )
break ;
2023-03-13 20:17:31 +00:00
if ( READ_ONCE ( n - > nud_state ) & ~ NUD_NOARP )
2005-04-16 15:20:36 -07:00
break ;
2010-10-06 17:49:21 -07:00
next :
2023-03-21 04:01:14 +00:00
n = rcu_dereference ( n - > next ) ;
2005-04-16 15:20:36 -07:00
}
if ( n )
break ;
}
state - > bucket = bucket ;
return n ;
}
/*
 * Advance the iterator past @n.
 *
 * When a sub-iterator is installed it is first called on @n itself; a
 * non-NULL result makes this function return @n again.  Otherwise the walk
 * continues along the current chain and then through the following
 * buckets, tracked in state->bucket.  Entries from other network
 * namespaces are skipped.  With a sub-iterator, the first entry it accepts
 * is returned directly; without one, NEIGH_SEQ_SKIP_NOARP filtering
 * applies.
 *
 * When the walk ends on an entry through the filter path and @pos is
 * non-NULL, *@pos is decremented.  Returns NULL when the table is
 * exhausted.
 */
static struct neighbour *neigh_get_next(struct seq_file *seq,
					struct neighbour *n,
					loff_t *pos)
{
	struct neigh_seq_state *state = seq->private;
	struct neigh_hash_table *nht = state->nht;
	struct net *net = seq_file_net(seq);

	if (state->neigh_sub_iter && state->neigh_sub_iter(state, n, pos))
		return n;

	n = rcu_dereference(n->next);
	for (;;) {
		for (; n; n = rcu_dereference(n->next)) {
			if (!net_eq(dev_net(n->dev), net))
				continue;

			if (state->neigh_sub_iter) {
				if (state->neigh_sub_iter(state, n, pos))
					return n;
				continue;
			}

			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP) ||
			    (READ_ONCE(n->nud_state) & ~NUD_NOARP))
				break;
		}
		if (n)
			break;

		/* Chain exhausted: move to the next bucket, if any. */
		if (++state->bucket >= (1 << nht->hash_shift))
			break;

		n = rcu_dereference(nht->hash_buckets[state->bucket]);
	}

	if (n && pos)
		--(*pos);
	return n;
}
/*
 * Seek to the neighbour entry at offset *@pos, counting from 1.
 *
 * *@pos is decremented once for the first entry and then by
 * neigh_get_next() as entries are consumed.  Returns the entry reached when
 * *@pos hits zero, or NULL if the entries run out first, in which case
 * *@pos holds the offset still left to skip.
 */
static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct neighbour *n = neigh_get_first(seq);

	if (!n)
		return NULL;

	--(*pos);
	while (*pos && n)
		n = neigh_get_next(seq, n, pos);

	return *pos ? NULL : n;
}
static struct pneigh_entry * pneigh_get_first ( struct seq_file * seq )
{
struct neigh_seq_state * state = seq - > private ;
2008-03-26 02:36:06 +09:00
struct net * net = seq_file_net ( seq ) ;
2005-04-16 15:20:36 -07:00
struct neigh_table * tbl = state - > tbl ;
struct pneigh_entry * pn = NULL ;
2021-05-08 18:03:05 +08:00
int bucket ;
2005-04-16 15:20:36 -07:00
state - > flags | = NEIGH_SEQ_IS_PNEIGH ;
for ( bucket = 0 ; bucket < = PNEIGH_HASHMASK ; bucket + + ) {
pn = tbl - > phash_buckets [ bucket ] ;
2008-03-26 03:57:35 +09:00
while ( pn & & ! net_eq ( pneigh_net ( pn ) , net ) )
2008-01-24 00:13:18 -08:00
pn = pn - > next ;
2005-04-16 15:20:36 -07:00
if ( pn )
break ;
}
state - > bucket = bucket ;
return pn ;
}
/*
 * Advance the iterator past the pneigh entry @pn.
 *
 * The rest of the current chain is searched first, then each following
 * bucket (state->bucket is advanced as buckets are visited), skipping
 * entries that belong to another network namespace.  When an entry is
 * found and @pos is non-NULL, *@pos is decremented.
 *
 * Returns the next matching entry, or NULL once the last bucket is passed.
 */
static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
					    struct pneigh_entry *pn,
					    loff_t *pos)
{
	struct neigh_seq_state *state = seq->private;
	struct neigh_table *tbl = state->tbl;
	struct net *net = seq_file_net(seq);

	pn = pn->next;
	while (pn && !net_eq(pneigh_net(pn), net))
		pn = pn->next;

	while (!pn && ++state->bucket <= PNEIGH_HASHMASK) {
		pn = tbl->phash_buckets[state->bucket];
		while (pn && !net_eq(pneigh_net(pn), net))
			pn = pn->next;
	}

	if (pn && pos)
		--(*pos);

	return pn;
}
/*
 * Seek to the pneigh entry at offset *@pos, counting from 1.
 *
 * *@pos is decremented once for the first entry and then by
 * pneigh_get_next() as entries are consumed.  Returns the entry reached
 * when *@pos hits zero, or NULL if the entries run out first.
 */
static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct pneigh_entry *pn = pneigh_get_first(seq);

	if (!pn)
		return NULL;

	--(*pos);
	while (*pos && pn)
		pn = pneigh_get_next(seq, pn, pos);

	return *pos ? NULL : pn;
}
/*
 * Seek to offset *@pos across the neighbour entries and then, unless
 * NEIGH_SEQ_NEIGH_ONLY is set, the pneigh entries.
 *
 * Works on a private copy of *@pos, so the caller's position is left
 * untouched; whatever offset the neighbour walk did not consume carries
 * over into the pneigh walk.  Returns the entry found or NULL.
 */
static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
{
	struct neigh_seq_state *state = seq->private;
	loff_t remaining = *pos;
	void *entry;

	entry = neigh_get_idx(seq, &remaining);
	if (entry)
		return entry;

	if (state->flags & NEIGH_SEQ_NEIGH_ONLY)
		return NULL;

	return pneigh_get_idx(seq, &remaining);
}
/*
 * Begin a seq_file walk over @tbl.
 *
 * Resets the iterator state kept in seq->private (NEIGH_SEQ_IS_PNEIGH is
 * masked out of @neigh_seq_flags), enters an RCU read-side section,
 * snapshots tbl->nht into the state and takes tbl->lock for reading with
 * BH disabled.  Both stay held on return.
 *
 * Returns SEQ_START_TOKEN for position 0, otherwise the entry at *@pos
 * (NULL when past the end).
 */
void *neigh_seq_start(struct seq_file *seq, loff_t *pos,
		      struct neigh_table *tbl, unsigned int neigh_seq_flags)
	__acquires(tbl->lock)
	__acquires(rcu)
{
	struct neigh_seq_state *state = seq->private;

	state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
	state->bucket = 0;
	state->tbl = tbl;

	rcu_read_lock();
	state->nht = rcu_dereference(tbl->nht);
	read_lock_bh(&tbl->lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	return neigh_get_idx_any(seq, pos);
}
EXPORT_SYMBOL(neigh_seq_start);
/*
 * Step the seq_file walk from @v to the following entry and bump *@pos.
 *
 * From SEQ_START_TOKEN the first neighbour entry is returned.  While in
 * the neighbour phase the next neighbour is returned; when those run out
 * the walk switches to the first pneigh entry unless NEIGH_SEQ_NEIGH_ONLY
 * is set.  In the pneigh phase the next pneigh entry is returned.
 *
 * Returns NULL at the end of the walk; *@pos is incremented in every case.
 */
void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct neigh_seq_state *state = seq->private;
	void *rc;

	if (v == SEQ_START_TOKEN) {
		rc = neigh_get_first(seq);
	} else if (state->flags & NEIGH_SEQ_IS_PNEIGH) {
		/* The pneigh phase is never entered for neigh-only walks. */
		BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY);
		rc = pneigh_get_next(seq, v, NULL);
	} else {
		rc = neigh_get_next(seq, v, NULL);
		if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
			rc = pneigh_get_first(seq);
	}

	++(*pos);
	return rc;
}
EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop ( struct seq_file * seq , void * v )
2019-06-15 16:28:48 -07:00
__releases ( tbl - > lock )
2023-03-21 04:01:14 +00:00
__releases ( rcu )
2005-04-16 15:20:36 -07:00
{
2019-06-15 16:28:48 -07:00
struct neigh_seq_state * state = seq - > private ;
struct neigh_table * tbl = state - > tbl ;
2023-03-21 04:01:14 +00:00
read_unlock_bh ( & tbl - > lock ) ;
rcu_read_unlock ( ) ;
2005-04-16 15:20:36 -07:00
}
EXPORT_SYMBOL ( neigh_seq_stop ) ;
/* statistics via seq_file */
/*
 * Start the statistics walk.
 *
 * Position 0 yields SEQ_START_TOKEN.  Position N > 0 maps to CPU N - 1:
 * the first possible CPU at or after that index is selected, *@pos is set
 * to that CPU number plus one and the CPU's slot in tbl->stats is
 * returned.  Returns NULL when no possible CPU remains.
 */
static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct neigh_table *tbl = pde_data(file_inode(seq->file));
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos - 1; cpu < nr_cpu_ids; ++cpu) {
		if (cpu_possible(cpu)) {
			*pos = cpu + 1;
			return per_cpu_ptr(tbl->stats, cpu);
		}
	}

	return NULL;
}
/*
 * Continue the statistics walk.
 *
 * Selects the first possible CPU whose number is at least *@pos, sets
 * *@pos to that CPU number plus one and returns the CPU's slot in
 * tbl->stats.  When no possible CPU remains, *@pos is still advanced by
 * one and NULL is returned.
 */
static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct neigh_table *tbl = pde_data(file_inode(seq->file));
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (cpu_possible(cpu)) {
			*pos = cpu + 1;
			return per_cpu_ptr(tbl->stats, cpu);
		}
	}

	(*pos)++;
	return NULL;
}
/*
 * Nothing to undo: neigh_stat_seq_start() and neigh_stat_seq_next() take
 * no locks and allocate nothing.
 */
static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
{
}
/*
 * Render one record of the statistics file.
 *
 * For SEQ_START_TOKEN a header row naming the columns is written.  Any
 * other @v is treated as a struct neigh_statistics and printed as one row
 * of zero-padded hexadecimal fields.  The first field is the current value
 * of tbl->entries, which is read from the table rather than from @v.
 *
 * Always returns 0.
 */
static int neigh_stat_seq_show ( struct seq_file * seq , void * v )
{
2022-01-21 22:14:23 -08:00
struct neigh_table * tbl = pde_data ( file_inode ( seq - > file ) ) ;
2005-04-16 15:20:36 -07:00
struct neigh_statistics * st = v ;
/* Header row: one name per column printed below, in the same order. */
if ( v = = SEQ_START_TOKEN ) {
2021-08-02 16:05:08 +08:00
seq_puts ( seq , " entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls \n " ) ;
2005-04-16 15:20:36 -07:00
return 0 ;
}
2021-08-02 16:05:08 +08:00
/* Data row: %08x for the int entry count, %08lx for each counter in @st. */
seq_printf ( seq , " %08x %08lx %08lx %08lx %08lx %08lx %08lx "
" %08lx %08lx %08lx "
" %08lx %08lx %08lx \n " ,
2005-04-16 15:20:36 -07:00
atomic_read ( & tbl - > entries ) ,
st - > allocs ,
st - > destroys ,
st - > hash_grows ,
st - > lookups ,
st - > hits ,
st - > res_failed ,
st - > rcv_probes_mcast ,
st - > rcv_probes_ucast ,
st - > periodic_gc_runs ,
2008-07-16 20:50:49 -07:00
st - > forced_gc_runs ,
2015-08-07 11:10:37 -07:00
st - > unres_discards ,
st - > table_fulls
2005-04-16 15:20:36 -07:00
) ;
return 0 ;
}
2007-03-12 14:34:29 -07:00
static const struct seq_operations neigh_stat_seq_ops = {
2005-04-16 15:20:36 -07:00
. start = neigh_stat_seq_start ,
. next = neigh_stat_seq_next ,
. stop = neigh_stat_seq_stop ,
. show = neigh_stat_seq_show ,
} ;
# endif /* CONFIG_PROC_FS */
2017-03-19 22:01:28 -07:00
/*
 * Build a netlink message describing @n (message @type, netlink @flags,
 * port id @pid) and pass it to rtnl_notify() for the RTNLGRP_NEIGH group.
 *
 * The message is allocated with GFP_ATOMIC.  If it cannot be allocated
 * (-ENOBUFS) or filled in, the error is reported through
 * rtnl_set_sk_err() instead and nothing is sent.
 */
static void __neigh_notify(struct neighbour *n, int type, int flags,
			   u32 pid)
{
	struct net *net = dev_net(n->dev);
	struct sk_buff *skb;
	int err;

	skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
	if (!skb) {
		err = -ENOBUFS;
	} else {
		err = neigh_fill_info(skb, n, pid, 0, type, flags);
		if (err >= 0) {
			rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL,
				    GFP_ATOMIC);
			return;
		}

		/* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
	}

	/* err is negative on every path that reaches this point. */
	rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
2006-08-15 00:33:14 -07:00
void neigh_app_ns ( struct neighbour * n )
2005-04-16 15:20:36 -07:00
{
2017-03-19 22:01:28 -07:00
__neigh_notify ( n , RTM_GETNEIGH , NLM_F_REQUEST , 0 ) ;
2006-08-15 00:33:14 -07:00
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_app_ns ) ;
2005-04-16 15:20:36 -07:00
# ifdef CONFIG_SYSCTL
2012-12-06 10:04:04 +08:00
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE ( ETH_FRAME_LEN ) ;
2005-04-16 15:20:36 -07:00
2013-06-11 23:04:25 -07:00
static int proc_unres_qlen ( struct ctl_table * ctl , int write ,
2020-04-24 08:43:38 +02:00
void * buffer , size_t * lenp , loff_t * ppos )
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
{
int size , ret ;
2013-06-11 23:04:25 -07:00
struct ctl_table tmp = * ctl ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
proc/sysctl: add shared variables for range check
In the sysctl code the proc_dointvec_minmax() function is often used to
validate the user supplied value between an allowed range. This
function uses the extra1 and extra2 members from struct ctl_table as
minimum and maximum allowed value.
On sysctl handler declaration, in every source file there are some
readonly variables containing just an integer which address is assigned
to the extra1 and extra2 members, so the sysctl range is enforced.
The special values 0, 1 and INT_MAX are very often used as range
boundary, leading duplication of variables like zero=0, one=1,
int_max=INT_MAX in different source files:
$ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l
248
Add a const int array containing the most commonly used values, some
macros to refer more easily to the correct array member, and use them
instead of creating a local one for every object file.
This is the bloat-o-meter output comparing the old and new binary
compiled with the default Fedora config:
# scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o
add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164)
Data old new delta
sysctl_vals - 12 +12
__kstrtab_sysctl_vals - 12 +12
max 14 10 -4
int_max 16 - -16
one 68 - -68
zero 128 28 -100
Total: Before=20583249, After=20583085, chg -0.00%
[mcroce@redhat.com: tipc: remove two unused variables]
Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com
[akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c]
[arnd@arndb.de: proc/sysctl: make firmware loader table conditional]
Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de
[akpm@linux-foundation.org: fix fs/eventpoll.c]
Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-18 15:58:50 -07:00
tmp . extra1 = SYSCTL_ZERO ;
2012-12-04 18:49:15 +00:00
tmp . extra2 = & unres_qlen_max ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
tmp . data = & size ;
2012-12-04 18:49:15 +00:00
size = * ( int * ) ctl - > data / SKB_TRUESIZE ( ETH_FRAME_LEN ) ;
ret = proc_dointvec_minmax ( & tmp , write , buffer , lenp , ppos ) ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
if ( write & & ! ret )
* ( int * ) ctl - > data = size * SKB_TRUESIZE ( ETH_FRAME_LEN ) ;
return ret ;
}
2013-12-07 19:26:56 +01:00
/*
 * Copy p->data[@index] into the matching per-device parms of every device
 * in @net, for the address family of @p.
 *
 * A device is skipped when it has no parms for that family or when bit
 * @index is already set in its data_state.  The device list is walked
 * under rcu_read_lock().
 */
static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
				  int index)
{
	int family = neigh_parms_family(p);
	struct net_device *dev;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		struct neigh_parms *dst_p;

		dst_p = neigh_get_dev_parms_rcu(dev, family);
		if (!dst_p)
			continue;
		if (test_bit(index, dst_p->data_state))
			continue;

		dst_p->data[index] = p->data[index];
	}
	rcu_read_unlock();
}
static void neigh_proc_update ( struct ctl_table * ctl , int write )
{
struct net_device * dev = ctl - > extra1 ;
struct neigh_parms * p = ctl - > extra2 ;
2013-12-10 23:55:07 +01:00
struct net * net = neigh_parms_net ( p ) ;
2013-12-07 19:26:56 +01:00
int index = ( int * ) ctl - > data - p - > data ;
if ( ! write )
return ;
set_bit ( index , p - > data_state ) ;
2017-02-15 01:00:36 +01:00
if ( index = = NEIGH_VAR_DELAY_PROBE_TIME )
call_netevent_notifiers ( NETEVENT_DELAY_PROBE_TIME_UPDATE , p ) ;
2013-12-07 19:26:56 +01:00
if ( ! dev ) /* NULL dev means this is default value */
neigh_copy_dflt_parms ( net , p , index ) ;
}
2013-12-07 19:26:53 +01:00
/*
 * Integer sysctl handler restricted to the range [0, INT_MAX].
 *
 * The bounds are set on a stack copy of @ctl, because neigh_proc_update()
 * reads the original's extra1/extra2 as the device and parms pointers.
 * neigh_proc_update() is called after the parse whatever its result.
 *
 * Returns the result of proc_dointvec_minmax().
 */
static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
					   void *buffer, size_t *lenp,
					   loff_t *ppos)
{
	struct ctl_table bounded = *ctl;
	int ret;

	bounded.extra1 = SYSCTL_ZERO;
	bounded.extra2 = SYSCTL_INT_MAX;
	ret = proc_dointvec_minmax(&bounded, write, buffer, lenp, ppos);

	neigh_proc_update(ctl, write);
	return ret;
}
2022-06-29 08:48:32 +00:00
/*
 * Millisecond/jiffies sysctl handler with a lower bound of
 * msecs_to_jiffies(1) and no upper bound.
 *
 * The bounds are set on a stack copy of @ctl so the original's
 * extra1/extra2 stay available to neigh_proc_update(), which is called
 * after the parse whatever its result.
 *
 * Returns the result of proc_dointvec_ms_jiffies_minmax().
 */
static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write,
						   void *buffer, size_t *lenp, loff_t *ppos)
{
	int min = msecs_to_jiffies(1);
	struct ctl_table bounded = *ctl;
	int ret;

	bounded.extra1 = &min;
	bounded.extra2 = NULL;
	ret = proc_dointvec_ms_jiffies_minmax(&bounded, write, buffer, lenp, ppos);

	neigh_proc_update(ctl, write);
	return ret;
}
2020-04-24 08:43:38 +02:00
/*
 * proc_dointvec() followed by neigh_proc_update() on the same table entry.
 * Returns the result of proc_dointvec().
 */
int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
			size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	neigh_proc_update(ctl, write);

	return ret;
}
EXPORT_SYMBOL(neigh_proc_dointvec);
2020-04-24 08:43:38 +02:00
int neigh_proc_dointvec_jiffies ( struct ctl_table * ctl , int write , void * buffer ,
2013-12-07 19:26:54 +01:00
size_t * lenp , loff_t * ppos )
{
2013-12-07 19:26:56 +01:00
int ret = proc_dointvec_jiffies ( ctl , write , buffer , lenp , ppos ) ;
neigh_proc_update ( ctl , write ) ;
return ret ;
2013-12-07 19:26:54 +01:00
}
EXPORT_SYMBOL ( neigh_proc_dointvec_jiffies ) ;
/*
 * sysctl handler for neighbour parameters handled by
 * proc_dointvec_userhz_jiffies(); neigh_proc_update() is called afterwards
 * on the same table entry. The helper's result is returned unchanged.
 */
static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
					      void *buffer, size_t *lenp,
					      loff_t *ppos)
{
	int err;

	err = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
	neigh_proc_update(ctl, write);

	return err;
}
/*
 * sysctl handler for neighbour parameters handled by
 * proc_dointvec_ms_jiffies(); neigh_proc_update() is called afterwards on
 * the same table entry. The helper's result is returned unchanged.
 */
int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
				   void *buffer, size_t *lenp, loff_t *ppos)
{
	int err;

	err = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
	neigh_proc_update(ctl, write);

	return err;
}
EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
/*
 * sysctl handler for the "unres_qlen" entry: the value is processed by
 * proc_unres_qlen(), after which neigh_proc_update() is called on the same
 * table entry. The proc_unres_qlen() result is returned unchanged.
 */
static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
					  void *buffer, size_t *lenp,
					  loff_t *ppos)
{
	int err;

	err = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
	neigh_proc_update(ctl, write);

	return err;
}
2015-01-14 04:22:39 +01:00
/*
 * sysctl handler shared by the base_reachable_time and
 * base_reachable_time_ms entries.
 *
 * The entry's procname selects the jiffies or the millisecond parser; any
 * other procname yields -1. @ctl->extra2 is used as the owning
 * struct neigh_parms.
 *
 * After a successful write, reachable_time is recomputed immediately from
 * the new BASE_REACHABLE_TIME; otherwise the change would only take effect
 * the next time neigh_periodic_work decides to recompute it.
 *
 * Returns the parser's result, or -1 for an unrecognised procname.
 */
static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
					  void *buffer, size_t *lenp,
					  loff_t *ppos)
{
	struct neigh_parms *parms = ctl->extra2;
	int err = -1;

	if (!strcmp(ctl->procname, " base_reachable_time "))
		err = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
	else if (!strcmp(ctl->procname, " base_reachable_time_ms "))
		err = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);

	/* Reads and failed writes leave reachable_time alone. */
	if (!write || err != 0)
		return err;

	parms->reachable_time =
		neigh_rand_reach_time(NEIGH_VAR(parms, BASE_REACHABLE_TIME));

	return err;
}
2013-12-07 19:26:53 +01:00
/*
 * Address that data[idx] would have in a struct neigh_parms located at
 * address 0, i.e. the member's offset expressed as a pointer.
 * neigh_sysctl_register() adds the address of the real neigh_parms to it.
 */
#define NEIGH_PARMS_DATA_OFFSET(idx) \
	(&((struct neigh_parms *) 0)->data[idx])

/*
 * Designated initializer for one neigh_vars[] slot.
 *   var      - NEIGH_VAR_ suffix selecting the array slot
 *   data_var - NEIGH_VAR_ suffix selecting the backing data[] element
 *   pname    - procname string
 *   perm     - file mode
 *   handler  - proc_handler callback
 */
#define NEIGH_SYSCTL_ENTRY(var, data_var, pname, perm, handler) \
	[NEIGH_VAR_ ## var] = { \
		.procname	= pname, \
		.data		= NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_var), \
		.maxlen		= sizeof(int), \
		.mode		= perm, \
		.proc_handler	= handler, \
	}

/* Mode 0644 entries whose slot and backing data element are the same. */
#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(var, pname) \
	NEIGH_SYSCTL_ENTRY(var, var, pname, 0644, neigh_proc_dointvec_zero_intmax)

#define NEIGH_SYSCTL_JIFFIES_ENTRY(var, pname) \
	NEIGH_SYSCTL_ENTRY(var, var, pname, 0644, neigh_proc_dointvec_jiffies)

#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(var, pname) \
	NEIGH_SYSCTL_ENTRY(var, var, pname, 0644, neigh_proc_dointvec_userhz_jiffies)

#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(var, pname) \
	NEIGH_SYSCTL_ENTRY(var, var, pname, 0644, neigh_proc_dointvec_ms_jiffies_positive)

/* Mode 0644 entries whose slot reuses another slot's data element. */
#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(var, data_var, pname) \
	NEIGH_SYSCTL_ENTRY(var, data_var, pname, 0644, neigh_proc_dointvec_ms_jiffies)

#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(var, data_var, pname) \
	NEIGH_SYSCTL_ENTRY(var, data_var, pname, 0644, neigh_proc_dointvec_unres_qlen)
2010-02-14 03:27:03 +00:00
2005-04-16 15:20:36 -07:00
static struct neigh_sysctl_table {
struct ctl_table_header * sysctl_header ;
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
struct ctl_table neigh_vars [ NEIGH_VAR_MAX + 1 ] ;
2006-09-22 14:15:41 -07:00
} neigh_sysctl_template __read_mostly = {
2005-04-16 15:20:36 -07:00
. neigh_vars = {
2013-12-07 19:26:53 +01:00
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY ( MCAST_PROBES , " mcast_solicit " ) ,
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY ( UCAST_PROBES , " ucast_solicit " ) ,
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY ( APP_PROBES , " app_solicit " ) ,
2015-03-19 22:41:46 +09:00
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY ( MCAST_REPROBES , " mcast_resolicit " ) ,
2013-12-07 19:26:53 +01:00
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY ( RETRANS_TIME , " retrans_time " ) ,
NEIGH_SYSCTL_JIFFIES_ENTRY ( BASE_REACHABLE_TIME , " base_reachable_time " ) ,
NEIGH_SYSCTL_JIFFIES_ENTRY ( DELAY_PROBE_TIME , " delay_first_probe_time " ) ,
2022-06-29 08:48:32 +00:00
NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY ( INTERVAL_PROBE_TIME_MS ,
" interval_probe_time_ms " ) ,
2013-12-07 19:26:53 +01:00
NEIGH_SYSCTL_JIFFIES_ENTRY ( GC_STALETIME , " gc_stale_time " ) ,
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY ( QUEUE_LEN_BYTES , " unres_qlen_bytes " ) ,
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY ( PROXY_QLEN , " proxy_qlen " ) ,
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY ( ANYCAST_DELAY , " anycast_delay " ) ,
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY ( PROXY_DELAY , " proxy_delay " ) ,
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY ( LOCKTIME , " locktime " ) ,
NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY ( QUEUE_LEN , QUEUE_LEN_BYTES , " unres_qlen " ) ,
NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY ( RETRANS_TIME_MS , RETRANS_TIME , " retrans_time_ms " ) ,
NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY ( BASE_REACHABLE_TIME_MS , BASE_REACHABLE_TIME , " base_reachable_time_ms " ) ,
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[ NEIGH_VAR_GC_INTERVAL ] = {
2005-04-16 15:20:36 -07:00
. procname = " gc_interval " ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
2008-11-03 18:21:05 -08:00
. proc_handler = proc_dointvec_jiffies ,
2005-04-16 15:20:36 -07:00
} ,
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[ NEIGH_VAR_GC_THRESH1 ] = {
2005-04-16 15:20:36 -07:00
. procname = " gc_thresh1 " ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
proc/sysctl: add shared variables for range check
In the sysctl code the proc_dointvec_minmax() function is often used to
validate the user supplied value between an allowed range. This
function uses the extra1 and extra2 members from struct ctl_table as
minimum and maximum allowed value.
On sysctl handler declaration, in every source file there are some
readonly variables containing just an integer which address is assigned
to the extra1 and extra2 members, so the sysctl range is enforced.
The special values 0, 1 and INT_MAX are very often used as range
boundary, leading duplication of variables like zero=0, one=1,
int_max=INT_MAX in different source files:
$ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l
248
Add a const int array containing the most commonly used values, some
macros to refer more easily to the correct array member, and use them
instead of creating a local one for every object file.
This is the bloat-o-meter output comparing the old and new binary
compiled with the default Fedora config:
# scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o
add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164)
Data old new delta
sysctl_vals - 12 +12
__kstrtab_sysctl_vals - 12 +12
max 14 10 -4
int_max 16 - -16
one 68 - -68
zero 128 28 -100
Total: Before=20583249, After=20583085, chg -0.00%
[mcroce@redhat.com: tipc: remove two unused variables]
Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com
[akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c]
[arnd@arndb.de: proc/sysctl: make firmware loader table conditional]
Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de
[akpm@linux-foundation.org: fix fs/eventpoll.c]
Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-18 15:58:50 -07:00
. extra1 = SYSCTL_ZERO ,
. extra2 = SYSCTL_INT_MAX ,
2013-07-24 10:39:06 +02:00
. proc_handler = proc_dointvec_minmax ,
2005-04-16 15:20:36 -07:00
} ,
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[ NEIGH_VAR_GC_THRESH2 ] = {
2005-04-16 15:20:36 -07:00
. procname = " gc_thresh2 " ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
proc/sysctl: add shared variables for range check
In the sysctl code the proc_dointvec_minmax() function is often used to
validate the user supplied value between an allowed range. This
function uses the extra1 and extra2 members from struct ctl_table as
minimum and maximum allowed value.
On sysctl handler declaration, in every source file there are some
readonly variables containing just an integer which address is assigned
to the extra1 and extra2 members, so the sysctl range is enforced.
The special values 0, 1 and INT_MAX are very often used as range
boundary, leading duplication of variables like zero=0, one=1,
int_max=INT_MAX in different source files:
$ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l
248
Add a const int array containing the most commonly used values, some
macros to refer more easily to the correct array member, and use them
instead of creating a local one for every object file.
This is the bloat-o-meter output comparing the old and new binary
compiled with the default Fedora config:
# scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o
add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164)
Data old new delta
sysctl_vals - 12 +12
__kstrtab_sysctl_vals - 12 +12
max 14 10 -4
int_max 16 - -16
one 68 - -68
zero 128 28 -100
Total: Before=20583249, After=20583085, chg -0.00%
[mcroce@redhat.com: tipc: remove two unused variables]
Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com
[akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c]
[arnd@arndb.de: proc/sysctl: make firmware loader table conditional]
Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de
[akpm@linux-foundation.org: fix fs/eventpoll.c]
Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-18 15:58:50 -07:00
. extra1 = SYSCTL_ZERO ,
. extra2 = SYSCTL_INT_MAX ,
2013-07-24 10:39:06 +02:00
. proc_handler = proc_dointvec_minmax ,
2005-04-16 15:20:36 -07:00
} ,
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[ NEIGH_VAR_GC_THRESH3 ] = {
2005-04-16 15:20:36 -07:00
. procname = " gc_thresh3 " ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
proc/sysctl: add shared variables for range check
In the sysctl code the proc_dointvec_minmax() function is often used to
validate the user supplied value between an allowed range. This
function uses the extra1 and extra2 members from struct ctl_table as
minimum and maximum allowed value.
On sysctl handler declaration, in every source file there are some
readonly variables containing just an integer which address is assigned
to the extra1 and extra2 members, so the sysctl range is enforced.
The special values 0, 1 and INT_MAX are very often used as range
boundary, leading duplication of variables like zero=0, one=1,
int_max=INT_MAX in different source files:
$ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l
248
Add a const int array containing the most commonly used values, some
macros to refer more easily to the correct array member, and use them
instead of creating a local one for every object file.
This is the bloat-o-meter output comparing the old and new binary
compiled with the default Fedora config:
# scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o
add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164)
Data old new delta
sysctl_vals - 12 +12
__kstrtab_sysctl_vals - 12 +12
max 14 10 -4
int_max 16 - -16
one 68 - -68
zero 128 28 -100
Total: Before=20583249, After=20583085, chg -0.00%
[mcroce@redhat.com: tipc: remove two unused variables]
Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com
[akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c]
[arnd@arndb.de: proc/sysctl: make firmware loader table conditional]
Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de
[akpm@linux-foundation.org: fix fs/eventpoll.c]
Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-18 15:58:50 -07:00
. extra1 = SYSCTL_ZERO ,
. extra2 = SYSCTL_INT_MAX ,
2013-07-24 10:39:06 +02:00
. proc_handler = proc_dointvec_minmax ,
2005-04-16 15:20:36 -07:00
} ,
2007-12-02 00:08:16 +11:00
{ } ,
2005-04-16 15:20:36 -07:00
} ,
} ;
int neigh_sysctl_register ( struct net_device * dev , struct neigh_parms * p ,
2013-12-07 19:26:55 +01:00
proc_handler * handler )
2005-04-16 15:20:36 -07:00
{
2013-12-07 19:26:53 +01:00
int i ;
2007-12-02 00:06:34 +11:00
struct neigh_sysctl_table * t ;
2013-12-07 19:26:53 +01:00
const char * dev_name_source ;
2012-04-19 13:38:03 +00:00
char neigh_path [ sizeof ( " net//neigh/ " ) + IFNAMSIZ + IFNAMSIZ ] ;
2013-12-07 19:26:55 +01:00
char * p_name ;
2023-08-09 12:50:03 +02:00
size_t neigh_vars_size ;
2005-04-16 15:20:36 -07:00
2022-05-02 15:15:51 +03:00
t = kmemdup ( & neigh_sysctl_template , sizeof ( * t ) , GFP_KERNEL_ACCOUNT ) ;
2005-04-16 15:20:36 -07:00
if ( ! t )
2007-12-02 00:06:34 +11:00
goto err ;
2014-02-21 14:52:57 +01:00
for ( i = 0 ; i < NEIGH_VAR_GC_INTERVAL ; i + + ) {
2013-12-07 19:26:53 +01:00
t - > neigh_vars [ i ] . data + = ( long ) p ;
2013-12-07 19:26:54 +01:00
t - > neigh_vars [ i ] . extra1 = dev ;
2013-12-07 19:26:56 +01:00
t - > neigh_vars [ i ] . extra2 = p ;
2013-12-07 19:26:54 +01:00
}
2005-04-16 15:20:36 -07:00
2023-08-09 12:50:03 +02:00
neigh_vars_size = ARRAY_SIZE ( t - > neigh_vars ) ;
2005-04-16 15:20:36 -07:00
if ( dev ) {
dev_name_source = dev - > name ;
2007-10-18 03:05:25 -07:00
/* Terminate the table early */
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
memset ( & t - > neigh_vars [ NEIGH_VAR_GC_INTERVAL ] , 0 ,
sizeof ( t - > neigh_vars [ NEIGH_VAR_GC_INTERVAL ] ) ) ;
2023-08-09 12:50:03 +02:00
neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1 ;
2005-04-16 15:20:36 -07:00
} else {
2014-07-12 22:36:44 +02:00
struct neigh_table * tbl = p - > tbl ;
2012-04-19 13:38:03 +00:00
dev_name_source = " default " ;
2014-07-12 22:36:44 +02:00
t - > neigh_vars [ NEIGH_VAR_GC_INTERVAL ] . data = & tbl - > gc_interval ;
t - > neigh_vars [ NEIGH_VAR_GC_THRESH1 ] . data = & tbl - > gc_thresh1 ;
t - > neigh_vars [ NEIGH_VAR_GC_THRESH2 ] . data = & tbl - > gc_thresh2 ;
t - > neigh_vars [ NEIGH_VAR_GC_THRESH3 ] . data = & tbl - > gc_thresh3 ;
2005-04-16 15:20:36 -07:00
}
2009-11-05 13:32:03 -08:00
if ( handler ) {
2005-04-16 15:20:36 -07:00
/* RetransTime */
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t - > neigh_vars [ NEIGH_VAR_RETRANS_TIME ] . proc_handler = handler ;
2005-04-16 15:20:36 -07:00
/* ReachableTime */
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t - > neigh_vars [ NEIGH_VAR_BASE_REACHABLE_TIME ] . proc_handler = handler ;
2005-04-16 15:20:36 -07:00
/* RetransTime (in milliseconds)*/
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t - > neigh_vars [ NEIGH_VAR_RETRANS_TIME_MS ] . proc_handler = handler ;
2005-04-16 15:20:36 -07:00
/* ReachableTime (in milliseconds) */
neigh: new unresolved queue limits
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.
Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.
[PATCH V5 net-next] neigh: new unresolved queue limits
unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.
$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t - > neigh_vars [ NEIGH_VAR_BASE_REACHABLE_TIME_MS ] . proc_handler = handler ;
2015-01-14 04:22:39 +01:00
} else {
/* Those handlers will update p->reachable_time after
* base_reachable_time ( _ms ) is set to ensure the new timer starts being
* applied after the next neighbour update instead of waiting for
* neigh_periodic_work to update its value ( can be multiple minutes )
* So any handler that replaces them should do this as well
*/
/* ReachableTime */
t - > neigh_vars [ NEIGH_VAR_BASE_REACHABLE_TIME ] . proc_handler =
neigh_proc_base_reachable_time ;
/* ReachableTime (in milliseconds) */
t - > neigh_vars [ NEIGH_VAR_BASE_REACHABLE_TIME_MS ] . proc_handler =
neigh_proc_base_reachable_time ;
2005-04-16 15:20:36 -07:00
}
2013-12-07 19:26:55 +01:00
switch ( neigh_parms_family ( p ) ) {
case AF_INET :
p_name = " ipv4 " ;
break ;
case AF_INET6 :
p_name = " ipv6 " ;
break ;
default :
BUG ( ) ;
}
2012-04-19 13:38:03 +00:00
snprintf ( neigh_path , sizeof ( neigh_path ) , " net/%s/neigh/%s " ,
p_name , dev_name_source ) ;
2023-08-09 12:50:03 +02:00
t - > sysctl_header = register_net_sysctl_sz ( neigh_parms_net ( p ) ,
neigh_path , t - > neigh_vars ,
neigh_vars_size ) ;
2007-12-02 00:06:34 +11:00
if ( ! t - > sysctl_header )
2012-04-19 13:38:03 +00:00
goto free ;
2007-12-02 00:06:34 +11:00
2005-04-16 15:20:36 -07:00
p - > sysctl_table = t ;
return 0 ;
2007-12-02 00:06:34 +11:00
free :
2005-04-16 15:20:36 -07:00
kfree ( t ) ;
2007-12-02 00:06:34 +11:00
err :
return - ENOBUFS ;
2005-04-16 15:20:36 -07:00
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_sysctl_register ) ;
2005-04-16 15:20:36 -07:00
void neigh_sysctl_unregister ( struct neigh_parms * p )
{
if ( p - > sysctl_table ) {
struct neigh_sysctl_table * t = p - > sysctl_table ;
p - > sysctl_table = NULL ;
2012-04-19 13:24:33 +00:00
unregister_net_sysctl_table ( t - > sysctl_header ) ;
2005-04-16 15:20:36 -07:00
kfree ( t ) ;
}
}
2008-03-24 18:39:10 +09:00
EXPORT_SYMBOL ( neigh_sysctl_unregister ) ;
2005-04-16 15:20:36 -07:00
# endif /* CONFIG_SYSCTL */
2007-03-22 11:50:06 -07:00
/*
 * Subsystem init: register the PF_UNSPEC rtnetlink callbacks for the
 * neighbour (RTM_*NEIGH) and neighbour-table (RTM_*NEIGHTBL) messages.
 *
 * The results of rtnl_register() are not checked; this always returns 0.
 */
static int __init neigh_init(void)
{
	/* Neighbour entries */
	rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0);

	/* Neighbour tables */
	rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0);
	rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0);

	return 0;
}

subsys_initcall(neigh_init);