// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET3:	Garbage Collector For AF_UNIX sockets
 *
 * Garbage Collector:
 *	Copyright (C) Barak A. Pearlmutter.
 *
 * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem.
 * If it doesn't work blame me, it worked when Barak sent it.
 *
 * Assumptions:
 *
 *  - object w/ a bit
 *  - free list
 *
 * Current optimizations:
 *
 *  - explicit stack instead of recursion
 *  - tail recurse on first born instead of immediate push/pop
 *  - we gather the stuff that should not be killed into tree
 *    and stack is just a path from root to the current pointer.
 *
 * Future optimizations:
 *
 *  - don't just push entire root set; process in place
 *
 * Fixes:
 *	Alan Cox	07 Sept 1997	Vmalloc internal stack as needed.
 *					Cope with changing max_files.
 *	Al Viro		11 Oct 1998
 *		Graph may have cycles. That is, we can send the descriptor
 *		of foo to bar and vice versa. Current code chokes on that.
 *		Fix: move SCM_RIGHTS ones into the separate list and then
 *		skb_free() them all instead of doing explicit fput's.
 *		Another problem: since fput() may block somebody may
 *		create a new unix_socket when we are in the middle of sweep
 *		phase. Fix: revert the logic wrt MARKED. Mark everything
 *		upon the beginning and unmark non-junk ones.
 *
 *		[12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS
 *		sent to connect()'ed but still not accept()'ed sockets.
 *		Fixed. Old code had slightly different problem here:
 *		extra fput() in situation when we passed the descriptor via
 *		such socket and closed it (descriptor). That would happen on
 *		each unix_gc() until the accept(). Since the struct file in
 *		question would go to the free list and might be reused...
 *		That might be the reason of random oopses on filp_close()
 *		in unrelated processes.
 *
 *	AV		28 Feb 1999
 *		Kill the explicit allocation of stack. Now we keep the tree
 *		with root in dummy + pointer (gc_current) to one of the nodes.
 *		Stack is represented as path from gc_current to dummy. Unmark
 *		now means "add to tree". Push == "make it a son of gc_current".
 *		Pop == "move gc_current to parent". We keep only pointers to
 *		parents (->gc_tree).
 *	AV		1 Mar 1999
 *		Damn. Added missing check for ->dead in listen queues scanning.
 *
 *	Miklos Szeredi 25 Jun 2007
 *		Reimplement with a cycle collecting algorithm. This should
 *		solve several problems with the previous code, like being racy
 *		wrt receive and holding up unrelated socket operations.
 */

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/wait.h>

#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <net/tcp_states.h>

struct unix_sock *unix_get_socket(struct file *filp)
{
        struct inode *inode = file_inode(filp);

        /* Socket ? */
        if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
                struct socket *sock = SOCKET_I(inode);
                const struct proto_ops *ops;
                struct sock *sk = sock->sk;

                ops = READ_ONCE(sock->ops);

                /* PF_UNIX ? */
                if (sk && ops && ops->family == PF_UNIX)
                        return unix_sk(sk);
        }

        return NULL;
}
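
/* Example (illustrative sketch): a caller holding a struct file that was
 * passed via SCM_RIGHTS can use unix_get_socket() to decide whether the
 * file needs AF_UNIX in-flight accounting.  The helper name
 * example_needs_unix_accounting() is hypothetical and not part of this file.
 *
 *	static bool example_needs_unix_accounting(struct file *filp)
 *	{
 *		return unix_get_socket(filp) != NULL;
 *	}
 */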

DEFINE_SPINLOCK(unix_gc_lock);
unsigned int unix_tot_inflight;

static LIST_HEAD(gc_candidates);
static LIST_HEAD(gc_inflight_list);

/* Keep track of the number of times a file descriptor is in flight
 * if it refers to an AF_UNIX socket.
 */
void unix_inflight(struct user_struct *user, struct file *filp)
{
        struct unix_sock *u = unix_get_socket(filp);

        spin_lock(&unix_gc_lock);

        if (u) {
                if (!u->inflight) {
                        WARN_ON_ONCE(!list_empty(&u->link));
                        list_add_tail(&u->link, &gc_inflight_list);
                } else {
                        WARN_ON_ONCE(list_empty(&u->link));
                }
                u->inflight++;

                /* Paired with READ_ONCE() in wait_for_unix_gc() */
                WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
        }

        WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);

        spin_unlock(&unix_gc_lock);
}

void unix_notinflight(struct user_struct *user, struct file *filp)
{
        struct unix_sock *u = unix_get_socket(filp);

        spin_lock(&unix_gc_lock);

        if (u) {
                WARN_ON_ONCE(!u->inflight);
                WARN_ON_ONCE(list_empty(&u->link));

                u->inflight--;
                if (!u->inflight)
                        list_del_init(&u->link);

                /* Paired with READ_ONCE() in wait_for_unix_gc() */
                WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
        }

        WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);

        spin_unlock(&unix_gc_lock);
}
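
/* Example (illustrative sketch): unix_inflight() and unix_notinflight()
 * are meant to be called in pairs around the lifetime of an SCM_RIGHTS
 * reference.  The names example_user and example_filp below are
 * hypothetical placeholders, not identifiers used elsewhere in the tree.
 *
 *	unix_inflight(example_user, example_filp);
 *	... the file reference travels inside a queued skb ...
 *	unix_notinflight(example_user, example_filp);
 *
 * While the reference is in flight, u->inflight and unix_tot_inflight
 * stay elevated, which is what the candidate selection in __unix_gc()
 * below relies on.
 */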

static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
                          struct sk_buff_head *hitlist)
{
        struct sk_buff *skb;
        struct sk_buff *next;

        spin_lock(&x->sk_receive_queue.lock);
        skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
                /* Do we have file descriptors ? */
                if (UNIXCB(skb).fp) {
                        bool hit = false;
                        /* Process the descriptors of this socket */
                        int nfd = UNIXCB(skb).fp->count;
                        struct file **fp = UNIXCB(skb).fp->fp;

                        while (nfd--) {
                                /* Get the socket the fd matches if it indeed does so */
                                struct unix_sock *u = unix_get_socket(*fp++);

                                /* Ignore non-candidates, they could have been added
                                 * to the queues after starting the garbage collection
                                 */
                                if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
                                        hit = true;

                                        func(u);
                                }
                        }
                        if (hit && hitlist != NULL) {
                                __skb_unlink(skb, &x->sk_receive_queue);
                                __skb_queue_tail(hitlist, skb);
                        }
                }
        }
        spin_unlock(&x->sk_receive_queue.lock);
}
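
/* scan_inflight() is driven with one of the small adjusters defined
 * further below.  For example, the first pass of __unix_gc() effectively
 * does
 *
 *	scan_inflight(&u->sk, dec_inflight, NULL);
 *
 * (via scan_children()) to subtract the references that candidates hold
 * on each other, while the final pass supplies a non-NULL hitlist so the
 * cycle-forming skbs can be unlinked and purged.
 */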

static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
                          struct sk_buff_head *hitlist)
{
        if (x->sk_state != TCP_LISTEN) {
                scan_inflight(x, func, hitlist);
        } else {
                struct sk_buff *skb;
                struct sk_buff *next;
                struct unix_sock *u;
                LIST_HEAD(embryos);

                /* For a listening socket collect the queued embryos
                 * and perform a scan on them as well.
                 */
                spin_lock(&x->sk_receive_queue.lock);
                skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
                        u = unix_sk(skb->sk);

                        /* An embryo cannot be in-flight, so it's safe
                         * to use the list link.
                         */
                        WARN_ON_ONCE(!list_empty(&u->link));
                        list_add_tail(&u->link, &embryos);
                }
                spin_unlock(&x->sk_receive_queue.lock);

                while (!list_empty(&embryos)) {
                        u = list_entry(embryos.next, struct unix_sock, link);
                        scan_inflight(&u->sk, func, hitlist);
                        list_del_init(&u->link);
                }
        }
}

static void dec_inflight(struct unix_sock *usk)
{
        usk->inflight--;
}

static void inc_inflight(struct unix_sock *usk)
{
        usk->inflight++;
}

static void inc_inflight_move_tail(struct unix_sock *u)
{
        u->inflight++;

        /* If this still might be part of a cycle, move it to the end
         * of the list, so that it's checked even if it was already
         * passed over
         */
        if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
                list_move_tail(&u->link, &gc_candidates);
}
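
/* Worked example of the trial deletion these helpers implement: assume
 * two closed candidate sockets A and B (hypothetical) whose fds were sent
 * to each other, so each has inflight == file_count == 1.  The
 * dec_inflight pass subtracts the references held by candidate queues,
 * dropping both counts to zero; with nothing left to restore them, both
 * sockets stay in gc_candidates and their skbs go to the hitlist.  If A's
 * fd had additionally been queued on a reachable (non-candidate) socket,
 * that reference would not be subtracted, A->inflight would stay
 * non-zero, A would move to not_cycle_list, and the
 * inc_inflight_move_tail pass would restore the reference A's queue holds
 * on B, so both would survive.
 */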

static bool gc_in_progress;

static void __unix_gc(struct work_struct *work)
{
        struct sk_buff_head hitlist;
        struct unix_sock *u, *next;
        LIST_HEAD(not_cycle_list);
        struct list_head cursor;

        spin_lock(&unix_gc_lock);

        /* First, select candidates for garbage collection.  Only
         * in-flight sockets are considered, and from those only ones
         * which don't have any external reference.
         *
         * Holding unix_gc_lock will protect these candidates from
         * being detached, and hence from gaining an external
         * reference.  Since there are no possible receivers, all
         * buffers currently on the candidates' queues stay there
         * during the garbage collection.
         *
         * We also know that no new candidate can be added onto the
         * receive queues.  Other, non candidate sockets _can_ be
         * added to queue, so we must make sure only to touch
         * candidates.
         */
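
        /* Example: if file_count() for a socket's file is 2 and both
         * references are held by SCM_RIGHTS messages still sitting in
         * receive queues (u->inflight == 2), no user-space fd can reach
         * the socket any more, so it becomes a candidate.
         */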
        list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
                long total_refs;

                total_refs = file_count(u->sk.sk_socket->file);

                WARN_ON_ONCE(!u->inflight);
                WARN_ON_ONCE(total_refs < u->inflight);
                if (total_refs == u->inflight) {
                        list_move_tail(&u->link, &gc_candidates);
                        __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
                        __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
                }
        }

        /* Now remove all internal in-flight references to children of
         * the candidates.
         */
        list_for_each_entry(u, &gc_candidates, link)
                scan_children(&u->sk, dec_inflight, NULL);

        /* Restore the references for children of all candidates,
         * which have remaining references.  Do this recursively, so
         * only those remain, which form cyclic references.
         *
         * Use a "cursor" link, to make the list traversal safe, even
         * though elements might be moved about.
         */
        list_add(&cursor, &gc_candidates);
        while (cursor.next != &gc_candidates) {
                u = list_entry(cursor.next, struct unix_sock, link);

                /* Move cursor to after the current position. */
                list_move(&cursor, &u->link);

                if (u->inflight) {
                        list_move_tail(&u->link, &not_cycle_list);
                        __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
                        scan_children(&u->sk, inc_inflight_move_tail, NULL);
                }
        }
        list_del(&cursor);

        /* Now gc_candidates contains only garbage.  Restore original
         * inflight counters for these as well, and remove the skbuffs
         * which are creating the cycle(s).
         */
        skb_queue_head_init(&hitlist);
        list_for_each_entry(u, &gc_candidates, link) {
                scan_children(&u->sk, inc_inflight, &hitlist);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                if (u->oob_skb) {
                        kfree_skb(u->oob_skb);
                        u->oob_skb = NULL;
                }
#endif
        }

        /* not_cycle_list contains those sockets which do not make up a
         * cycle.  Restore these to the inflight list.
         */
        while (!list_empty(&not_cycle_list)) {
                u = list_entry(not_cycle_list.next, struct unix_sock, link);
                __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
                list_move_tail(&u->link, &gc_inflight_list);
        }

        spin_unlock(&unix_gc_lock);

        /* Here we are.  Hitlist is filled.  Die. */
        __skb_queue_purge(&hitlist);

        spin_lock(&unix_gc_lock);

        /* All candidates should have been detached by now. */
        WARN_ON_ONCE(!list_empty(&gc_candidates));

        /* Paired with READ_ONCE() in wait_for_unix_gc(). */
        WRITE_ONCE(gc_in_progress, false);

        spin_unlock(&unix_gc_lock);
}

static DECLARE_WORK(unix_gc_work, __unix_gc);

void unix_gc(void)
{
        WRITE_ONCE(gc_in_progress, true);
        queue_work(system_unbound_wq, &unix_gc_work);
}

#define UNIX_INFLIGHT_TRIGGER_GC 16000
#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)

void wait_for_unix_gc(struct scm_fp_list *fpl)
{
        /* If number of inflight sockets is insane,
         * force a garbage collect right now.
         *
         * Paired with the WRITE_ONCE() in unix_inflight(),
         * unix_notinflight(), and __unix_gc().
         */
        if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
            !READ_ONCE(gc_in_progress))
                unix_gc();

        /* Penalise users who want to send AF_UNIX sockets
         * but whose sockets have not been received yet.
         */
        if (!fpl || !fpl->count_unix ||
            READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
                return;

        if (READ_ONCE(gc_in_progress))
                flush_work(&unix_gc_work);
}
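
/* Example (illustrative sketch): the AF_UNIX sendmsg() paths are expected
 * to call wait_for_unix_gc() with the scm_fp_list being attached before
 * queueing the skb, roughly as below.  The function name
 * example_send_path() is hypothetical.
 *
 *	static void example_send_path(struct scm_cookie *scm)
 *	{
 *		wait_for_unix_gc(scm->fp);
 *		... build and queue the skb carrying scm->fp ...
 *	}
 *
 * Only senders that actually pass AF_UNIX fds (fpl->count_unix) and whose
 * user already has at least UNIX_INFLIGHT_SANE_USER sockets in flight end
 * up waiting for the work item to finish.
 */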