2019-05-20 10:19:03 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-17 02:20:36 +04:00
/*
* NET3 : Garbage Collector For AF_UNIX sockets
*
* Garbage Collector :
* Copyright ( C ) Barak A . Pearlmutter .
*
* Chopped about by Alan Cox 22 / 3 / 96 to make it fit the AF_UNIX socket problem .
* If it doesn ' t work blame me , it worked when Barak sent it .
*
* Assumptions :
*
* - object w / a bit
* - free list
*
* Current optimizations :
*
* - explicit stack instead of recursion
* - tail recurse on first born instead of immediate push / pop
* - we gather the stuff that should not be killed into tree
* and stack is just a path from root to the current pointer .
*
* Future optimizations :
*
* - don ' t just push entire root set ; process in place
*
* Fixes :
* Alan Cox 07 Sept 1997 Vmalloc internal stack as needed .
* Cope with changing max_files .
* Al Viro 11 Oct 1998
* Graph may have cycles . That is , we can send the descriptor
* of foo to bar and vice versa . Current code chokes on that .
* Fix : move SCM_RIGHTS ones into the separate list and then
* skb_free ( ) them all instead of doing explicit fput ' s .
* Another problem : since fput ( ) may block somebody may
* create a new unix_socket when we are in the middle of sweep
* phase . Fix : revert the logic wrt MARKED . Mark everything
* upon the beginning and unmark non - junk ones .
*
* [ 12 Oct 1998 ] AAARGH ! New code purges all SCM_RIGHTS
* sent to connect ( ) ' ed but still not accept ( ) ' ed sockets .
* Fixed . Old code had slightly different problem here :
* extra fput ( ) in situation when we passed the descriptor via
* such socket and closed it ( descriptor ) . That would happen on
* each unix_gc ( ) until the accept ( ) . Since the struct file in
* question would go to the free list and might be reused . . .
* That might be the reason of random oopses on filp_close ( )
* in unrelated processes .
*
* AV 28 Feb 1999
* Kill the explicit allocation of stack . Now we keep the tree
* with root in dummy + pointer ( gc_current ) to one of the nodes .
* Stack is represented as path from gc_current to dummy . Unmark
* now means " add to tree " . Push = = " make it a son of gc_current " .
* Pop = = " move gc_current to parent " . We keep only pointers to
* parents ( - > gc_tree ) .
* AV 1 Mar 1999
* Damn . Added missing check for - > dead in listen queues scanning .
*
2007-07-12 01:22:39 +04:00
* Miklos Szeredi 25 Jun 2007
* Reimplement with a cycle collecting algorithm . This should
* solve several problems with the previous code , like being racy
* wrt receive and holding up unrelated socket operations .
2005-04-17 02:20:36 +04:00
*/
2007-02-09 17:25:23 +03:00
2005-04-17 02:20:36 +04:00
# include <linux/kernel.h>
# include <linux/string.h>
# include <linux/socket.h>
# include <linux/un.h>
# include <linux/net.h>
# include <linux/fs.h>
# include <linux/skbuff.h>
# include <linux/netdevice.h>
# include <linux/file.h>
# include <linux/proc_fs.h>
2006-03-21 09:33:17 +03:00
# include <linux/mutex.h>
2008-11-27 02:32:27 +03:00
# include <linux/wait.h>
2005-04-17 02:20:36 +04:00
# include <net/sock.h>
# include <net/af_unix.h>
# include <net/scm.h>
2005-08-10 07:08:28 +04:00
# include <net/tcp_states.h>
2005-04-17 02:20:36 +04:00
2019-02-08 19:01:44 +03:00
# include "scm.h"
2005-04-17 02:20:36 +04:00
/* Internal data structures and random procedures: */
2007-07-12 01:22:39 +04:00
/* List onto which unix_gc() moves the sockets it selects as garbage
 * candidates; unix_gc() asserts that it is empty again before it
 * finishes a collection.
 */
static LIST_HEAD ( gc_candidates ) ;
2008-11-27 02:32:27 +03:00
/* Wait queue that wait_for_unix_gc() sleeps on; unix_gc() wakes it after
 * clearing gc_in_progress.
 */
static DECLARE_WAIT_QUEUE_HEAD ( unix_gc_wait ) ;
2005-04-17 02:20:36 +04:00
2007-11-11 09:07:13 +03:00
static void scan_inflight ( struct sock * x , void ( * func ) ( struct unix_sock * ) ,
2007-07-12 01:22:39 +04:00
struct sk_buff_head * hitlist )
2005-04-17 02:20:36 +04:00
{
2007-07-12 01:22:39 +04:00
struct sk_buff * skb ;
struct sk_buff * next ;
spin_lock ( & x - > sk_receive_queue . lock ) ;
2010-05-03 07:22:18 +04:00
skb_queue_walk_safe ( & x - > sk_receive_queue , skb , next ) {
2015-04-22 09:56:42 +03:00
/* Do we have file descriptors ? */
2007-07-12 01:22:39 +04:00
if ( UNIXCB ( skb ) . fp ) {
bool hit = false ;
2015-04-22 09:56:42 +03:00
/* Process the descriptors of this socket */
2007-07-12 01:22:39 +04:00
int nfd = UNIXCB ( skb ) . fp - > count ;
struct file * * fp = UNIXCB ( skb ) . fp - > fp ;
2015-04-22 09:56:42 +03:00
2007-07-12 01:22:39 +04:00
while ( nfd - - ) {
2015-04-22 09:56:42 +03:00
/* Get the socket the fd matches if it indeed does so */
2007-07-12 01:22:39 +04:00
struct sock * sk = unix_get_socket ( * fp + + ) ;
2015-04-22 09:56:42 +03:00
2007-11-11 09:07:13 +03:00
if ( sk ) {
2008-11-09 17:23:57 +03:00
struct unix_sock * u = unix_sk ( sk ) ;
2015-04-22 09:56:42 +03:00
/* Ignore non-candidates, they could
2008-11-09 17:23:57 +03:00
* have been added to the queues after
* starting the garbage collection
*/
2013-05-01 09:24:03 +04:00
if ( test_bit ( UNIX_GC_CANDIDATE , & u - > gc_flags ) ) {
2008-11-09 17:23:57 +03:00
hit = true ;
2015-04-22 09:56:42 +03:00
2008-11-09 17:23:57 +03:00
func ( u ) ;
}
2007-07-12 01:22:39 +04:00
}
}
if ( hit & & hitlist ! = NULL ) {
__skb_unlink ( skb , & x - > sk_receive_queue ) ;
__skb_queue_tail ( hitlist , skb ) ;
}
}
}
spin_unlock ( & x - > sk_receive_queue . lock ) ;
2005-04-17 02:20:36 +04:00
}
2007-11-11 09:07:13 +03:00
static void scan_children ( struct sock * x , void ( * func ) ( struct unix_sock * ) ,
2007-07-12 01:22:39 +04:00
struct sk_buff_head * hitlist )
2005-04-17 02:20:36 +04:00
{
2015-04-22 09:56:42 +03:00
if ( x - > sk_state ! = TCP_LISTEN ) {
2007-07-12 01:22:39 +04:00
scan_inflight ( x , func , hitlist ) ;
2015-04-22 09:56:42 +03:00
} else {
2007-07-12 01:22:39 +04:00
struct sk_buff * skb ;
struct sk_buff * next ;
struct unix_sock * u ;
LIST_HEAD ( embryos ) ;
2015-04-22 09:56:42 +03:00
/* For a listening socket collect the queued embryos
2007-07-12 01:22:39 +04:00
* and perform a scan on them as well .
*/
spin_lock ( & x - > sk_receive_queue . lock ) ;
2010-05-03 07:22:18 +04:00
skb_queue_walk_safe ( & x - > sk_receive_queue , skb , next ) {
2007-07-12 01:22:39 +04:00
u = unix_sk ( skb - > sk ) ;
2015-04-22 09:56:42 +03:00
/* An embryo cannot be in-flight, so it's safe
2007-07-12 01:22:39 +04:00
* to use the list link .
*/
BUG_ON ( ! list_empty ( & u - > link ) ) ;
list_add_tail ( & u - > link , & embryos ) ;
}
spin_unlock ( & x - > sk_receive_queue . lock ) ;
while ( ! list_empty ( & embryos ) ) {
u = list_entry ( embryos . next , struct unix_sock , link ) ;
scan_inflight ( & u - > sk , func , hitlist ) ;
list_del_init ( & u - > link ) ;
}
}
2005-04-17 02:20:36 +04:00
}
2007-11-11 09:07:13 +03:00
static void dec_inflight ( struct unix_sock * usk )
2005-04-17 02:20:36 +04:00
{
2008-07-26 08:39:17 +04:00
atomic_long_dec ( & usk - > inflight ) ;
2007-07-12 01:22:39 +04:00
}
2005-04-17 02:20:36 +04:00
2007-11-11 09:07:13 +03:00
static void inc_inflight ( struct unix_sock * usk )
2007-07-12 01:22:39 +04:00
{
2008-07-26 08:39:17 +04:00
atomic_long_inc ( & usk - > inflight ) ;
2005-04-17 02:20:36 +04:00
}
2007-11-11 09:07:13 +03:00
static void inc_inflight_move_tail ( struct unix_sock * u )
2007-07-12 01:22:39 +04:00
{
2008-07-26 08:39:17 +04:00
atomic_long_inc ( & u - > inflight ) ;
2015-04-22 09:56:42 +03:00
/* If this still might be part of a cycle, move it to the end
2008-11-09 17:23:57 +03:00
* of the list , so that it ' s checked even if it was already
* passed over
2007-07-12 01:22:39 +04:00
*/
2013-05-01 09:24:03 +04:00
if ( test_bit ( UNIX_GC_MAYBE_CYCLE , & u - > gc_flags ) )
2007-07-12 01:22:39 +04:00
list_move_tail ( & u - > link , & gc_candidates ) ;
}
2005-04-17 02:20:36 +04:00
2014-10-08 01:02:15 +04:00
/* True while unix_gc() is running a collection. unix_gc() updates it with
 * WRITE_ONCE() while holding unix_gc_lock; wait_for_unix_gc() reads it
 * without taking that lock.
 */
static bool gc_in_progress ;
2010-11-24 20:15:27 +03:00
/* wait_for_unix_gc() runs a collection itself once unix_tot_inflight
 * exceeds this value.
 */
# define UNIX_INFLIGHT_TRIGGER_GC 16000
2005-04-17 02:20:36 +04:00
2008-11-27 02:32:27 +03:00
void wait_for_unix_gc ( void )
2005-04-17 02:20:36 +04:00
{
2015-04-22 09:56:42 +03:00
/* If number of inflight sockets is insane,
2010-11-24 20:15:27 +03:00
* force a garbage collect right now .
2022-01-14 19:43:28 +03:00
* Paired with the WRITE_ONCE ( ) in unix_inflight ( ) ,
* unix_notinflight ( ) and gc_in_progress ( ) .
2010-11-24 20:15:27 +03:00
*/
2022-01-14 19:43:28 +03:00
if ( READ_ONCE ( unix_tot_inflight ) > UNIX_INFLIGHT_TRIGGER_GC & &
! READ_ONCE ( gc_in_progress ) )
2010-11-24 20:15:27 +03:00
unix_gc ( ) ;
2008-11-27 02:32:27 +03:00
wait_event ( unix_gc_wait , gc_in_progress = = false ) ;
}
2005-04-17 02:20:36 +04:00
2008-11-27 02:32:27 +03:00
/* The external entry point: unix_gc() */
void unix_gc ( void )
{
2022-10-03 15:59:47 +03:00
struct sk_buff * next_skb , * skb ;
2007-07-12 01:22:39 +04:00
struct unix_sock * u ;
struct unix_sock * next ;
struct sk_buff_head hitlist ;
struct list_head cursor ;
2008-11-09 17:23:57 +03:00
LIST_HEAD ( not_cycle_list ) ;
2005-04-17 02:20:36 +04:00
2007-07-12 01:22:39 +04:00
spin_lock ( & unix_gc_lock ) ;
2005-04-17 02:20:36 +04:00
2007-07-12 01:22:39 +04:00
/* Avoid a recursive GC. */
if ( gc_in_progress )
goto out ;
2005-04-17 02:20:36 +04:00
2022-01-14 19:43:28 +03:00
/* Paired with READ_ONCE() in wait_for_unix_gc(). */
WRITE_ONCE ( gc_in_progress , true ) ;
2015-04-22 09:56:42 +03:00
/* First, select candidates for garbage collection. Only
2007-07-12 01:22:39 +04:00
* in - flight sockets are considered , and from those only ones
* which don ' t have any external reference .
*
* Holding unix_gc_lock will protect these candidates from
* being detached , and hence from gaining an external
2008-11-09 17:23:57 +03:00
* reference . Since there are no possible receivers , all
* buffers currently on the candidates ' queues stay there
* during the garbage collection .
*
* We also know that no new candidate can be added onto the
* receive queues . Other , non candidate sockets _can_ be
* added to queue , so we must make sure only to touch
* candidates .
2005-04-17 02:20:36 +04:00
*/
2007-07-12 01:22:39 +04:00
list_for_each_entry_safe ( u , next , & gc_inflight_list , link ) {
2008-07-26 08:39:17 +04:00
long total_refs ;
long inflight_refs ;
2007-07-12 01:22:39 +04:00
total_refs = file_count ( u - > sk . sk_socket - > file ) ;
2008-07-26 08:39:17 +04:00
inflight_refs = atomic_long_read ( & u - > inflight ) ;
2007-07-12 01:22:39 +04:00
BUG_ON ( inflight_refs < 1 ) ;
BUG_ON ( total_refs < inflight_refs ) ;
if ( total_refs = = inflight_refs ) {
list_move_tail ( & u - > link , & gc_candidates ) ;
2013-05-01 09:24:03 +04:00
__set_bit ( UNIX_GC_CANDIDATE , & u - > gc_flags ) ;
__set_bit ( UNIX_GC_MAYBE_CYCLE , & u - > gc_flags ) ;
2007-07-12 01:22:39 +04:00
}
}
2005-04-17 02:20:36 +04:00
2015-04-22 09:56:42 +03:00
/* Now remove all internal in-flight reference to children of
2007-07-12 01:22:39 +04:00
* the candidates .
2005-04-17 02:20:36 +04:00
*/
2007-07-12 01:22:39 +04:00
list_for_each_entry ( u , & gc_candidates , link )
scan_children ( & u - > sk , dec_inflight , NULL ) ;
2005-04-17 02:20:36 +04:00
2015-04-22 09:56:42 +03:00
/* Restore the references for children of all candidates,
2007-07-12 01:22:39 +04:00
* which have remaining references . Do this recursively , so
* only those remain , which form cyclic references .
*
* Use a " cursor " link , to make the list traversal safe , even
* though elements might be moved about .
2005-04-17 02:20:36 +04:00
*/
2007-07-12 01:22:39 +04:00
list_add ( & cursor , & gc_candidates ) ;
while ( cursor . next ! = & gc_candidates ) {
u = list_entry ( cursor . next , struct unix_sock , link ) ;
2005-04-17 02:20:36 +04:00
2007-07-12 01:22:39 +04:00
/* Move cursor to after the current position. */
list_move ( & cursor , & u - > link ) ;
2007-02-09 17:25:23 +03:00
2008-07-26 08:39:17 +04:00
if ( atomic_long_read ( & u - > inflight ) > 0 ) {
2008-11-09 17:23:57 +03:00
list_move_tail ( & u - > link , & not_cycle_list ) ;
2013-05-01 09:24:03 +04:00
__clear_bit ( UNIX_GC_MAYBE_CYCLE , & u - > gc_flags ) ;
2007-07-12 01:22:39 +04:00
scan_children ( & u - > sk , inc_inflight_move_tail , NULL ) ;
2005-04-17 02:20:36 +04:00
}
}
2007-07-12 01:22:39 +04:00
list_del ( & cursor ) ;
2005-04-17 02:20:36 +04:00
2017-03-15 06:16:42 +03:00
/* Now gc_candidates contains only garbage. Restore original
* inflight counters for these as well , and remove the skbuffs
* which are creating the cycle ( s ) .
*/
skb_queue_head_init ( & hitlist ) ;
2024-02-19 20:46:57 +03:00
list_for_each_entry ( u , & gc_candidates , link ) {
2017-03-15 06:16:42 +03:00
scan_children ( & u - > sk , inc_inflight , & hitlist ) ;
2024-02-19 20:46:57 +03:00
# if IS_ENABLED(CONFIG_AF_UNIX_OOB)
if ( u - > oob_skb ) {
kfree_skb ( u - > oob_skb ) ;
u - > oob_skb = NULL ;
}
# endif
}
2015-04-22 09:56:42 +03:00
/* not_cycle_list contains those sockets which do not make up a
2008-11-09 17:23:57 +03:00
* cycle . Restore these to the inflight list .
*/
while ( ! list_empty ( & not_cycle_list ) ) {
u = list_entry ( not_cycle_list . next , struct unix_sock , link ) ;
2013-05-01 09:24:03 +04:00
__clear_bit ( UNIX_GC_CANDIDATE , & u - > gc_flags ) ;
2008-11-09 17:23:57 +03:00
list_move_tail ( & u - > link , & gc_inflight_list ) ;
}
2007-07-12 01:22:39 +04:00
spin_unlock ( & unix_gc_lock ) ;
2005-04-17 02:20:36 +04:00
2022-10-03 15:59:47 +03:00
/* We need io_uring to clean its registered files, ignore all io_uring
* originated skbs . It ' s fine as io_uring doesn ' t keep references to
* other io_uring instances and so killing all other files in the cycle
* will put all io_uring references forcing it to go through normal
* release . path eventually putting registered files .
*/
skb_queue_walk_safe ( & hitlist , skb , next_skb ) {
2023-03-07 17:59:59 +03:00
if ( skb - > destructor = = io_uring_destruct_scm ) {
2022-10-03 15:59:47 +03:00
__skb_unlink ( skb , & hitlist ) ;
skb_queue_tail ( & skb - > sk - > sk_receive_queue , skb ) ;
}
}
2007-07-12 01:22:39 +04:00
/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge ( & hitlist ) ;
2005-04-17 02:20:36 +04:00
2007-07-12 01:22:39 +04:00
spin_lock ( & unix_gc_lock ) ;
2005-04-17 02:20:36 +04:00
2022-10-03 15:59:47 +03:00
/* There could be io_uring registered files, just push them back to
* the inflight list
*/
list_for_each_entry_safe ( u , next , & gc_candidates , link )
list_move_tail ( & u - > link , & gc_inflight_list ) ;
2007-07-12 01:22:39 +04:00
/* All candidates should have been detached by now. */
BUG_ON ( ! list_empty ( & gc_candidates ) ) ;
2022-01-14 19:43:28 +03:00
/* Paired with READ_ONCE() in wait_for_unix_gc(). */
WRITE_ONCE ( gc_in_progress , false ) ;
2008-11-27 02:32:27 +03:00
wake_up ( & unix_gc_wait ) ;
2005-04-17 02:20:36 +04:00
2007-07-12 01:22:39 +04:00
out :
spin_unlock ( & unix_gc_lock ) ;
2005-04-17 02:20:36 +04:00
}