2019-05-27 09:55:01 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-17 02:20:36 +04:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Generic socket support routines . Memory allocators , socket lock / release
* handler for protocols to use and generic option handler .
*
2005-05-06 03:16:16 +04:00
* Authors : Ross Biro
2005-04-17 02:20:36 +04:00
* Fred N . van Kempen , < waltje @ uWalt . NL . Mugnet . ORG >
* Florian La Roche , < flla @ stud . uni - sb . de >
* Alan Cox , < A . Cox @ swansea . ac . uk >
*
* Fixes :
* Alan Cox : Numerous verify_area ( ) problems
* Alan Cox : Connecting on a connecting socket
* now returns an error for tcp .
* Alan Cox : sock - > protocol is set correctly .
* and is not sometimes left as 0.
* Alan Cox : connect handles icmp errors on a
* connect properly . Unfortunately there
* is a restart syscall nasty there . I
* can ' t match BSD without hacking the C
* library . Ideas urgently sought !
* Alan Cox : Disallow bind ( ) to addresses that are
* not ours - especially broadcast ones ! !
* Alan Cox : Socket 1024 _IS_ ok for users . ( fencepost )
* Alan Cox : sock_wfree / sock_rfree don ' t destroy sockets ,
* instead they leave that for the DESTROY timer .
* Alan Cox : Clean up error flag in accept
* Alan Cox : TCP ack handling is buggy , the DESTROY timer
* was buggy . Put a remove_sock ( ) in the handler
* for memory when we hit 0. Also altered the timer
2007-02-09 17:24:36 +03:00
* code . The ACK stuff can wait and needs major
2005-04-17 02:20:36 +04:00
* TCP layer surgery .
* Alan Cox : Fixed TCP ack bug , removed remove sock
* and fixed timer / inet_bh race .
* Alan Cox : Added zapped flag for TCP
* Alan Cox : Move kfree_skb into skbuff . c and tidied up surplus code
* Alan Cox : for new sk_buff allocations wmalloc / rmalloc now call alloc_skb
* Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
* Alan Cox : Supports socket option broadcast now as does udp . Packet and raw need fixing .
* Alan Cox : Added RCVBUF , SNDBUF size setting . It suddenly occurred to me how easy it was so . . .
* Rick Sladkey : Relaxed UDP rules for matching packets .
* C . E . Hawkins : IFF_PROMISC / SIOCGHWADDR support
* Pauline Middelink : identd support
* Alan Cox : Fixed connect ( ) taking signals I think .
* Alan Cox : SO_LINGER supported
* Alan Cox : Error reporting fixes
* Anonymous : inet_create tidied up ( sk - > reuse setting )
* Alan Cox : inet sockets don ' t set sk - > type !
* Alan Cox : Split socket option code
* Alan Cox : Callbacks
* Alan Cox : Nagle flag for Charles & Johannes stuff
* Alex : Removed restriction on inet fioctl
* Alan Cox : Splitting INET from NET core
* Alan Cox : Fixed bogus SO_TYPE handling in getsockopt ( )
* Adam Caldwell : Missing return in SO_DONTROUTE / SO_DEBUG code
* Alan Cox : Split IP from generic code
* Alan Cox : New kfree_skbmem ( )
* Alan Cox : Make SO_DEBUG superuser only .
* Alan Cox : Allow anyone to clear SO_DEBUG
* ( compatibility fix )
* Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput .
* Alan Cox : Allocator for a socket is settable .
* Alan Cox : SO_ERROR includes soft errors .
* Alan Cox : Allow NULL arguments on some SO_ opts
* Alan Cox : Generic socket allocation to make hooks
* easier ( suggested by Craig Metz ) .
* Michael Pall : SO_ERROR returns positive errno again
* Steve Whitehouse : Added default destructor to free
* protocol private data .
* Steve Whitehouse : Added various other default routines
* common to several socket families .
* Chris Evans : Call suser ( ) check last on F_SETOWN
* Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER .
* Andi Kleen : Add sock_kmalloc ( ) / sock_kfree_s ( )
* Andi Kleen : Fix write_space callback
* Chris Evans : Security fixes - signedness again
* Arnaldo C . Melo : cleanups , use skb_queue_purge
*
* To Fix :
*/
2012-05-16 23:58:40 +04:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2018-07-04 01:42:48 +03:00
# include <asm/unaligned.h>
2006-01-11 23:17:47 +03:00
# include <linux/capability.h>
2005-04-17 02:20:36 +04:00
# include <linux/errno.h>
2013-07-19 21:40:09 +04:00
# include <linux/errqueue.h>
2005-04-17 02:20:36 +04:00
# include <linux/types.h>
# include <linux/socket.h>
# include <linux/in.h>
# include <linux/kernel.h>
# include <linux/module.h>
# include <linux/proc_fs.h>
# include <linux/seq_file.h>
# include <linux/sched.h>
2017-05-09 01:59:53 +03:00
# include <linux/sched/mm.h>
2005-04-17 02:20:36 +04:00
# include <linux/timer.h>
# include <linux/string.h>
# include <linux/sockios.h>
# include <linux/net.h>
# include <linux/mm.h>
# include <linux/slab.h>
# include <linux/interrupt.h>
# include <linux/poll.h>
# include <linux/tcp.h>
# include <linux/init.h>
2006-10-20 00:08:53 +04:00
# include <linux/highmem.h>
2010-06-13 07:28:59 +04:00
# include <linux/user_namespace.h>
2012-02-24 11:31:31 +04:00
# include <linux/static_key.h>
2012-01-10 01:44:23 +04:00
# include <linux/memcontrol.h>
2012-05-03 10:25:55 +04:00
# include <linux/prefetch.h>
2020-07-22 10:40:27 +03:00
# include <linux/compat.h>
2005-04-17 02:20:36 +04:00
2016-12-24 22:46:01 +03:00
# include <linux/uaccess.h>
2005-04-17 02:20:36 +04:00
# include <linux/netdevice.h>
# include <net/protocol.h>
# include <linux/skbuff.h>
2007-09-12 14:01:34 +04:00
# include <net/net_namespace.h>
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basically tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
# include <net/request_sock.h>
2005-04-17 02:20:36 +04:00
# include <net/sock.h>
2009-02-12 08:03:38 +03:00
# include <linux/net_tstamp.h>
2005-04-17 02:20:36 +04:00
# include <net/xfrm.h>
# include <linux/ipsec.h>
cls_cgroup: Store classid in struct sock
Up until now cls_cgroup has relied on fetching the classid out of
the current executing thread. This runs into trouble when a packet
processing is delayed in which case it may execute out of another
thread's context.
Furthermore, even when a packet is not delayed we may fail to
classify it if soft IRQs have been disabled, because this scenario
is indistinguishable from one where a packet unrelated to the
current thread is processed by a real soft IRQ.
In fact, the current semantics is inherently broken, as a single
skb may be constructed out of the writes of two different tasks.
A different manifestation of this problem is when the TCP stack
transmits in response to an incoming ACK. This is currently
unclassified.
As we already have a concept of packet ownership for accounting
purposes in the skb->sk pointer, this is a natural place to store
the classid in a persistent manner.
This patch adds the cls_cgroup classid in struct sock, filling up
an existing hole on 64-bit :)
The value is set at socket creation time. So all sockets created
via socket(2) automatically gain the ID of the thread creating them.
Whenever another process touches the socket by either reading or
writing to it, we will change the socket classid to that of the
process if it has a valid (non-zero) classid.
For sockets created on inbound connections through accept(2), we
inherit the classid of the original listening socket through
sk_clone, possibly preceding the actual accept(2) call.
In order to minimise risks, I have not made this the authoritative
classid. For now it is only used as a backup when we execute
with soft IRQs disabled. Once we're completely happy with its
semantics we can use it as the sole classid.
Footnote: I have rearranged the error path on cls_group module
creation. If we didn't do this, then there is a window where
someone could create a tc rule using cls_group before the cgroup
subsystem has been registered.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-24 11:12:34 +04:00
# include <net/cls_cgroup.h>
2011-11-22 09:10:51 +04:00
# include <net/netprio_cgroup.h>
2015-06-15 18:26:18 +03:00
# include <linux/sock_diag.h>
2005-04-17 02:20:36 +04:00
# include <linux/filter.h>
2016-01-05 01:41:47 +03:00
# include <net/sock_reuseport.h>
bpf: Introduce bpf sk local storage
After allowing a bpf prog to
- directly read the skb->sk ptr
- get the fullsock bpf_sock by "bpf_sk_fullsock()"
- get the bpf_tcp_sock by "bpf_tcp_sock()"
- get the listener sock by "bpf_get_listener_sock()"
- avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock"
into different bpf running context.
this patch is another effort to make bpf's network programming
more intuitive to do (together with memory and performance benefit).
When bpf prog needs to store data for a sk, the current practice is to
define a map with the usual 4-tuples (src/dst ip/port) as the key.
If multiple bpf progs require to store different sk data, multiple maps
have to be defined. Hence, wasting memory to store the duplicated
keys (i.e. 4 tuples here) in each of the bpf map.
[ The smallest key could be the sk pointer itself which requires
some enhancement in the verifier and it is a separate topic. ]
Also, the bpf prog needs to clean up the elem when sk is freed.
Otherwise, the bpf map will become full and un-usable quickly.
The sk-free tracking currently could be done during sk state
transition (e.g. BPF_SOCK_OPS_STATE_CB).
The size of the map needs to be predefined which then usually ended-up
with an over-provisioned map in production. Even the map was re-sizable,
while the sk naturally come and go away already, this potential re-size
operation is arguably redundant if the data can be directly connected
to the sk itself instead of proxy-ing through a bpf map.
This patch introduces sk->sk_bpf_storage to provide local storage space
at sk for bpf prog to use. The space will be allocated when the first bpf
prog has created data for this particular sk.
The design optimizes the bpf prog's lookup (and then optionally followed by
an inline update). bpf_spin_lock should be used if the inline update needs
to be protected.
BPF_MAP_TYPE_SK_STORAGE:
-----------------------
To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in
this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE maps can
be created to fit different bpf progs' needs. The map enforces
BTF to allow printing the sk-local-storage during a system-wise
sk dump (e.g. "ss -ta") in the future.
The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete
a "sk-local-storage" data from a particular sk.
Think of the map as a meta-data (or "type") of a "sk-local-storage". This
particular "type" of "sk-local-storage" data can then be stored in any sk.
The main purposes of this map are mostly:
1. Define the size of a "sk-local-storage" type.
2. Provide a similar syscall userspace API as the map (e.g. lookup/update,
map-id, map-btf...etc.)
3. Keep track of all sk's storages of this "type" and clean them up
when the map is freed.
sk->sk_bpf_storage:
------------------
The main lookup/update/delete is done on sk->sk_bpf_storage (which
is a "struct bpf_sk_storage"). When doing a lookup,
the "map" pointer is now used as the "key" to search on the
sk_storage->list. The "map" pointer is actually serving
as the "type" of the "sk-local-storage" that is being
requested.
To allow very fast lookup, it should be as fast as looking up an
array at a stable-offset. At the same time, it is not ideal to
set a hard limit on the number of sk-local-storage "type" that the
system can have. Hence, this patch takes a cache approach.
The last search result from sk_storage->list is cached in
sk_storage->cache[] which is a stable sized array. Each
"sk-local-storage" type has a stable offset to the cache[] array.
In the future, a map's flag could be introduced to do cache
opt-out/enforcement if it became necessary.
The cache size is 16 (i.e. 16 types of "sk-local-storage").
Programs can share map. On the program side, having a few bpf_progs
running in the networking hotpath is already a lot. The bpf_prog
should have already consolidated the existing sock-key-ed map usage
to minimize the map lookup penalty. 16 has enough runway to grow.
All sk-local-storage data will be removed from sk->sk_bpf_storage
during sk destruction.
bpf_sk_storage_get() and bpf_sk_storage_delete():
------------------------------------------------
Instead of using bpf_map_(lookup|update|delete)_elem(),
the bpf prog needs to use the new helper bpf_sk_storage_get() and
bpf_sk_storage_delete(). The verifier can then enforce the
ARG_PTR_TO_SOCKET argument. The bpf_sk_storage_get() also allows to
"create" new elem if one does not exist in the sk. It is done by
the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value can also be
provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE.
The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock. Together,
it has eliminated the potential use cases for an equivalent
bpf_map_update_elem() API (for bpf_prog) in this patch.
Misc notes:
----------
1. map_get_next_key is not supported. From the userspace syscall
perspective, the map has the socket fd as the key while the map
can be shared by pinned-file or map-id.
Since btf is enforced, the existing "ss" could be enhanced to pretty
print the local-storage.
Supporting a kernel defined btf with 4 tuples as the return key could
be explored later also.
2. The sk->sk_lock cannot be acquired. Atomic operations is used instead.
e.g. cmpxchg is done on the sk->sk_bpf_storage ptr.
Please refer to the source code comments for the details in
synchronization cases and considerations.
3. The mem is charged to the sk->sk_omem_alloc as the sk filter does.
Benchmark:
---------
Here is the benchmark data collected by turning on
the "kernel.bpf_stats_enabled" sysctl.
Two bpf progs are tested:
One bpf prog with the usual bpf hashmap (max_entries = 8192) with the
sk ptr as the key. (The verifier is modified to support sk ptr as the key.
That should have shortened the key lookup time.)
Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE.
Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for
each egress skb and then bump the cnt. netperf is used to drive
data with 4096 connected UDP sockets.
BPF_MAP_TYPE_HASH with a modified verifier (152ns per bpf run)
27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633
loaded_at 2019-04-15T13:46:39-0700 uid 0
xlated 344B jited 258B memlock 4096B map_ids 16
btf_id 5
BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run)
30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739
loaded_at 2019-04-15T13:47:54-0700 uid 0
xlated 168B jited 156B memlock 4096B map_ids 17
btf_id 6
Here is a high-level picture on how are the objects organized:
sk
┌──────┐
│ │
│ │
│ │
│*sk_bpf_storage─────▶ bpf_sk_storage
└──────┘ ┌───────┐
┌───────────┤ list │
│ │ │
│ │ │
│ │ │
│ └───────┘
│
│ elem
│ ┌────────┐
├─▶│ snode │
│ ├────────┤
│ │ data │ bpf_map
│ ├────────┤ ┌─────────┐
│ │map_node│◀─┬─────┤ list │
│ └────────┘ │ │ │
│ │ │ │
│ elem │ │ │
│ ┌────────┐ │ └─────────┘
└─▶│ snode │ │
├────────┤ │
bpf_map │ data │ │
┌─────────┐ ├────────┤ │
│ list ├───────▶│map_node│ │
│ │ └────────┘ │
│ │ │
│ │ elem │
└─────────┘ ┌────────┐ │
┌─▶│ snode │ │
│ ├────────┤ │
│ │ data │ │
│ ├────────┤ │
│ │map_node│◀─┘
│ └────────┘
│
│
│ ┌───────┐
sk └──────────│ list │
┌──────┐ │ │
│ │ │ │
│ │ │ │
│ │ └───────┘
│*sk_bpf_storage───────▶bpf_sk_storage
└──────┘
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2019-04-27 02:39:39 +03:00
# include <net/bpf_sk_storage.h>
2005-04-17 02:20:36 +04:00
2011-06-17 16:00:03 +04:00
# include <trace/events/sock.h>
2005-04-17 02:20:36 +04:00
# include <net/tcp.h>
2013-07-10 18:13:17 +04:00
# include <net/busy_poll.h>
2013-06-10 12:39:50 +04:00
2021-06-30 11:11:59 +03:00
# include <linux/ethtool.h>
2011-12-16 04:51:59 +04:00
static DEFINE_MUTEX ( proto_list_mutex ) ;
2011-12-12 01:47:04 +04:00
static LIST_HEAD ( proto_list ) ;
2014-04-24 01:26:56 +04:00
/**
* sk_ns_capable - General socket capability test
* @sk: Socket to use a capability on or through
* @user_ns: The user namespace of the capability to use
* @cap: The capability to use
*
* Test whether both the opener of the socket (at the time the socket was
* created) and the current process have the capability @cap in the user
* namespace @user_ns.
*
* Note: sk->sk_socket is dereferenced unconditionally to reach the file
* that is handed to file_ns_capable().
*
* Return: true only if both file_ns_capable(), called on
* sk->sk_socket->file, and ns_capable() report @cap in @user_ns.
*/
bool sk_ns_capable ( const struct sock * sk ,
struct user_namespace * user_ns , int cap )
{
/*
* Both checks must pass; the logical AND short-circuits, so
* ns_capable() is only evaluated when the file check succeeded.
*/
return file_ns_capable ( sk - > sk_socket - > file , user_ns , cap ) & &
ns_capable ( user_ns , cap ) ;
}
EXPORT_SYMBOL ( sk_ns_capable ) ;
/**
* sk_capable - Socket global capability test
* @sk: Socket to use a capability on or through
2014-09-04 18:44:36 +04:00
* @cap: The global capability to use
2014-04-24 01:26:56 +04:00
*
* Test whether both the opener of the socket (at the time the socket was
* created) and the current process have the capability @cap in all user
* namespaces.
*
* Return: the result of sk_ns_capable() evaluated against init_user_ns.
*/
bool sk_capable ( const struct sock * sk , int cap )
{
/* Delegate to the namespace-aware test, pinned to init_user_ns. */
return sk_ns_capable ( sk , & init_user_ns , cap ) ;
}
EXPORT_SYMBOL ( sk_capable ) ;
/**
* sk_net_capable - Network namespace socket capability test
* @sk: Socket to use a capability on or through
* @cap: The capability to use
*
2014-09-04 18:44:36 +04:00
* Test whether both the opener of the socket (at the time the socket was
2014-04-24 01:26:56 +04:00
* created) and the current process have the capability @cap over the
* network namespace the socket is a member of.
*
* Return: the result of sk_ns_capable() evaluated against
* sock_net(sk)->user_ns.
*/
bool sk_net_capable ( const struct sock * sk , int cap )
{
/* The user namespace is taken from the socket's network namespace. */
return sk_ns_capable ( sk , sock_net ( sk ) - > user_ns , cap ) ;
}
EXPORT_SYMBOL ( sk_net_capable ) ;
2006-07-03 11:25:12 +04:00
/*
* Each address family might have different locking rules , so we have
2017-03-09 11:09:05 +03:00
* one slock key per address family and separate keys for internal and
* userspace sockets .
2006-07-03 11:25:12 +04:00
*/
2006-07-03 11:25:35 +04:00
static struct lock_class_key af_family_keys [ AF_MAX ] ;
2017-03-09 11:09:05 +03:00
static struct lock_class_key af_family_kern_keys [ AF_MAX ] ;
2006-07-03 11:25:35 +04:00
static struct lock_class_key af_family_slock_keys [ AF_MAX ] ;
2017-03-09 11:09:05 +03:00
static struct lock_class_key af_family_kern_slock_keys [ AF_MAX ] ;
2006-07-03 11:25:35 +04:00
/*
* Make lock validator output more readable. (We pre-construct these
* strings at build time, so that runtime initialization of socket
* locks is fast.)
*
* _sock_locks(x) expands to a comma-separated list of string literals,
* each formed by placing the prefix literal @x next to a per-family name
* so that the compiler concatenates the two. Slots without a family name
* are labelled by number (27, 28) and the list ends with an AF_MAX entry.
*
* NOTE(review): the IUCV entry lacks the AF_ prefix carried by the other
* named entries - confirm whether that is intentional.
*/
2017-03-09 11:09:05 +03:00
# define _sock_locks(x) \
x " AF_UNSPEC " , x " AF_UNIX " , x " AF_INET " , \
x " AF_AX25 " , x " AF_IPX " , x " AF_APPLETALK " , \
x " AF_NETROM " , x " AF_BRIDGE " , x " AF_ATMPVC " , \
x " AF_X25 " , x " AF_INET6 " , x " AF_ROSE " , \
x " AF_DECnet " , x " AF_NETBEUI " , x " AF_SECURITY " , \
x " AF_KEY " , x " AF_NETLINK " , x " AF_PACKET " , \
x " AF_ASH " , x " AF_ECONET " , x " AF_ATMSVC " , \
x " AF_RDS " , x " AF_SNA " , x " AF_IRDA " , \
x " AF_PPPOX " , x " AF_WANPIPE " , x " AF_LLC " , \
x " 27 " , x " 28 " , x " AF_CAN " , \
x " AF_TIPC " , x " AF_BLUETOOTH " , x " IUCV " , \
x " AF_RXRPC " , x " AF_ISDN " , x " AF_PHONET " , \
x " AF_IEEE802154 " , x " AF_CAIF " , x " AF_ALG " , \
x " AF_NFC " , x " AF_VSOCK " , x " AF_KCM " , \
2018-05-02 14:01:22 +03:00
x " AF_QIPCRTR " , x " AF_SMC " , x " AF_XDP " , \
2021-07-29 05:20:39 +03:00
x " AF_MCTP " , \
2018-05-02 14:01:22 +03:00
x " AF_MAX "
2017-03-09 11:09:05 +03:00
2009-08-05 21:42:58 +04:00
/*
* Name tables generated from _sock_locks(). Every table has AF_MAX + 1
* entries and the tables differ only in the prefix handed to the macro.
* The "k-" prefixed variants presumably name the locks of kernel-internal
* sockets (cf. the separate internal/userspace keys declared above) -
* confirm at the use sites, which are not part of this view.
*/
static const char * const af_family_key_strings [ AF_MAX + 1 ] = {
2017-03-09 11:09:05 +03:00
_sock_locks ( " sk_lock- " )
2006-07-03 11:25:35 +04:00
} ;
2009-08-05 21:42:58 +04:00
static const char * const af_family_slock_key_strings [ AF_MAX + 1 ] = {
2017-03-09 11:09:05 +03:00
_sock_locks ( " slock- " )
2006-07-03 11:25:35 +04:00
} ;
2009-08-05 21:42:58 +04:00
static const char * const af_family_clock_key_strings [ AF_MAX + 1 ] = {
2017-03-09 11:09:05 +03:00
_sock_locks ( " clock- " )
} ;
static const char * const af_family_kern_key_strings [ AF_MAX + 1 ] = {
_sock_locks ( " k-sk_lock- " )
} ;
static const char * const af_family_kern_slock_key_strings [ AF_MAX + 1 ] = {
_sock_locks ( " k-slock- " )
} ;
static const char * const af_family_kern_clock_key_strings [ AF_MAX + 1 ] = {
_sock_locks ( " k-clock- " )
2007-07-19 12:49:00 +04:00
} ;
net/socket: use per af lockdep classes for sk queues
Currently the sock queue's spin locks get their lockdep
classes by the default init_spin_lock() initializer:
all socket families get - usually, see below - a single
class for rx, another specific class for tx, etc.
This can lead to false positive lockdep splat, as
reported by Andrey.
Moreover there are two separate initialization points
for the sock queues, one in sk_clone_lock() and one
in sock_init_data(), so that e.g. the rx queue lock
can get one of two possible, different classes, depending
on the socket being cloned or not.
This change tries to address the above, setting explicitly
a per address family lockdep class for each queue's
spinlock. Also, move the duplicated initialization code to a
single location.
v1 -> v2:
- renamed the init helper
rfc -> v1:
- no changes, tested with several different workload
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 15:54:08 +03:00
static const char * const af_family_rlock_key_strings [ AF_MAX + 1 ] = {
2018-08-02 19:14:33 +03:00
_sock_locks ( " rlock- " )
net/socket: use per af lockdep classes for sk queues
Currently the sock queue's spin locks get their lockdep
classes by the default init_spin_lock() initializer:
all socket families get - usually, see below - a single
class for rx, another specific class for tx, etc.
This can lead to false positive lockdep splat, as
reported by Andrey.
Moreover there are two separate initialization points
for the sock queues, one in sk_clone_lock() and one
in sock_init_data(), so that e.g. the rx queue lock
can get one of two possible, different classes, depending
on the socket being cloned or not.
This change tries to address the above, setting explicitly
a per address family lockdep class for each queue's
spinlock. Also, move the duplicated initialization code to a
single location.
v1 -> v2:
- renamed the init helper
rfc -> v1:
- no changes, tested with several different workload
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 15:54:08 +03:00
} ;
static const char * const af_family_wlock_key_strings [ AF_MAX + 1 ] = {
2018-08-02 19:14:33 +03:00
_sock_locks ( " wlock- " )
net/socket: use per af lockdep classes for sk queues
Currently the sock queue's spin locks get their lockdep
classes by the default init_spin_lock() initializer:
all socket families get - usually, see below - a single
class for rx, another specific class for tx, etc.
This can lead to false positive lockdep splat, as
reported by Andrey.
Moreover there are two separate initialization points
for the sock queues, one in sk_clone_lock() and one
in sock_init_data(), so that e.g. the rx queue lock
can get one of two possible, different classes, depending
on the socket being cloned or not.
This change tries to address the above, setting explicitly
a per address family lockdep class for each queue's
spinlock. Also, move the duplicated initialization code to a
single location.
v1 -> v2:
- renamed the init helper
rfc -> v1:
- no changes, tested with several different workload
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 15:54:08 +03:00
} ;
static const char * const af_family_elock_key_strings [ AF_MAX + 1 ] = {
2018-08-02 19:14:33 +03:00
_sock_locks ( " elock- " )
net/socket: use per af lockdep classes for sk queues
Currently the sock queue's spin locks get their lockdep
classes by the default init_spin_lock() initializer:
all socket families get - usually, see below - a single
class for rx, another specific class for tx, etc.
This can lead to false positive lockdep splat, as
reported by Andrey.
Moreover there are two separate initialization points
for the sock queues, one in sk_clone_lock() and one
in sock_init_data(), so that e.g. the rx queue lock
can get one of two possible, different classes, depending
on the socket being cloned or not.
This change tries to address the above, setting explicitly
a per address family lockdep class for each queue's
spinlock. Also, move the duplicated initialization code to a
single location.
v1 -> v2:
- renamed the init helper
rfc -> v1:
- no changes, tested with several different workload
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 15:54:08 +03:00
} ;
2006-07-03 11:25:12 +04:00
/*
net/socket: use per af lockdep classes for sk queues
Currently the sock queue's spin locks get their lockdep
classes by the default init_spin_lock() initializer:
all socket families get - usually, see below - a single
class for rx, another specific class for tx, etc.
This can lead to false positive lockdep splat, as
reported by Andrey.
Moreover there are two separate initialization points
for the sock queues, one in sk_clone_lock() and one
in sock_init_data(), so that e.g. the rx queue lock
can get one of two possible, different classes, depending
on the socket being cloned or not.
This change tries to address the above, setting explicitly
a per address family lockdep class for each queue's
spinlock. Also, move the duplicated initialization code to a
single location.
v1 -> v2:
- renamed the init helper
rfc -> v1:
- no changes, tested with several different workload
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 15:54:08 +03:00
* sk_callback_lock and sk queues locking rules are per - address - family ,
2006-07-03 11:25:12 +04:00
* so split the lock classes by using a per - AF key :
*/
static struct lock_class_key af_callback_keys [ AF_MAX ] ;
net/socket: use per af lockdep classes for sk queues
Currently the sock queue's spin locks get their lockdep
classes by the default init_spin_lock() initializer:
all socket families get - usually, see below - a single
class for rx, another specific class for tx, etc.
This can lead to false positive lockdep splat, as
reported by Andrey.
Moreover there are two separate initialization points
for the sock queues, one in sk_clone_lock() and one
in sock_init_data(), so that e.g. the rx queue lock
can get one of two possible, different classes, depending
on the socket being cloned or not.
This change tries to address the above, setting explicitly
a per address family lockdep class for each queue's
spinlock. Also, move the duplicated initialization code to a
single location.
v1 -> v2:
- renamed the init helper
rfc -> v1:
- no changes, tested with several different workload
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 15:54:08 +03:00
static struct lock_class_key af_rlock_keys [ AF_MAX ] ;
static struct lock_class_key af_wlock_keys [ AF_MAX ] ;
static struct lock_class_key af_elock_keys [ AF_MAX ] ;
2017-03-09 11:09:05 +03:00
static struct lock_class_key af_kern_callback_keys [ AF_MAX ] ;
2006-07-03 11:25:12 +04:00
2005-04-17 02:20:36 +04:00
/* Run time adjustable parameters. */
2006-09-23 01:15:41 +04:00
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX ;
2012-04-30 10:13:50 +04:00
EXPORT_SYMBOL ( sysctl_wmem_max ) ;
2006-09-23 01:15:41 +04:00
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX ;
2012-04-30 10:13:50 +04:00
EXPORT_SYMBOL ( sysctl_rmem_max ) ;
2006-09-23 01:15:41 +04:00
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX ;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX ;
2005-04-17 02:20:36 +04:00
2011-03-31 05:57:33 +04:00
/* Maximal space eaten by iovec or ancillary data plus some space */
2006-09-23 01:15:41 +04:00
int sysctl_optmem_max __read_mostly = sizeof ( unsigned long ) * ( 2 * UIO_MAXIOV + 512 ) ;
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sysctl_optmem_max ) ;
2005-04-17 02:20:36 +04:00
2015-01-30 21:29:32 +03:00
int sysctl_tstamp_allow_data __read_mostly = 1 ;
2018-05-08 19:06:59 +03:00
DEFINE_STATIC_KEY_FALSE ( memalloc_socks_key ) ;
EXPORT_SYMBOL_GPL ( memalloc_socks_key ) ;
2012-08-01 03:44:19 +04:00
2012-08-01 03:44:16 +04:00
/**
* sk_set_memalloc - sets %SOCK_MEMALLOC
* @sk: socket to set it on
*
* Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
* It's the responsibility of the admin to adjust min_free_kbytes
* to meet the requirements.
*
* Three things are updated: the SOCK_MEMALLOC socket flag, the
* __GFP_MEMALLOC bit in sk->sk_allocation, and the memalloc_socks_key
* static branch count. sk_clear_memalloc() reverses each of them.
*/
void sk_set_memalloc ( struct sock * sk )
{
/* Flag the socket and make its allocation mask carry __GFP_MEMALLOC. */
sock_set_flag ( sk , SOCK_MEMALLOC ) ;
sk - > sk_allocation | = __GFP_MEMALLOC ;
2018-05-08 19:06:59 +03:00
/* Account for this socket in the memalloc_socks_key static branch. */
static_branch_inc ( & memalloc_socks_key ) ;
2012-08-01 03:44:16 +04:00
}
EXPORT_SYMBOL_GPL ( sk_set_memalloc ) ;
void sk_clear_memalloc ( struct sock * sk )
{
sock_reset_flag ( sk , SOCK_MEMALLOC ) ;
sk - > sk_allocation & = ~ __GFP_MEMALLOC ;
2018-05-08 19:06:59 +03:00
static_branch_dec ( & memalloc_socks_key ) ;
netvm: prevent a stream-specific deadlock
This patch series is based on top of "Swap-over-NBD without deadlocking
v15" as it depends on the same reservation of PF_MEMALLOC reserves logic.
When a user or administrator requires swap for their application, they
create a swap partition and file, format it with mkswap and activate it
with swapon. In diskless systems this is not an option so if swap if
required then swapping over the network is considered. The two likely
scenarios are when blade servers are used as part of a cluster where the
form factor or maintenance costs do not allow the use of disks and thin
clients.
The Linux Terminal Server Project recommends the use of the Network Block
Device (NBD) for swap but this is not always an option. There is no
guarantee that the network attached storage (NAS) device is running Linux
or supports NBD. However, it is likely that it supports NFS so there are
users that want support for swapping over NFS despite any performance
concern. Some distributions currently carry patches that support swapping
over NFS but it would be preferable to support it in the mainline kernel.
Patch 1 avoids a stream-specific deadlock that potentially affects TCP.
Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
reserves.
Patch 3 adds three helpers for filesystems to handle swap cache pages.
For example, page_file_mapping() returns page->mapping for
file-backed pages and the address_space of the underlying
swap file for swap cache pages.
Patch 4 adds two address_space_operations to allow a filesystem
to pin all metadata relevant to a swapfile in memory. Upon
successful activation, the swapfile is marked SWP_FILE and
the address space operation ->direct_IO is used for writing
and ->readpage for reading in swap pages.
Patch 5 notes that patch 3 is bolting
filesystem-specific-swapfile-support onto the side and that
the default handlers have different information to what
is available to the filesystem. This patch refactors the
code so that there are generic handlers for each of the new
address_space operations.
Patch 6 adds an API to allow a vector of kernel addresses to be
translated to struct pages and pinned for IO.
Patch 7 adds support for using highmem pages for swap by kmapping
the pages before calling the direct_IO handler.
Patch 8 updates NFS to use the helpers from patch 3 where necessary.
Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.
Patch 10 implements the new swapfile-related address_space operations
for NFS and teaches the direct IO handler how to manage
kernel addresses.
Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
where appropriate.
Patch 12 fixes a NULL pointer dereference that occurs when using
swap-over-NFS.
With the patches applied, it is possible to mount a swapfile that is on an
NFS filesystem. Swap performance is not great with a swap stress test
taking roughly twice as long to complete than if the swap device was
backed by NBD.
This patch: netvm: prevent a stream-specific deadlock
It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit. This will prevent SOCK_MEMALLOC
buffers from receiving data, which will prevent userspace from running,
which is needed to reduce the buffered data.
Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Once
this change it applied, it is important that sockets that set
SOCK_MEMALLOC do not clear the flag until the socket is being torn down.
If this happens, a warning is generated and the tokens reclaimed to avoid
accounting errors until the bug is fixed.
[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 03:44:41 +04:00
/*
* SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
2015-06-11 04:02:04 +03:00
* progress of swapping . SOCK_MEMALLOC may be cleared while
* it has rmem allocations due to the last swapfile being deactivated
* but there is a risk that the socket is unusable due to exceeding
* the rmem limits . Reclaim the reserves and obey rmem limits again .
netvm: prevent a stream-specific deadlock
This patch series is based on top of "Swap-over-NBD without deadlocking
v15" as it depends on the same reservation of PF_MEMALLOC reserves logic.
When a user or administrator requires swap for their application, they
create a swap partition and file, format it with mkswap and activate it
with swapon. In diskless systems this is not an option so if swap if
required then swapping over the network is considered. The two likely
scenarios are when blade servers are used as part of a cluster where the
form factor or maintenance costs do not allow the use of disks and thin
clients.
The Linux Terminal Server Project recommends the use of the Network Block
Device (NBD) for swap but this is not always an option. There is no
guarantee that the network attached storage (NAS) device is running Linux
or supports NBD. However, it is likely that it supports NFS so there are
users that want support for swapping over NFS despite any performance
concern. Some distributions currently carry patches that support swapping
over NFS but it would be preferable to support it in the mainline kernel.
Patch 1 avoids a stream-specific deadlock that potentially affects TCP.
Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
reserves.
Patch 3 adds three helpers for filesystems to handle swap cache pages.
For example, page_file_mapping() returns page->mapping for
file-backed pages and the address_space of the underlying
swap file for swap cache pages.
Patch 4 adds two address_space_operations to allow a filesystem
to pin all metadata relevant to a swapfile in memory. Upon
successful activation, the swapfile is marked SWP_FILE and
the address space operation ->direct_IO is used for writing
and ->readpage for reading in swap pages.
Patch 5 notes that patch 3 is bolting
filesystem-specific-swapfile-support onto the side and that
the default handlers have different information to what
is available to the filesystem. This patch refactors the
code so that there are generic handlers for each of the new
address_space operations.
Patch 6 adds an API to allow a vector of kernel addresses to be
translated to struct pages and pinned for IO.
Patch 7 adds support for using highmem pages for swap by kmapping
the pages before calling the direct_IO handler.
Patch 8 updates NFS to use the helpers from patch 3 where necessary.
Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.
Patch 10 implements the new swapfile-related address_space operations
for NFS and teaches the direct IO handler how to manage
kernel addresses.
Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
where appropriate.
Patch 12 fixes a NULL pointer dereference that occurs when using
swap-over-NFS.
With the patches applied, it is possible to mount a swapfile that is on an
NFS filesystem. Swap performance is not great with a swap stress test
taking roughly twice as long to complete than if the swap device was
backed by NBD.
This patch: netvm: prevent a stream-specific deadlock
It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit. This will prevent SOCK_MEMALLOC
buffers from receiving data, which will prevent userspace from running,
which is needed to reduce the buffered data.
Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Once
this change it applied, it is important that sockets that set
SOCK_MEMALLOC do not clear the flag until the socket is being torn down.
If this happens, a warning is generated and the tokens reclaimed to avoid
accounting errors until the bug is fixed.
[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 03:44:41 +04:00
*/
2015-06-11 04:02:04 +03:00
sk_mem_reclaim ( sk ) ;
2012-08-01 03:44:16 +04:00
}
EXPORT_SYMBOL_GPL ( sk_clear_memalloc ) ;
2012-08-01 03:44:26 +04:00
int __sk_backlog_rcv ( struct sock * sk , struct sk_buff * skb )
{
int ret ;
2017-05-09 01:59:53 +03:00
unsigned int noreclaim_flag ;
2012-08-01 03:44:26 +04:00
/* these should have been dropped before queueing */
BUG_ON ( ! sock_flag ( sk , SOCK_MEMALLOC ) ) ;
2017-05-09 01:59:53 +03:00
noreclaim_flag = memalloc_noreclaim_save ( ) ;
2021-11-15 22:02:41 +03:00
ret = INDIRECT_CALL_INET ( sk - > sk_backlog_rcv ,
tcp_v6_do_rcv ,
tcp_v4_do_rcv ,
sk , skb ) ;
2017-05-09 01:59:53 +03:00
memalloc_noreclaim_restore ( noreclaim_flag ) ;
2012-08-01 03:44:26 +04:00
return ret ;
}
EXPORT_SYMBOL ( __sk_backlog_rcv ) ;
2021-06-28 01:48:21 +03:00
void sk_error_report ( struct sock * sk )
{
sk - > sk_error_report ( sk ) ;
2021-06-28 01:48:22 +03:00
switch ( sk - > sk_family ) {
case AF_INET :
fallthrough ;
case AF_INET6 :
trace_inet_sk_error_report ( sk ) ;
break ;
default :
break ;
}
2021-06-28 01:48:21 +03:00
}
EXPORT_SYMBOL ( sk_error_report ) ;
2021-10-08 13:00:53 +03:00
int sock_get_timeout ( long timeo , void * optval , bool old_timeval )
2019-02-02 18:34:44 +03:00
{
2019-02-02 18:34:54 +03:00
struct __kernel_sock_timeval tv ;
2019-02-02 18:34:44 +03:00
if ( timeo = = MAX_SCHEDULE_TIMEOUT ) {
tv . tv_sec = 0 ;
tv . tv_usec = 0 ;
} else {
tv . tv_sec = timeo / HZ ;
tv . tv_usec = ( ( timeo % HZ ) * USEC_PER_SEC ) / HZ ;
}
2019-04-16 23:31:14 +03:00
if ( old_timeval & & in_compat_syscall ( ) & & ! COMPAT_USE_64BIT_TIME ) {
2019-02-02 18:34:44 +03:00
struct old_timeval32 tv32 = { tv . tv_sec , tv . tv_usec } ;
* ( struct old_timeval32 * ) optval = tv32 ;
return sizeof ( tv32 ) ;
}
2019-02-02 18:34:54 +03:00
if ( old_timeval ) {
struct __kernel_old_timeval old_tv ;
old_tv . tv_sec = tv . tv_sec ;
old_tv . tv_usec = tv . tv_usec ;
* ( struct __kernel_old_timeval * ) optval = old_tv ;
2019-10-10 07:08:24 +03:00
return sizeof ( old_tv ) ;
2019-02-02 18:34:54 +03:00
}
2019-10-10 07:08:24 +03:00
* ( struct __kernel_sock_timeval * ) optval = tv ;
return sizeof ( tv ) ;
2019-02-02 18:34:44 +03:00
}
2021-10-08 13:00:53 +03:00
EXPORT_SYMBOL ( sock_get_timeout ) ;
2019-02-02 18:34:44 +03:00
2021-10-08 13:00:53 +03:00
/*
 * Fetch a timeval from @optval into @tv, accepting one of three layouts.
 *
 * @tv:          result, always in struct __kernel_sock_timeval form
 * @optval:      source buffer
 * @optlen:      length of the source buffer
 * @old_timeval: false reads a struct __kernel_sock_timeval directly; true
 *               reads a struct __kernel_old_timeval, or a struct
 *               old_timeval32 when running in a compat syscall without
 *               64-bit compat time
 *
 * Returns 0 on success, -EINVAL if @optlen is smaller than the selected
 * layout, -EFAULT if copy_from_sockptr() fails.
 */
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
			   sockptr_t optval, int optlen, bool old_timeval)
{
	if (!old_timeval) {
		if (optlen < sizeof(*tv))
			return -EINVAL;
		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
			return -EFAULT;
		return 0;
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 compat_tv;

		if (optlen < sizeof(compat_tv))
			return -EINVAL;
		if (copy_from_sockptr(&compat_tv, optval, sizeof(compat_tv)))
			return -EFAULT;
		tv->tv_sec = compat_tv.tv_sec;
		tv->tv_usec = compat_tv.tv_usec;
	} else {
		struct __kernel_old_timeval legacy_tv;

		if (optlen < sizeof(legacy_tv))
			return -EINVAL;
		if (copy_from_sockptr(&legacy_tv, optval, sizeof(legacy_tv)))
			return -EFAULT;
		tv->tv_sec = legacy_tv.tv_sec;
		tv->tv_usec = legacy_tv.tv_usec;
	}

	return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);
/*
 * Parse a timeval supplied through @optval and store the equivalent
 * timeout, in ticks, at @timeo_p.
 *
 * @timeo_p:     where the resulting timeout is written
 * @optval:      source buffer holding the timeval
 * @optlen:      length of the source buffer
 * @old_timeval: layout selector, passed through to sock_copy_user_timeval()
 *
 * Results written to *@timeo_p:
 *   - negative seconds: 0, with a log message on at most ten occasions
 *     (further limited by net_ratelimit());
 *   - 0s/0us, or seconds at or beyond MAX_SCHEDULE_TIMEOUT / HZ - 1:
 *     MAX_SCHEDULE_TIMEOUT;
 *   - otherwise seconds * HZ plus the microseconds rounded up to a tick.
 *
 * Returns 0 on success, the error from sock_copy_user_timeval(), or -EDOM
 * if the microseconds field is outside [0, USEC_PER_SEC).
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int err;

	err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	if (err)
		return err;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			/* one line per message: no padding around the text or the newline */
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}

	/* Start from "no timeout"; only a representable, non-zero value replaces it. */
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ +
			   DIV_ROUND_UP((unsigned long)tv.tv_usec,
					USEC_PER_SEC / HZ);
	return 0;
}
2015-10-26 15:51:37 +03:00
static bool sock_needs_netstamp ( const struct sock * sk )
{
switch ( sk - > sk_family ) {
case AF_UNSPEC :
case AF_UNIX :
return false ;
default :
return true ;
}
}
2011-11-28 16:04:18 +04:00
static void sock_disable_timestamp ( struct sock * sk , unsigned long flags )
2007-02-09 17:24:36 +03:00
{
2011-11-28 16:04:18 +04:00
if ( sk - > sk_flags & flags ) {
sk - > sk_flags & = ~ flags ;
2015-10-26 15:51:37 +03:00
if ( sock_needs_netstamp ( sk ) & &
! ( sk - > sk_flags & SK_FLAGS_TIMESTAMP ) )
2009-02-12 08:03:38 +03:00
net_disable_timestamp ( ) ;
2005-04-17 02:20:36 +04:00
}
}
2016-04-05 19:41:15 +03:00
int __sock_queue_rcv_skb ( struct sock * sk , struct sk_buff * skb )
2006-03-28 13:08:21 +04:00
{
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
unsigned long flags ;
struct sk_buff_head * list = & sk - > sk_receive_queue ;
2006-03-28 13:08:21 +04:00
2011-12-21 11:11:44 +04:00
if ( atomic_read ( & sk - > sk_rmem_alloc ) > = sk - > sk_rcvbuf ) {
2009-10-15 07:40:11 +04:00
atomic_inc ( & sk - > sk_drops ) ;
2011-06-17 16:00:03 +04:00
trace_sock_rcvqueue_full ( sk , skb ) ;
2009-10-15 07:40:11 +04:00
return - ENOMEM ;
2006-03-28 13:08:21 +04:00
}
netvm: prevent a stream-specific deadlock
This patch series is based on top of "Swap-over-NBD without deadlocking
v15" as it depends on the same reservation of PF_MEMALLOC reserves logic.
When a user or administrator requires swap for their application, they
create a swap partition and file, format it with mkswap and activate it
with swapon. In diskless systems this is not an option so if swap if
required then swapping over the network is considered. The two likely
scenarios are when blade servers are used as part of a cluster where the
form factor or maintenance costs do not allow the use of disks and thin
clients.
The Linux Terminal Server Project recommends the use of the Network Block
Device (NBD) for swap but this is not always an option. There is no
guarantee that the network attached storage (NAS) device is running Linux
or supports NBD. However, it is likely that it supports NFS so there are
users that want support for swapping over NFS despite any performance
concern. Some distributions currently carry patches that support swapping
over NFS but it would be preferable to support it in the mainline kernel.
Patch 1 avoids a stream-specific deadlock that potentially affects TCP.
Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
reserves.
Patch 3 adds three helpers for filesystems to handle swap cache pages.
For example, page_file_mapping() returns page->mapping for
file-backed pages and the address_space of the underlying
swap file for swap cache pages.
Patch 4 adds two address_space_operations to allow a filesystem
to pin all metadata relevant to a swapfile in memory. Upon
successful activation, the swapfile is marked SWP_FILE and
the address space operation ->direct_IO is used for writing
and ->readpage for reading in swap pages.
Patch 5 notes that patch 3 is bolting
filesystem-specific-swapfile-support onto the side and that
the default handlers have different information to what
is available to the filesystem. This patch refactors the
code so that there are generic handlers for each of the new
address_space operations.
Patch 6 adds an API to allow a vector of kernel addresses to be
translated to struct pages and pinned for IO.
Patch 7 adds support for using highmem pages for swap by kmapping
the pages before calling the direct_IO handler.
Patch 8 updates NFS to use the helpers from patch 3 where necessary.
Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.
Patch 10 implements the new swapfile-related address_space operations
for NFS and teaches the direct IO handler how to manage
kernel addresses.
Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
where appropriate.
Patch 12 fixes a NULL pointer dereference that occurs when using
swap-over-NFS.
With the patches applied, it is possible to mount a swapfile that is on an
NFS filesystem. Swap performance is not great with a swap stress test
taking roughly twice as long to complete than if the swap device was
backed by NBD.
This patch: netvm: prevent a stream-specific deadlock
It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit. This will prevent SOCK_MEMALLOC
buffers from receiving data, which will prevent userspace from running,
which is needed to reduce the buffered data.
Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Once
this change it applied, it is important that sockets that set
SOCK_MEMALLOC do not clear the flag until the socket is being torn down.
If this happens, a warning is generated and the tokens reclaimed to avoid
accounting errors until the bug is fixed.
[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 03:44:41 +04:00
if ( ! sk_rmem_schedule ( sk , skb , skb - > truesize ) ) {
2009-10-15 07:40:11 +04:00
atomic_inc ( & sk - > sk_drops ) ;
return - ENOBUFS ;
2007-12-31 11:11:19 +03:00
}
2006-03-28 13:08:21 +04:00
skb - > dev = NULL ;
skb_set_owner_r ( skb , sk ) ;
2008-12-18 09:11:38 +03:00
2010-05-12 03:19:48 +04:00
/* we escape from rcu protected region, make sure we dont leak
* a norefcounted dst
*/
skb_dst_force ( skb ) ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
spin_lock_irqsave ( & list - > lock , flags ) ;
2015-03-01 15:58:30 +03:00
sock_skb_set_dropcount ( sk , skb ) ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
__skb_queue_tail ( list , skb ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
2006-03-28 13:08:21 +04:00
if ( ! sock_flag ( sk , SOCK_DEAD ) )
2014-04-12 00:15:36 +04:00
sk - > sk_data_ready ( sk ) ;
2009-10-15 07:40:11 +04:00
return 0 ;
2006-03-28 13:08:21 +04:00
}
2016-04-05 19:41:15 +03:00
EXPORT_SYMBOL ( __sock_queue_rcv_skb ) ;
/*
 * Run the socket filter on @skb and, if it passes, queue it with
 * __sock_queue_rcv_skb().
 *
 * Returns the non-zero result of sk_filter() when it rejects the buffer,
 * otherwise the result of __sock_queue_rcv_skb().
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_err = sk_filter(sk, skb);

	return filter_err ? filter_err : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
2016-07-13 01:18:57 +03:00
int __sk_receive_skb ( struct sock * sk , struct sk_buff * skb ,
2016-11-03 03:14:41 +03:00
const int nested , unsigned int trim_cap , bool refcounted )
2006-03-28 13:08:21 +04:00
{
int rc = NET_RX_SUCCESS ;
2016-07-13 01:18:57 +03:00
if ( sk_filter_trim_cap ( sk , skb , trim_cap ) )
2006-03-28 13:08:21 +04:00
goto discard_and_relse ;
skb - > dev = NULL ;
2014-07-22 22:16:51 +04:00
if ( sk_rcvqueues_full ( sk , sk - > sk_rcvbuf ) ) {
2010-04-28 02:13:20 +04:00
atomic_inc ( & sk - > sk_drops ) ;
goto discard_and_relse ;
}
2006-11-16 19:06:06 +03:00
if ( nested )
bh_lock_sock_nested ( sk ) ;
else
bh_lock_sock ( sk ) ;
2006-07-03 11:25:35 +04:00
if ( ! sock_owned_by_user ( sk ) ) {
/*
* trylock + unlock semantics :
*/
mutex_acquire ( & sk - > sk_lock . dep_map , 0 , 1 , _RET_IP_ ) ;
2008-10-08 01:18:42 +04:00
rc = sk_backlog_rcv ( sk , skb ) ;
2006-07-03 11:25:35 +04:00
2019-09-19 19:09:40 +03:00
mutex_release ( & sk - > sk_lock . dep_map , _RET_IP_ ) ;
2019-10-10 01:21:13 +03:00
} else if ( sk_add_backlog ( sk , skb , READ_ONCE ( sk - > sk_rcvbuf ) ) ) {
2010-03-04 21:01:40 +03:00
bh_unlock_sock ( sk ) ;
atomic_inc ( & sk - > sk_drops ) ;
goto discard_and_relse ;
}
2006-03-28 13:08:21 +04:00
bh_unlock_sock ( sk ) ;
out :
2016-11-03 03:14:41 +03:00
if ( refcounted )
sock_put ( sk ) ;
2006-03-28 13:08:21 +04:00
return rc ;
discard_and_relse :
kfree_skb ( skb ) ;
goto out ;
}
2016-07-13 01:18:57 +03:00
EXPORT_SYMBOL ( __sk_receive_skb ) ;
2006-03-28 13:08:21 +04:00
2021-02-01 20:41:32 +03:00
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));

/*
 * Validate the dst obtained from __sk_dst_get() against @cookie.
 *
 * A missing dst, or one that is not marked obsolete, is returned as is.
 * An obsolete dst is passed to its ops->check handler; if that returns
 * NULL the socket's tx queue mapping, pending-confirm flag and cached dst
 * pointer are cleared, the dst is released and NULL is returned.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (!dst || !dst->obsolete)
		return dst;

	if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) != NULL)
		return dst;

	/* the check handler rejected it: forget the cached route */
	sk_tx_queue_clear(sk);
	sk->sk_dst_pending_confirm = 0;
	RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
	dst_release(dst);
	return NULL;
}
EXPORT_SYMBOL(__sk_dst_check);
/*
 * Validate the dst obtained from sk_dst_get() against @cookie.
 *
 * A missing dst, or one that is not marked obsolete, is returned as is.
 * An obsolete dst is passed to its ops->check handler; if that returns
 * NULL the socket's dst is reset, the dst obtained here is released and
 * NULL is returned.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (!dst || !dst->obsolete)
		return dst;

	if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) != NULL)
		return dst;

	sk_dst_reset(sk);
	dst_release(dst);
	return NULL;
}
EXPORT_SYMBOL(sk_dst_check);
2020-05-28 08:12:13 +03:00
/*
 * Bind @sk to the interface with index @ifindex.  sock_bindtoindex() is
 * the wrapper that takes the socket lock around this call when asked to.
 *
 * A socket that is already bound to a device may only be re-bound by a
 * caller holding CAP_NET_RAW in the user namespace of the socket's netns.
 * On success the protocol's rehash hook, if any, is called and the
 * socket's dst is reset.
 *
 * Returns 0 on success, -EPERM when re-binding without the capability,
 * -EINVAL for a negative @ifindex, and -ENOPROTOOPT when the kernel is
 * built without CONFIG_NETDEVICES.
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;

	if (ifindex < 0)
		return -EINVAL;

	sk->sk_bound_dev_if = ifindex;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	return 0;
#else
	return -ENOPROTOOPT;
#endif
}
2020-05-31 00:09:00 +03:00
/*
 * sock_bindtoindex - bind a socket to the interface with the given index
 * @sk: socket to bind
 * @ifindex: interface index, passed through to sock_bindtoindex_locked()
 * @lock_sk: when true, the socket lock is taken around the call
 *
 * Thin wrapper that optionally takes the socket lock and delegates to
 * sock_bindtoindex_locked(). Returns that helper's result unchanged.
 */
int sock_bindtoindex ( struct sock * sk , int ifindex , bool lock_sk )
2020-05-28 08:12:13 +03:00
{
int ret ;
2020-05-31 00:09:00 +03:00
/* NOTE(review): lock_sk == false is presumably for callers that already
 * hold the socket lock - confirm against the call sites.
 */
if ( lock_sk )
lock_sock ( sk ) ;
2020-05-28 08:12:13 +03:00
ret = sock_bindtoindex_locked ( sk , ifindex ) ;
2020-05-31 00:09:00 +03:00
if ( lock_sk )
release_sock ( sk ) ;
2020-05-28 08:12:13 +03:00
return ret ;
}
EXPORT_SYMBOL ( sock_bindtoindex ) ;
2020-07-23 09:08:48 +03:00
/*
 * sock_setbindtodevice - bind a socket to a device given by name
 * @sk: socket to bind
 * @optval: option buffer holding the interface name
 * @optlen: length of @optval in bytes
 *
 * Copies the interface name from @optval, resolves it to an interface index
 * in the socket's network namespace and binds via sock_bindtoindex() with
 * the socket lock taken there. An empty name resolves to index 0.
 *
 * Returns the result of sock_bindtoindex() on the success path, or:
 *   -ENOPROTOOPT  when CONFIG_NETDEVICES is not defined,
 *   -EINVAL       when @optlen is negative,
 *   -EFAULT       when the name cannot be copied from @optval,
 *   -ENODEV       when no device with that name is found.
 */
static int sock_setbindtodevice ( struct sock * sk , sockptr_t optval , int optlen )
net: introduce SO_BINDTOIFINDEX sockopt
This introduces a new generic SOL_SOCKET-level socket option called
SO_BINDTOIFINDEX. It behaves similar to SO_BINDTODEVICE, but takes a
network interface index as argument, rather than the network interface
name.
User-space often refers to network-interfaces via their index, but has
to temporarily resolve it to a name for a call into SO_BINDTODEVICE.
This might pose problems when the network-device is renamed
asynchronously by other parts of the system. When this happens, the
SO_BINDTODEVICE might either fail, or worse, it might bind to the wrong
device.
In most cases user-space only ever operates on devices which they
either manage themselves, or otherwise have a guarantee that the device
name will not change (e.g., devices that are UP cannot be renamed).
However, particularly in libraries this guarantee is non-obvious and it
would be nice if that race-condition would simply not exist. It would
make it easier for those libraries to operate even in situations where
the device-name might change under the hood.
A real use-case that we recently hit is trying to start the network
stack early in the initrd but make it survive into the real system.
Existing distributions rename network-interfaces during the transition
from initrd into the real system. This, obviously, cannot affect
devices that are up and running (unless you also consider moving them
between network-namespaces). However, the network manager now has to
make sure its management engine for dormant devices will not run in
parallel to these renames. Particularly, when you offload operations
like DHCP into separate processes, these might setup their sockets
early, and thus have to resolve the device-name possibly running into
this race-condition.
By avoiding a call to resolve the device-name, we no longer depend on
the name and can run network setup of dormant devices in parallel to
the transition off the initrd. The SO_BINDTOIFINDEX ioctl plugs this
race.
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-01-15 16:42:14 +03:00
{
int ret = - ENOPROTOOPT ;
# ifdef CONFIG_NETDEVICES
struct net * net = sock_net ( sk ) ;
char devname [ IFNAMSIZ ] ;
int index ;
2007-09-15 03:41:03 +04:00
ret = - EINVAL ;
if ( optlen < 0 )
goto out ;
/* Bind this socket to a particular device like "eth0",
* as specified in the passed interface name . If the
* name is " " or the option length is zero the socket
* is not bound .
*/
/* Over-long input is truncated rather than rejected. */
if ( optlen > IFNAMSIZ - 1 )
optlen = IFNAMSIZ - 1 ;
/* Zero the whole buffer first: at most IFNAMSIZ - 1 bytes are copied, so
 * devname always ends up NUL-terminated.
 */
memset ( devname , 0 , sizeof ( devname ) ) ;
ret = - EFAULT ;
2020-07-23 09:08:48 +03:00
if ( copy_from_sockptr ( devname , optval , optlen ) )
2007-09-15 03:41:03 +04:00
goto out ;
2009-11-06 09:37:11 +03:00
/* index stays 0 when the name is empty. */
index = 0 ;
if ( devname [ 0 ] ! = ' \0 ' ) {
2009-11-06 08:03:39 +03:00
struct net_device * dev ;
2007-09-15 03:41:03 +04:00
2009-11-06 08:03:39 +03:00
/* The device is only dereferenced inside the RCU read section; after
 * the unlock, dev is merely tested for NULL.
 */
rcu_read_lock ( ) ;
dev = dev_get_by_name_rcu ( net , devname ) ;
if ( dev )
index = dev - > ifindex ;
rcu_read_unlock ( ) ;
2007-09-15 03:41:03 +04:00
ret = - ENODEV ;
if ( ! dev )
goto out ;
}
2020-05-31 00:09:00 +03:00
/* true: let sock_bindtoindex() take the socket lock itself. */
return sock_bindtoindex ( sk , index , true ) ;
2007-09-15 03:41:03 +04:00
out :
# endif
return ret ;
}
2012-11-26 09:21:08 +04:00
/*
 * sock_getbindtodevice - report the name of the device a socket is bound to
 * @sk: socket to query
 * @optval: user buffer that receives the NUL-terminated device name
 * @optlen: user pointer that receives the number of bytes written
 * @len: size of @optval as supplied by the caller
 *
 * When the socket is not bound (sk_bound_dev_if == 0) nothing is written to
 * @optval and a length of 0 is stored in @optlen.
 *
 * Returns 0 on success, or:
 *   -ENOPROTOOPT  when CONFIG_NETDEVICES is not defined,
 *   -EINVAL       when the socket is bound and @len is smaller than IFNAMSIZ,
 *   -EFAULT       when writing to @optval or @optlen fails,
 *   or the error returned by netdev_get_name().
 */
static int sock_getbindtodevice ( struct sock * sk , char __user * optval ,
int __user * optlen , int len )
{
int ret = - ENOPROTOOPT ;
# ifdef CONFIG_NETDEVICES
struct net * net = sock_net ( sk ) ;
char devname [ IFNAMSIZ ] ;
/* Unbound socket: skip the name lookup and only report length 0. */
if ( sk - > sk_bound_dev_if = = 0 ) {
len = 0 ;
goto zero ;
}
ret = - EINVAL ;
if ( len < IFNAMSIZ )
goto out ;
2013-06-26 19:23:42 +04:00
ret = netdev_get_name ( net , devname , sk - > sk_bound_dev_if ) ;
if ( ret )
2012-11-26 09:21:08 +04:00
goto out ;
/* Reported length includes the terminating NUL. */
len = strlen ( devname ) + 1 ;
ret = - EFAULT ;
if ( copy_to_user ( optval , devname , len ) )
goto out ;
zero :
ret = - EFAULT ;
if ( put_user ( len , optlen ) )
goto out ;
ret = 0 ;
out :
# endif
return ret ;
}
2015-04-01 18:07:44 +03:00
/*
 * sk_mc_loop - return the socket's per-family mc_loop setting
 * @sk: socket to inspect, may be NULL
 *
 * Returns false whenever dev_recursion_level() is non-zero. Otherwise returns
 * true for a NULL socket, the mc_loop field of the inet/inet6 socket for
 * AF_INET/AF_INET6, and true (after a one-time warning) for any other family.
 *
 * NOTE(review): presumably this decides whether outgoing multicast is looped
 * back to the sender - confirm against the callers.
 */
bool sk_mc_loop ( struct sock * sk )
{
if ( dev_recursion_level ( ) )
return false ;
if ( ! sk )
return true ;
switch ( sk - > sk_family ) {
case AF_INET :
return inet_sk ( sk ) - > mc_loop ;
# if IS_ENABLED(CONFIG_IPV6)
case AF_INET6 :
return inet6_sk ( sk ) - > mc_loop ;
# endif
}
2020-06-18 08:23:25 +03:00
/* Reached only for families not handled above. */
WARN_ON_ONCE ( 1 ) ;
2015-04-01 18:07:44 +03:00
return true ;
}
EXPORT_SYMBOL ( sk_mc_loop ) ;
2020-05-28 08:12:09 +03:00
void sock_set_reuseaddr ( struct sock * sk )
{
lock_sock ( sk ) ;
sk - > sk_reuse = SK_CAN_REUSE ;
release_sock ( sk ) ;
}
EXPORT_SYMBOL ( sock_set_reuseaddr ) ;
2020-05-28 08:12:17 +03:00
void sock_set_reuseport ( struct sock * sk )
{
lock_sock ( sk ) ;
sk - > sk_reuseport = true ;
release_sock ( sk ) ;
}
EXPORT_SYMBOL ( sock_set_reuseport ) ;
2020-05-28 08:12:10 +03:00
void sock_no_linger ( struct sock * sk )
{
lock_sock ( sk ) ;
sk - > sk_lingertime = 0 ;
sock_set_flag ( sk , SOCK_LINGER ) ;
release_sock ( sk ) ;
}
EXPORT_SYMBOL ( sock_no_linger ) ;
2020-05-28 08:12:11 +03:00
/**
 * sock_set_priority - set the priority of a socket
 * @sk: socket to update
 * @priority: value stored in sk->sk_priority
 *
 * The store is performed with the socket lock held.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);
	sk->sk_priority = priority;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
2020-05-28 08:12:12 +03:00
/**
 * sock_set_sndtimeo - set the send timeout of a socket
 * @sk: socket to update
 * @secs: timeout in seconds
 *
 * A zero @secs, or a value that is not below MAX_SCHEDULE_TIMEOUT / HZ - 1,
 * selects MAX_SCHEDULE_TIMEOUT; any other value is converted to jiffies by
 * multiplying with HZ. The result is stored in sk->sk_sndtimeo under the
 * socket lock.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	long timeo = MAX_SCHEDULE_TIMEOUT;

	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		timeo = secs * HZ;

	lock_sock(sk);
	sk->sk_sndtimeo = timeo;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);
2020-05-28 08:12:14 +03:00
/*
 * __sock_set_timestamps - update the receive timestamp flags of a socket
 * @sk: socket to update
 * @val: true to turn receive timestamps on, false to turn them off
 * @new: value for the SOCK_TSTAMP_NEW flag (only applied when @val is true)
 * @ns: value for the SOCK_RCVTSTAMPNS flag (only applied when @val is true)
 *
 * Turning timestamps off clears SOCK_RCVTSTAMP and SOCK_RCVTSTAMPNS and
 * leaves SOCK_TSTAMP_NEW untouched. No locking is done here.
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (!val) {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		return;
	}

	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
	sock_set_flag(sk, SOCK_RCVTSTAMP);
	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
}
/**
 * sock_enable_timestamps - turn on receive timestamps for a socket
 * @sk: socket to update
 *
 * Calls __sock_set_timestamps() under the socket lock with timestamps
 * enabled, SOCK_TSTAMP_NEW cleared and SOCK_RCVTSTAMPNS set.
 */
void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);
2021-06-04 02:24:27 +03:00
/*
 * sock_set_timestamp - apply one of the SO_TIMESTAMP* options to a socket
 * @sk: socket to update
 * @optname: SO_TIMESTAMP_OLD, SO_TIMESTAMP_NEW, SO_TIMESTAMPNS_OLD or
 *           SO_TIMESTAMPNS_NEW; any other value is ignored
 * @valbool: true to enable, false to disable
 *
 * Maps @optname to the (new, ns) flag pair and passes it to
 * __sock_set_timestamps(). No locking is done here.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	bool use_new;
	bool use_ns;

	switch (optname) {
	case SO_TIMESTAMP_OLD:
		use_new = false;
		use_ns = false;
		break;
	case SO_TIMESTAMP_NEW:
		use_new = true;
		use_ns = false;
		break;
	case SO_TIMESTAMPNS_OLD:
		use_new = false;
		use_ns = true;
		break;
	case SO_TIMESTAMPNS_NEW:
		use_new = true;
		use_ns = true;
		break;
	default:
		/* Unknown option names leave the socket unchanged. */
		return;
	}

	__sock_set_timestamps(sk, valbool, use_new, use_ns);
}
2021-06-30 11:11:59 +03:00
static int sock_timestamping_bind_phc ( struct sock * sk , int phc_index )
{
struct net * net = sock_net ( sk ) ;
struct net_device * dev = NULL ;
bool match = false ;
int * vclock_index ;
int i , num ;
if ( sk - > sk_bound_dev_if )
dev = dev_get_by_index ( net , sk - > sk_bound_dev_if ) ;
if ( ! dev ) {
pr_err ( " %s: sock not bind to device \n " , __func__ ) ;
return - EOPNOTSUPP ;
}
num = ethtool_get_phc_vclocks ( dev , & vclock_index ) ;
for ( i = 0 ; i < num ; i + + ) {
if ( * ( vclock_index + i ) = = phc_index ) {
match = true ;
break ;
}
}
if ( num > 0 )
kfree ( vclock_index ) ;
if ( ! match )
return - EINVAL ;
sk - > sk_bind_phc = phc_index ;
return 0 ;
}
/*
 * sock_set_timestamping - apply SO_TIMESTAMPING flags to a socket
 * @sk: socket to update
 * @optname: option name; SOCK_TSTAMP_NEW is set iff it is SO_TIMESTAMPING_NEW
 * @timestamping: requested flags and, for SOF_TIMESTAMPING_BIND_PHC, the PHC
 *                index to bind to
 *
 * Returns 0 on success, or:
 *   -EINVAL  when flags outside SOF_TIMESTAMPING_MASK are set, when
 *            SOF_TIMESTAMPING_OPT_ID is newly requested on a TCP socket in
 *            CLOSE or LISTEN state, or when SOF_TIMESTAMPING_OPT_STATS is
 *            requested without SOF_TIMESTAMPING_OPT_TSONLY,
 *   or the error returned by sock_timestamping_bind_phc().
 */
int sock_set_timestamping ( struct sock * sk , int optname ,
struct so_timestamping timestamping )
2021-06-04 02:24:28 +03:00
{
2021-06-30 11:11:59 +03:00
int val = timestamping . flags ;
int ret ;
2021-06-04 02:24:28 +03:00
if ( val & ~ SOF_TIMESTAMPING_MASK )
return - EINVAL ;
/* Only (re)initialise sk_tskey when OPT_ID goes from off to on. */
if ( val & SOF_TIMESTAMPING_OPT_ID & &
! ( sk - > sk_tsflags & SOF_TIMESTAMPING_OPT_ID ) ) {
2021-11-15 22:02:33 +03:00
if ( sk_is_tcp ( sk ) ) {
2021-06-04 02:24:28 +03:00
if ( ( 1 < < sk - > sk_state ) &
( TCPF_CLOSE | TCPF_LISTEN ) )
return - EINVAL ;
/* TCP starts the key at snd_una; other sockets start at 0. */
sk - > sk_tskey = tcp_sk ( sk ) - > snd_una ;
} else {
sk - > sk_tskey = 0 ;
}
}
if ( val & SOF_TIMESTAMPING_OPT_STATS & &
! ( val & SOF_TIMESTAMPING_OPT_TSONLY ) )
return - EINVAL ;
2021-06-30 11:11:59 +03:00
if ( val & SOF_TIMESTAMPING_BIND_PHC ) {
ret = sock_timestamping_bind_phc ( sk , timestamping . bind_phc ) ;
if ( ret )
return ret ;
}
2021-06-04 02:24:28 +03:00
/* All checks passed: commit the new flag set. */
sk - > sk_tsflags = val ;
sock_valbool_flag ( sk , SOCK_TSTAMP_NEW , optname = = SO_TIMESTAMPING_NEW ) ;
if ( val & SOF_TIMESTAMPING_RX_SOFTWARE )
sock_enable_timestamp ( sk ,
SOCK_TIMESTAMPING_RX_SOFTWARE ) ;
else
sock_disable_timestamp ( sk ,
( 1UL < < SOCK_TIMESTAMPING_RX_SOFTWARE ) ) ;
return 0 ;
}
2020-05-28 08:12:15 +03:00
void sock_set_keepalive ( struct sock * sk )
{
lock_sock ( sk ) ;
if ( sk - > sk_prot - > keepalive )
sk - > sk_prot - > keepalive ( sk , true ) ;
sock_valbool_flag ( sk , SOCK_KEEPOPEN , true ) ;
release_sock ( sk ) ;
}
EXPORT_SYMBOL ( sock_set_keepalive ) ;
2020-05-28 08:12:16 +03:00
static void __sock_set_rcvbuf ( struct sock * sk , int val )
{
/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
* as a negative value .
*/
val = min_t ( int , val , INT_MAX / 2 ) ;
sk - > sk_userlocks | = SOCK_RCVBUF_LOCK ;
/* We double it on the way in to account for "struct sk_buff" etc.
* overhead . Applications assume that the SO_RCVBUF setting they make
* will allow that much actual data to be received on that socket .
*
* Applications are unaware that " struct sk_buff " and other overheads
* allocate from the receive buffer during socket buffer allocation .
*
* And after considering the possible alternatives , returning the value
* we actually used in getsockopt is the most desirable behavior .
*/
WRITE_ONCE ( sk - > sk_rcvbuf , max_t ( int , val * 2 , SOCK_MIN_RCVBUF ) ) ;
}
/**
 * sock_set_rcvbuf - set the receive buffer size of a socket
 * @sk: socket to update
 * @val: requested size, handed to __sock_set_rcvbuf()
 *
 * Locked wrapper around __sock_set_rcvbuf().
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
2021-06-01 00:00:30 +03:00
/*
 * __sock_set_mark - change the mark of a socket
 * @sk: socket to update
 * @val: new value for sk->sk_mark
 *
 * Does nothing when the mark is unchanged; otherwise stores the new mark and
 * calls sk_dst_reset(). No locking is done here.
 */
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val == sk->sk_mark)
		return;

	sk->sk_mark = val;
	sk_dst_reset(sk);
}
2020-06-26 20:26:48 +03:00
/*
 * sock_set_mark - change the mark of a socket
 * @sk: socket to update
 * @val: new mark, handed to __sock_set_mark()
 *
 * Locked wrapper: takes the socket lock around __sock_set_mark().
 */
void sock_set_mark ( struct sock * sk , u32 val )
{
lock_sock ( sk ) ;
2021-06-01 00:00:30 +03:00
__sock_set_mark ( sk , val ) ;
2020-06-26 20:26:48 +03:00
release_sock ( sk ) ;
}
EXPORT_SYMBOL ( sock_set_mark ) ;
2021-09-29 20:25:11 +03:00
static void sock_release_reserved_memory ( struct sock * sk , int bytes )
{
/* Round down bytes to multiple of pages */
bytes & = ~ ( SK_MEM_QUANTUM - 1 ) ;
WARN_ON ( bytes > sk - > sk_reserved_mem ) ;
sk - > sk_reserved_mem - = bytes ;
sk_mem_reclaim ( sk ) ;
}
/*
 * sock_reserve_memory - pre-charge memory for a socket
 * @sk: socket to reserve memory for
 * @bytes: amount to reserve; converted to pages with sk_mem_pages()
 *
 * Charges the pages to the socket's memory cgroup and to the protocol's
 * allocated-memory counter, then credits them to sk->sk_forward_alloc and
 * sk->sk_reserved_mem. Both charges are undone when the protocol limit
 * check fails.
 *
 * Returns 0 on success (including @bytes == 0), or:
 *   -EOPNOTSUPP  when memcg socket accounting is not enabled, the socket has
 *                no memcg, or sk_has_account() is false,
 *   -ENOMEM      when the memcg charge fails or the new total exceeds
 *                sk_prot_mem_limits(sk, 1).
 */
static int sock_reserve_memory ( struct sock * sk , int bytes )
{
long allocated ;
bool charged ;
int pages ;
2021-11-04 02:49:11 +03:00
if ( ! mem_cgroup_sockets_enabled | | ! sk - > sk_memcg | | ! sk_has_account ( sk ) )
2021-09-29 20:25:11 +03:00
return - EOPNOTSUPP ;
if ( ! bytes )
return 0 ;
pages = sk_mem_pages ( bytes ) ;
/* pre-charge to memcg */
charged = mem_cgroup_charge_skmem ( sk - > sk_memcg , pages ,
GFP_KERNEL | __GFP_RETRY_MAYFAIL ) ;
if ( ! charged )
return - ENOMEM ;
/* pre-charge to forward_alloc */
allocated = sk_memory_allocated_add ( sk , pages ) ;
/* If the system goes into memory pressure with this
* precharge , give up and return error .
*/
if ( allocated > sk_prot_mem_limits ( sk , 1 ) ) {
/* Roll back both charges made above. */
sk_memory_allocated_sub ( sk , pages ) ;
mem_cgroup_uncharge_skmem ( sk - > sk_memcg , pages ) ;
return - ENOMEM ;
}
/* Both counters are kept in bytes, hence the shift from pages. */
sk - > sk_forward_alloc + = pages < < SK_MEM_QUANTUM_SHIFT ;
sk - > sk_reserved_mem + = pages < < SK_MEM_QUANTUM_SHIFT ;
return 0 ;
}
2005-04-17 02:20:36 +04:00
/*
* This is meant for all protocols to use and covers goings on
* at the socket level . Everything here is generic .
*/
int sock_setsockopt ( struct socket * sock , int level , int optname ,
2020-07-23 09:08:50 +03:00
sockptr_t optval , unsigned int optlen )
2005-04-17 02:20:36 +04:00
{
2021-06-30 11:11:59 +03:00
struct so_timestamping timestamping ;
2018-07-04 01:42:48 +03:00
struct sock_txtime sk_txtime ;
2009-05-27 15:30:05 +04:00
struct sock * sk = sock - > sk ;
2005-04-17 02:20:36 +04:00
int val ;
int valbool ;
struct linger ling ;
int ret = 0 ;
2007-02-09 17:24:36 +03:00
2005-04-17 02:20:36 +04:00
/*
* Options without arguments
*/
2007-09-15 03:41:03 +04:00
if ( optname = = SO_BINDTODEVICE )
2012-11-26 09:21:08 +04:00
return sock_setbindtodevice ( sk , optval , optlen ) ;
2007-09-15 03:41:03 +04:00
2007-04-11 07:10:33 +04:00
if ( optlen < sizeof ( int ) )
return - EINVAL ;
2007-02-09 17:24:36 +03:00
2020-07-23 09:08:50 +03:00
if ( copy_from_sockptr ( & val , optval , sizeof ( val ) ) )
2005-04-17 02:20:36 +04:00
return - EFAULT ;
2007-02-09 17:24:36 +03:00
2009-05-27 15:30:05 +04:00
valbool = val ? 1 : 0 ;
2005-04-17 02:20:36 +04:00
lock_sock ( sk ) ;
2009-05-27 15:30:05 +04:00
switch ( optname ) {
2007-04-11 07:10:33 +04:00
case SO_DEBUG :
2009-05-27 15:30:05 +04:00
if ( val & & ! capable ( CAP_NET_ADMIN ) )
2007-04-11 07:10:33 +04:00
ret = - EACCES ;
2009-05-27 15:30:05 +04:00
else
2007-11-15 14:03:19 +03:00
sock_valbool_flag ( sk , SOCK_DBG , valbool ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_REUSEADDR :
2018-06-12 20:05:55 +03:00
sk - > sk_reuse = ( valbool ? SK_CAN_REUSE : SK_NO_REUSE ) ;
2007-04-11 07:10:33 +04:00
break ;
2013-01-22 13:49:50 +04:00
case SO_REUSEPORT :
sk - > sk_reuseport = valbool ;
break ;
2007-04-11 07:10:33 +04:00
case SO_TYPE :
2009-08-04 11:28:28 +04:00
case SO_PROTOCOL :
2009-08-04 11:28:29 +04:00
case SO_DOMAIN :
2007-04-11 07:10:33 +04:00
case SO_ERROR :
ret = - ENOPROTOOPT ;
break ;
case SO_DONTROUTE :
2007-11-15 14:03:19 +03:00
sock_valbool_flag ( sk , SOCK_LOCALROUTE , valbool ) ;
net: call sk_dst_reset when set SO_DONTROUTE
After SO_DONTROUTE is set to 1, the IP layer should not route packets if
the dest IP address is not in link scope. But if the socket has cached
the dst_entry, such packets would be routed until the sk_dst_cache
expires. So we should clean the sk_dst_cache when a user sets the
SO_DONTROUTE option. Below are server/client python scripts which
could reproduce this issue:
server side code:
==========================================================================
import socket
import struct
import time
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(('0.0.0.0', 9000))
s.listen(1)
sock, addr = s.accept()
sock.setsockopt(socket.SOL_SOCKET, socket.SO_DONTROUTE, struct.pack('i', 1))
while True:
sock.send(b'foo')
time.sleep(1)
==========================================================================
client side code:
==========================================================================
import socket
import time
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(('server_address', 9000))
while True:
data = s.recv(1024)
print(data)
==========================================================================
Signed-off-by: yupeng <yupeng0921@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-06 05:56:28 +03:00
sk_dst_reset ( sk ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_BROADCAST :
sock_valbool_flag ( sk , SOCK_BROADCAST , valbool ) ;
break ;
case SO_SNDBUF :
/* Don't error on this BSD doesn't and if you think
2012-04-27 00:07:59 +04:00
* about it this is right . Otherwise apps have to
* play ' guess the biggest size ' games . RCVBUF / SNDBUF
* are treated in BSD as hints
*/
val = min_t ( u32 , val , sysctl_wmem_max ) ;
2005-08-10 06:30:51 +04:00
set_sndbuf :
sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values
SO_SNDBUF and SO_RCVBUF (and their *BUFFORCE version) may overflow or
underflow their input value. This patch aims at providing explicit
handling of these extreme cases, to get a clear behaviour even with
values bigger than INT_MAX / 2 or lower than INT_MIN / 2.
For simplicity, only SO_SNDBUF and SO_SNDBUFFORCE are described here,
but the same explanation and fix apply to SO_RCVBUF and SO_RCVBUFFORCE
(with 'SNDBUF' replaced by 'RCVBUF' and 'wmem_max' by 'rmem_max').
Overflow of positive values
===========================
When handling SO_SNDBUF or SO_SNDBUFFORCE, if 'val' exceeds
INT_MAX / 2, the buffer size is set to its minimum value because
'val * 2' overflows, and max_t() considers that it's smaller than
SOCK_MIN_SNDBUF. For SO_SNDBUF, this can only happen with
net.core.wmem_max > INT_MAX / 2.
SO_SNDBUF and SO_SNDBUFFORCE are actually designed to let users probe
for the maximum buffer size by setting an arbitrary large number that
gets capped to the maximum allowed/possible size. Having the upper
half of the positive integer space to potentially reduce the buffer
size to its minimum value defeats this purpose.
This patch caps the base value to INT_MAX / 2, so that bigger values
don't overflow and keep setting the buffer size to its maximum.
Underflow of negative values
============================
For negative numbers, SO_SNDBUF always considers them bigger than
net.core.wmem_max, which is bounded by [SOCK_MIN_SNDBUF, INT_MAX].
Therefore such values are set to net.core.wmem_max and we're back to
the behaviour of positive integers described above (return maximum
buffer size if wmem_max <= INT_MAX / 2, return SOCK_MIN_SNDBUF
otherwise).
However, SO_SNDBUFFORCE behaves differently. The user value is
directly multiplied by two and compared with SOCK_MIN_SNDBUF. If
'val * 2' doesn't underflow or if it underflows to a value smaller
than SOCK_MIN_SNDBUF then buffer size is set to its minimum value.
Otherwise the buffer size is set to the underflowed value.
This patch treats negative values passed to SO_SNDBUFFORCE as null, to
prevent underflows. Therefore negative values now always set the buffer
size to its minimum value.
Even though SO_SNDBUF behaves inconsistently by setting buffer size to
the maximum value when passed a negative number, no attempt is made to
modify this behaviour. There may exist some programs that rely on using
negative numbers to set the maximum buffer size. Avoiding overflows
because of extreme net.core.wmem_max values is the most we can do here.
Summary of altered behaviours
=============================
val : user-space value passed to setsockopt()
val_uf : the underflowed value resulting from doubling val when
val < INT_MIN / 2
wmem_max : short for net.core.wmem_max
val_cap : min(val, wmem_max)
min_len : minimal buffer length (that is, SOCK_MIN_SNDBUF)
max_len : maximal possible buffer length, regardless of wmem_max (that
is, INT_MAX - 1)
^^^^ : altered behaviour
SO_SNDBUF:
+-------------------------+-------------+------------+----------------+
| CONDITION | OLD RESULT | NEW RESULT | COMMENT |
+-------------------------+-------------+------------+----------------+
| val < 0 && | | | No overflow, |
| wmem_max <= INT_MAX/2 | wmem_max*2 | wmem_max*2 | keep original |
| | | | behaviour |
+-------------------------+-------------+------------+----------------+
| val < 0 && | | | Cap wmem_max |
| INT_MAX/2 < wmem_max | min_len | max_len | to prevent |
| | | ^^^^^^^ | overflow |
+-------------------------+-------------+------------+----------------+
| 0 <= val <= min_len/2 | min_len | min_len | Ordinary case |
+-------------------------+-------------+------------+----------------+
| min_len/2 < val && | val_cap*2 | val_cap*2 | Ordinary case |
| val_cap <= INT_MAX/2 | | | |
+-------------------------+-------------+------------+----------------+
| min_len < val && | | | Cap val_cap |
| INT_MAX/2 < val_cap | min_len | max_len | again to |
| (implies that | | ^^^^^^^ | prevent |
| INT_MAX/2 < wmem_max) | | | overflow |
+-------------------------+-------------+------------+----------------+
SO_SNDBUFFORCE:
+------------------------------+---------+---------+------------------+
| CONDITION | BEFORE | AFTER | COMMENT |
| | PATCH | PATCH | |
+------------------------------+---------+---------+------------------+
| val < INT_MIN/2 && | min_len | min_len | Underflow with |
| val_uf <= min_len | | | no consequence |
+------------------------------+---------+---------+------------------+
| val < INT_MIN/2 && | val_uf | min_len | Set val to 0 to |
| val_uf > min_len | | ^^^^^^^ | avoid underflow |
+------------------------------+---------+---------+------------------+
| INT_MIN/2 <= val < 0 | min_len | min_len | No underflow |
+------------------------------+---------+---------+------------------+
| 0 <= val <= min_len/2 | min_len | min_len | Ordinary case |
+------------------------------+---------+---------+------------------+
| min_len/2 < val <= INT_MAX/2 | val*2 | val*2 | Ordinary case |
+------------------------------+---------+---------+------------------+
| INT_MAX/2 < val | min_len | max_len | Cap val to |
| | | ^^^^^^^ | prevent overflow |
+------------------------------+---------+---------+------------------+
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-13 06:30:34 +03:00
/* Ensure val * 2 fits into an int, to prevent max_t()
* from treating it as a negative value .
*/
val = min_t ( int , val , INT_MAX / 2 ) ;
2007-04-11 07:10:33 +04:00
sk - > sk_userlocks | = SOCK_SNDBUF_LOCK ;
2019-10-11 06:17:45 +03:00
WRITE_ONCE ( sk - > sk_sndbuf ,
max_t ( int , val * 2 , SOCK_MIN_SNDBUF ) ) ;
2012-04-27 00:07:59 +04:00
/* Wake up sending tasks if we upped the value. */
2007-04-11 07:10:33 +04:00
sk - > sk_write_space ( sk ) ;
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_SNDBUFFORCE :
if ( ! capable ( CAP_NET_ADMIN ) ) {
ret = - EPERM ;
break ;
}
sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values
SO_SNDBUF and SO_RCVBUF (and their *BUFFORCE version) may overflow or
underflow their input value. This patch aims at providing explicit
handling of these extreme cases, to get a clear behaviour even with
values bigger than INT_MAX / 2 or lower than INT_MIN / 2.
For simplicity, only SO_SNDBUF and SO_SNDBUFFORCE are described here,
but the same explanation and fix apply to SO_RCVBUF and SO_RCVBUFFORCE
(with 'SNDBUF' replaced by 'RCVBUF' and 'wmem_max' by 'rmem_max').
Overflow of positive values
===========================
When handling SO_SNDBUF or SO_SNDBUFFORCE, if 'val' exceeds
INT_MAX / 2, the buffer size is set to its minimum value because
'val * 2' overflows, and max_t() considers that it's smaller than
SOCK_MIN_SNDBUF. For SO_SNDBUF, this can only happen with
net.core.wmem_max > INT_MAX / 2.
SO_SNDBUF and SO_SNDBUFFORCE are actually designed to let users probe
for the maximum buffer size by setting an arbitrary large number that
gets capped to the maximum allowed/possible size. Having the upper
half of the positive integer space to potentially reduce the buffer
size to its minimum value defeats this purpose.
This patch caps the base value to INT_MAX / 2, so that bigger values
don't overflow and keep setting the buffer size to its maximum.
Underflow of negative values
============================
For negative numbers, SO_SNDBUF always considers them bigger than
net.core.wmem_max, which is bounded by [SOCK_MIN_SNDBUF, INT_MAX].
Therefore such values are set to net.core.wmem_max and we're back to
the behaviour of positive integers described above (return maximum
buffer size if wmem_max <= INT_MAX / 2, return SOCK_MIN_SNDBUF
otherwise).
However, SO_SNDBUFFORCE behaves differently. The user value is
directly multiplied by two and compared with SOCK_MIN_SNDBUF. If
'val * 2' doesn't underflow or if it underflows to a value smaller
than SOCK_MIN_SNDBUF then buffer size is set to its minimum value.
Otherwise the buffer size is set to the underflowed value.
This patch treats negative values passed to SO_SNDBUFFORCE as null, to
prevent underflows. Therefore negative values now always set the buffer
size to its minimum value.
Even though SO_SNDBUF behaves inconsistently by setting buffer size to
the maximum value when passed a negative number, no attempt is made to
modify this behaviour. There may exist some programs that rely on using
negative numbers to set the maximum buffer size. Avoiding overflows
because of extreme net.core.wmem_max values is the most we can do here.
Summary of altered behaviours
=============================
val : user-space value passed to setsockopt()
val_uf : the underflowed value resulting from doubling val when
val < INT_MIN / 2
wmem_max : short for net.core.wmem_max
val_cap : min(val, wmem_max)
min_len : minimal buffer length (that is, SOCK_MIN_SNDBUF)
max_len : maximal possible buffer length, regardless of wmem_max (that
is, INT_MAX - 1)
^^^^ : altered behaviour
SO_SNDBUF:
+-------------------------+-------------+------------+----------------+
| CONDITION | OLD RESULT | NEW RESULT | COMMENT |
+-------------------------+-------------+------------+----------------+
| val < 0 && | | | No overflow, |
| wmem_max <= INT_MAX/2 | wmem_max*2 | wmem_max*2 | keep original |
| | | | behaviour |
+-------------------------+-------------+------------+----------------+
| val < 0 && | | | Cap wmem_max |
| INT_MAX/2 < wmem_max | min_len | max_len | to prevent |
| | | ^^^^^^^ | overflow |
+-------------------------+-------------+------------+----------------+
| 0 <= val <= min_len/2 | min_len | min_len | Ordinary case |
+-------------------------+-------------+------------+----------------+
| min_len/2 < val && | val_cap*2 | val_cap*2 | Ordinary case |
| val_cap <= INT_MAX/2 | | | |
+-------------------------+-------------+------------+----------------+
| min_len < val && | | | Cap val_cap |
| INT_MAX/2 < val_cap | min_len | max_len | again to |
| (implies that | | ^^^^^^^ | prevent |
| INT_MAX/2 < wmem_max) | | | overflow |
+-------------------------+-------------+------------+----------------+
SO_SNDBUFFORCE:
+------------------------------+---------+---------+------------------+
| CONDITION | BEFORE | AFTER | COMMENT |
| | PATCH | PATCH | |
+------------------------------+---------+---------+------------------+
| val < INT_MIN/2 && | min_len | min_len | Underflow with |
| val_uf <= min_len | | | no consequence |
+------------------------------+---------+---------+------------------+
| val < INT_MIN/2 && | val_uf | min_len | Set val to 0 to |
| val_uf > min_len | | ^^^^^^^ | avoid underflow |
+------------------------------+---------+---------+------------------+
| INT_MIN/2 <= val < 0 | min_len | min_len | No underflow |
+------------------------------+---------+---------+------------------+
| 0 <= val <= min_len/2 | min_len | min_len | Ordinary case |
+------------------------------+---------+---------+------------------+
| min_len/2 < val <= INT_MAX/2 | val*2 | val*2 | Ordinary case |
+------------------------------+---------+---------+------------------+
| INT_MAX/2 < val | min_len | max_len | Cap val to |
| | | ^^^^^^^ | prevent overflow |
+------------------------------+---------+---------+------------------+
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-13 06:30:34 +03:00
/* No negative values (to prevent underflow, as val will be
* multiplied by 2 ) .
*/
if ( val < 0 )
val = 0 ;
2007-04-11 07:10:33 +04:00
goto set_sndbuf ;
2005-08-10 06:30:51 +04:00
2007-04-11 07:10:33 +04:00
case SO_RCVBUF :
/* Don't error on this BSD doesn't and if you think
2012-04-27 00:07:59 +04:00
* about it this is right . Otherwise apps have to
* play ' guess the biggest size ' games . RCVBUF / SNDBUF
* are treated in BSD as hints
*/
2020-05-28 08:12:16 +03:00
__sock_set_rcvbuf ( sk , min_t ( u32 , val , sysctl_rmem_max ) ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_RCVBUFFORCE :
if ( ! capable ( CAP_NET_ADMIN ) ) {
ret = - EPERM ;
2005-04-17 02:20:36 +04:00
break ;
2007-04-11 07:10:33 +04:00
}
sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values
SO_SNDBUF and SO_RCVBUF (and their *BUFFORCE version) may overflow or
underflow their input value. This patch aims at providing explicit
handling of these extreme cases, to get a clear behaviour even with
values bigger than INT_MAX / 2 or lower than INT_MIN / 2.
For simplicity, only SO_SNDBUF and SO_SNDBUFFORCE are described here,
but the same explanation and fix apply to SO_RCVBUF and SO_RCVBUFFORCE
(with 'SNDBUF' replaced by 'RCVBUF' and 'wmem_max' by 'rmem_max').
Overflow of positive values
===========================
When handling SO_SNDBUF or SO_SNDBUFFORCE, if 'val' exceeds
INT_MAX / 2, the buffer size is set to its minimum value because
'val * 2' overflows, and max_t() considers that it's smaller than
SOCK_MIN_SNDBUF. For SO_SNDBUF, this can only happen with
net.core.wmem_max > INT_MAX / 2.
SO_SNDBUF and SO_SNDBUFFORCE are actually designed to let users probe
for the maximum buffer size by setting an arbitrary large number that
gets capped to the maximum allowed/possible size. Having the upper
half of the positive integer space to potentially reduce the buffer
size to its minimum value defeats this purpose.
This patch caps the base value to INT_MAX / 2, so that bigger values
don't overflow and keep setting the buffer size to its maximum.
Underflow of negative values
============================
For negative numbers, SO_SNDBUF always considers them bigger than
net.core.wmem_max, which is bounded by [SOCK_MIN_SNDBUF, INT_MAX].
Therefore such values are set to net.core.wmem_max and we're back to
the behaviour of positive integers described above (return maximum
buffer size if wmem_max <= INT_MAX / 2, return SOCK_MIN_SNDBUF
otherwise).
However, SO_SNDBUFFORCE behaves differently. The user value is
directly multiplied by two and compared with SOCK_MIN_SNDBUF. If
'val * 2' doesn't underflow or if it underflows to a value smaller
than SOCK_MIN_SNDBUF then buffer size is set to its minimum value.
Otherwise the buffer size is set to the underflowed value.
This patch treats negative values passed to SO_SNDBUFFORCE as null, to
prevent underflows. Therefore negative values now always set the buffer
size to its minimum value.
Even though SO_SNDBUF behaves inconsistently by setting buffer size to
the maximum value when passed a negative number, no attempt is made to
modify this behaviour. There may exist some programs that rely on using
negative numbers to set the maximum buffer size. Avoiding overflows
because of extreme net.core.wmem_max values is the most we can do here.
Summary of altered behaviours
=============================
val : user-space value passed to setsockopt()
val_uf : the underflowed value resulting from doubling val when
val < INT_MIN / 2
wmem_max : short for net.core.wmem_max
val_cap : min(val, wmem_max)
min_len : minimal buffer length (that is, SOCK_MIN_SNDBUF)
max_len : maximal possible buffer length, regardless of wmem_max (that
is, INT_MAX - 1)
^^^^ : altered behaviour
SO_SNDBUF:
+-------------------------+-------------+------------+----------------+
| CONDITION | OLD RESULT | NEW RESULT | COMMENT |
+-------------------------+-------------+------------+----------------+
| val < 0 && | | | No overflow, |
| wmem_max <= INT_MAX/2 | wmem_max*2 | wmem_max*2 | keep original |
| | | | behaviour |
+-------------------------+-------------+------------+----------------+
| val < 0 && | | | Cap wmem_max |
| INT_MAX/2 < wmem_max | min_len | max_len | to prevent |
| | | ^^^^^^^ | overflow |
+-------------------------+-------------+------------+----------------+
| 0 <= val <= min_len/2 | min_len | min_len | Ordinary case |
+-------------------------+-------------+------------+----------------+
| min_len/2 < val && | val_cap*2 | val_cap*2 | Ordinary case |
| val_cap <= INT_MAX/2 | | | |
+-------------------------+-------------+------------+----------------+
| min_len < val && | | | Cap val_cap |
| INT_MAX/2 < val_cap | min_len | max_len | again to |
| (implies that | | ^^^^^^^ | prevent |
| INT_MAX/2 < wmem_max) | | | overflow |
+-------------------------+-------------+------------+----------------+
SO_SNDBUFFORCE:
+------------------------------+---------+---------+------------------+
| CONDITION | BEFORE | AFTER | COMMENT |
| | PATCH | PATCH | |
+------------------------------+---------+---------+------------------+
| val < INT_MIN/2 && | min_len | min_len | Underflow with |
| val_uf <= min_len | | | no consequence |
+------------------------------+---------+---------+------------------+
| val < INT_MIN/2 && | val_uf | min_len | Set val to 0 to |
| val_uf > min_len | | ^^^^^^^ | avoid underflow |
+------------------------------+---------+---------+------------------+
| INT_MIN/2 <= val < 0 | min_len | min_len | No underflow |
+------------------------------+---------+---------+------------------+
| 0 <= val <= min_len/2 | min_len | min_len | Ordinary case |
+------------------------------+---------+---------+------------------+
| min_len/2 < val <= INT_MAX/2 | val*2 | val*2 | Ordinary case |
+------------------------------+---------+---------+------------------+
| INT_MAX/2 < val | min_len | max_len | Cap val to |
| | | ^^^^^^^ | prevent overflow |
+------------------------------+---------+---------+------------------+
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-13 06:30:34 +03:00
/* No negative values (to prevent underflow, as val will be
* multiplied by 2 ) .
*/
2020-05-28 08:12:16 +03:00
__sock_set_rcvbuf ( sk , max ( val , 0 ) ) ;
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_KEEPALIVE :
2017-01-09 18:55:12 +03:00
if ( sk - > sk_prot - > keepalive )
sk - > sk_prot - > keepalive ( sk , valbool ) ;
2007-04-11 07:10:33 +04:00
sock_valbool_flag ( sk , SOCK_KEEPOPEN , valbool ) ;
break ;
case SO_OOBINLINE :
sock_valbool_flag ( sk , SOCK_URGINLINE , valbool ) ;
break ;
case SO_NO_CHECK :
2014-05-23 19:47:19 +04:00
sk - > sk_no_check_tx = valbool ;
2007-04-11 07:10:33 +04:00
break ;
case SO_PRIORITY :
2012-11-16 07:03:04 +04:00
if ( ( val > = 0 & & val < = 6 ) | |
2021-11-23 23:37:02 +03:00
ns_capable ( sock_net ( sk ) - > user_ns , CAP_NET_RAW ) | |
2012-11-16 07:03:04 +04:00
ns_capable ( sock_net ( sk ) - > user_ns , CAP_NET_ADMIN ) )
2007-04-11 07:10:33 +04:00
sk - > sk_priority = val ;
else
ret = - EPERM ;
break ;
case SO_LINGER :
if ( optlen < sizeof ( ling ) ) {
ret = - EINVAL ; /* 1003.1g */
2005-04-17 02:20:36 +04:00
break ;
2007-04-11 07:10:33 +04:00
}
2020-07-23 09:08:50 +03:00
if ( copy_from_sockptr ( & ling , optval , sizeof ( ling ) ) ) {
2007-04-11 07:10:33 +04:00
ret = - EFAULT ;
2005-04-17 02:20:36 +04:00
break ;
2007-04-11 07:10:33 +04:00
}
if ( ! ling . l_onoff )
sock_reset_flag ( sk , SOCK_LINGER ) ;
else {
2005-04-17 02:20:36 +04:00
# if (BITS_PER_LONG == 32)
2007-04-11 07:10:33 +04:00
if ( ( unsigned int ) ling . l_linger > = MAX_SCHEDULE_TIMEOUT / HZ )
sk - > sk_lingertime = MAX_SCHEDULE_TIMEOUT ;
2005-04-17 02:20:36 +04:00
else
2007-04-11 07:10:33 +04:00
# endif
sk - > sk_lingertime = ( unsigned int ) ling . l_linger * HZ ;
sock_set_flag ( sk , SOCK_LINGER ) ;
}
break ;
case SO_BSDCOMPAT :
break ;
case SO_PASSCRED :
if ( valbool )
set_bit ( SOCK_PASSCRED , & sock - > flags ) ;
else
clear_bit ( SOCK_PASSCRED , & sock - > flags ) ;
break ;
2019-02-02 18:34:46 +03:00
case SO_TIMESTAMP_OLD :
2019-02-02 18:34:50 +03:00
case SO_TIMESTAMP_NEW :
2019-02-02 18:34:46 +03:00
case SO_TIMESTAMPNS_OLD :
2019-02-02 18:34:50 +03:00
case SO_TIMESTAMPNS_NEW :
2021-07-02 17:41:01 +03:00
sock_set_timestamp ( sk , optname , valbool ) ;
2007-04-11 07:10:33 +04:00
break ;
2021-06-04 02:24:28 +03:00
2019-02-02 18:34:51 +03:00
case SO_TIMESTAMPING_NEW :
2019-02-02 18:34:46 +03:00
case SO_TIMESTAMPING_OLD :
2021-06-30 11:11:59 +03:00
if ( optlen = = sizeof ( timestamping ) ) {
if ( copy_from_sockptr ( & timestamping , optval ,
2021-07-07 13:01:00 +03:00
sizeof ( timestamping ) ) ) {
ret = - EFAULT ;
break ;
}
2021-06-30 11:11:59 +03:00
} else {
memset ( & timestamping , 0 , sizeof ( timestamping ) ) ;
timestamping . flags = val ;
}
ret = sock_set_timestamping ( sk , optname , timestamping ) ;
2009-02-12 08:03:38 +03:00
break ;
2007-04-11 07:10:33 +04:00
case SO_RCVLOWAT :
if ( val < 0 )
val = INT_MAX ;
2018-04-16 20:33:35 +03:00
if ( sock - > ops - > set_rcvlowat )
ret = sock - > ops - > set_rcvlowat ( sk , val ) ;
else
2019-10-10 01:32:35 +03:00
WRITE_ONCE ( sk - > sk_rcvlowat , val ? : 1 ) ;
2007-04-11 07:10:33 +04:00
break ;
2019-02-02 18:34:53 +03:00
case SO_RCVTIMEO_OLD :
2019-02-02 18:34:54 +03:00
case SO_RCVTIMEO_NEW :
2020-07-23 09:08:50 +03:00
ret = sock_set_timeout ( & sk - > sk_rcvtimeo , optval ,
2020-07-23 09:08:49 +03:00
optlen , optname = = SO_RCVTIMEO_OLD ) ;
2007-04-11 07:10:33 +04:00
break ;
2019-02-02 18:34:53 +03:00
case SO_SNDTIMEO_OLD :
2019-02-02 18:34:54 +03:00
case SO_SNDTIMEO_NEW :
2020-07-23 09:08:50 +03:00
ret = sock_set_timeout ( & sk - > sk_sndtimeo , optval ,
2020-07-23 09:08:49 +03:00
optlen , optname = = SO_SNDTIMEO_OLD ) ;
2007-04-11 07:10:33 +04:00
break ;
2005-04-17 02:20:36 +04:00
2020-07-17 09:23:13 +03:00
case SO_ATTACH_FILTER : {
struct sock_fprog fprog ;
2007-04-11 07:10:33 +04:00
2020-07-23 09:08:50 +03:00
ret = copy_bpf_fprog_from_user ( & fprog , optval , optlen ) ;
2020-07-17 09:23:13 +03:00
if ( ! ret )
2007-04-11 07:10:33 +04:00
ret = sk_attach_filter ( & fprog , sk ) ;
break ;
2020-07-17 09:23:13 +03:00
}
2014-12-02 02:06:35 +03:00
case SO_ATTACH_BPF :
ret = - EINVAL ;
if ( optlen = = sizeof ( u32 ) ) {
u32 ufd ;
ret = - EFAULT ;
2020-07-23 09:08:50 +03:00
if ( copy_from_sockptr ( & ufd , optval , sizeof ( ufd ) ) )
2014-12-02 02:06:35 +03:00
break ;
ret = sk_attach_bpf ( ufd , sk ) ;
}
break ;
2020-07-17 09:23:13 +03:00
case SO_ATTACH_REUSEPORT_CBPF : {
struct sock_fprog fprog ;
2016-01-05 01:41:47 +03:00
2020-07-23 09:08:50 +03:00
ret = copy_bpf_fprog_from_user ( & fprog , optval , optlen ) ;
2020-07-17 09:23:13 +03:00
if ( ! ret )
2016-01-05 01:41:47 +03:00
ret = sk_reuseport_attach_filter ( & fprog , sk ) ;
break ;
2020-07-17 09:23:13 +03:00
}
2016-01-05 01:41:47 +03:00
case SO_ATTACH_REUSEPORT_EBPF :
ret = - EINVAL ;
if ( optlen = = sizeof ( u32 ) ) {
u32 ufd ;
ret = - EFAULT ;
2020-07-23 09:08:50 +03:00
if ( copy_from_sockptr ( & ufd , optval , sizeof ( ufd ) ) )
2016-01-05 01:41:47 +03:00
break ;
ret = sk_reuseport_attach_bpf ( ufd , sk ) ;
}
break ;
2019-06-14 01:00:01 +03:00
case SO_DETACH_REUSEPORT_BPF :
ret = reuseport_detach_prog ( sk ) ;
break ;
2007-04-11 07:10:33 +04:00
case SO_DETACH_FILTER :
2007-10-18 08:21:26 +04:00
ret = sk_detach_filter ( sk ) ;
2007-04-11 07:10:33 +04:00
break ;
2005-04-17 02:20:36 +04:00
2013-01-17 01:55:49 +04:00
case SO_LOCK_FILTER :
if ( sock_flag ( sk , SOCK_FILTER_LOCKED ) & & ! valbool )
ret = - EPERM ;
else
sock_valbool_flag ( sk , SOCK_FILTER_LOCKED , valbool ) ;
break ;
2007-04-11 07:10:33 +04:00
case SO_PASSSEC :
if ( valbool )
set_bit ( SOCK_PASSSEC , & sock - > flags ) ;
else
clear_bit ( SOCK_PASSSEC , & sock - > flags ) ;
break ;
2008-01-31 06:08:16 +03:00
case SO_MARK :
2018-11-08 17:13:35 +03:00
if ( ! ns_capable ( sock_net ( sk ) - > user_ns , CAP_NET_ADMIN ) ) {
2008-01-31 06:08:16 +03:00
ret = - EPERM ;
2021-06-01 00:00:30 +03:00
break ;
2018-11-08 17:13:35 +03:00
}
2021-06-01 00:00:30 +03:00
__sock_set_mark ( sk , val ) ;
2008-01-31 06:08:16 +03:00
break ;
[AF_UNIX]: Datagram getpeersec
This patch implements an API whereby an application can determine the
label of its peer's Unix datagram sockets via the auxiliary data mechanism of
recvmsg.
Patch purpose:
This patch enables a security-aware application to retrieve the
security context of the peer of a Unix datagram socket. The application
can then use this security context to determine the security context for
processing on behalf of the peer who sent the packet.
Patch design and implementation:
The design and implementation is very similar to the UDP case for INET
sockets. Basically we build upon the existing Unix domain socket API for
retrieving user credentials. Linux offers the API for obtaining user
credentials via ancillary messages (i.e., out of band/control messages
that are bundled together with a normal message). To retrieve the security
context, the application first indicates to the kernel such desire by
setting the SO_PASSSEC option via setsockopt. Then the application
retrieves the security context using the auxiliary data mechanism.
An example server application for Unix datagram socket should look like this:
toggle = 1;
toggle_len = sizeof(toggle);
setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len);
recvmsg(sockfd, &msg_hdr, 0);
if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) {
cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr);
if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) &&
cmsg_hdr->cmsg_level == SOL_SOCKET &&
cmsg_hdr->cmsg_type == SCM_SECURITY) {
memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext));
}
}
sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow
a server socket to receive security context of the peer.
Testing:
We have tested the patch by setting up Unix datagram client and server
applications. We verified that the server can retrieve the security context
using the auxiliary data mechanism of recvmsg.
Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-29 23:27:47 +04:00
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch. It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
case SO_RXQ_OVFL :
2011-10-07 07:30:20 +04:00
sock_valbool_flag ( sk , SOCK_RXQ_OVFL , valbool ) ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch. It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
break ;
2011-11-09 13:15:42 +04:00
case SO_WIFI_STATUS :
sock_valbool_flag ( sk , SOCK_WIFI_STATUS , valbool ) ;
break ;
2012-02-21 11:31:34 +04:00
case SO_PEEK_OFF :
if ( sock - > ops - > set_peek_off )
2013-12-08 02:26:27 +04:00
ret = sock - > ops - > set_peek_off ( sk , val ) ;
2012-02-21 11:31:34 +04:00
else
ret = - EOPNOTSUPP ;
break ;
2012-02-11 19:39:30 +04:00
case SO_NOFCS :
sock_valbool_flag ( sk , SOCK_NOFCS , valbool ) ;
break ;
2013-03-28 15:19:25 +04:00
case SO_SELECT_ERR_QUEUE :
sock_valbool_flag ( sk , SOCK_SELECT_ERR_QUEUE , valbool ) ;
break ;
2013-08-01 07:10:25 +04:00
# ifdef CONFIG_NET_RX_BUSY_POLL
2013-07-10 18:13:36 +04:00
case SO_BUSY_POLL :
2013-06-14 17:33:57 +04:00
/* allow unprivileged users to decrease the value */
if ( ( val > sk - > sk_ll_usec ) & & ! capable ( CAP_NET_ADMIN ) )
ret = - EPERM ;
else {
if ( val < 0 )
ret = - EINVAL ;
else
2021-06-29 17:12:45 +03:00
WRITE_ONCE ( sk - > sk_ll_usec , val ) ;
2013-06-14 17:33:57 +04:00
}
break ;
net: Introduce preferred busy-polling
The existing busy-polling mode, enabled by the SO_BUSY_POLL socket
option or system-wide using the /proc/sys/net/core/busy_read knob, is
opportunistic. That means that if the NAPI context is not
scheduled, it will poll it. If, after busy-polling, the budget is
exceeded the busy-polling logic will schedule the NAPI onto the
regular softirq handling.
One implication of the behavior above is that a busy/heavy loaded NAPI
context will never enter/allow for busy-polling. Some applications
prefer that most NAPI processing would be done by busy-polling.
This series adds a new socket option, SO_PREFER_BUSY_POLL, that works
in concert with the napi_defer_hard_irqs and gro_flush_timeout
knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were
introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral
feature"), and allows for a user to defer interrupts to be enabled and
instead schedule the NAPI context from a watchdog timer. When a user
enables the SO_PREFER_BUSY_POLL, again with the other knobs enabled,
and the NAPI context is being processed by a softirq, the softirq NAPI
processing will exit early to allow the busy-polling to be performed.
If the application stops performing busy-polling via a system call,
the watchdog timer defined by gro_flush_timeout will timeout, and
regular softirq handling will resume.
In summary; Heavy traffic applications that prefer busy-polling over
softirq processing should use this option.
Example usage:
$ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs
$ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout
Note that the timeout should be larger than the userspace processing
window, otherwise the watchdog will timeout and fall back to regular
softirq processing.
Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com
2020-11-30 21:51:56 +03:00
case SO_PREFER_BUSY_POLL :
if ( valbool & & ! capable ( CAP_NET_ADMIN ) )
ret = - EPERM ;
else
WRITE_ONCE ( sk - > sk_prefer_busy_poll , valbool ) ;
break ;
2020-11-30 21:51:57 +03:00
case SO_BUSY_POLL_BUDGET :
if ( val > READ_ONCE ( sk - > sk_busy_poll_budget ) & & ! capable ( CAP_NET_ADMIN ) ) {
ret = - EPERM ;
} else {
if ( val < 0 | | val > U16_MAX )
ret = - EINVAL ;
else
WRITE_ONCE ( sk - > sk_busy_poll_budget , val ) ;
}
break ;
2013-06-14 17:33:57 +04:00
# endif
2013-09-24 19:20:52 +04:00
case SO_MAX_PACING_RATE :
2019-03-01 02:17:27 +03:00
{
2020-10-22 09:41:46 +03:00
unsigned long ulval = ( val = = ~ 0U ) ? ~ 0UL : ( unsigned int ) val ;
2019-03-01 02:17:27 +03:00
if ( sizeof ( ulval ) ! = sizeof ( val ) & &
optlen > = sizeof ( ulval ) & &
2020-07-23 09:08:50 +03:00
copy_from_sockptr ( & ulval , optval , sizeof ( ulval ) ) ) {
2019-03-01 02:17:27 +03:00
ret = - EFAULT ;
break ;
}
if ( ulval ! = ~ 0UL )
2017-05-16 14:24:36 +03:00
cmpxchg ( & sk - > sk_pacing_status ,
SK_PACING_NONE ,
SK_PACING_NEEDED ) ;
2019-03-01 02:17:27 +03:00
sk - > sk_max_pacing_rate = ulval ;
sk - > sk_pacing_rate = min ( sk - > sk_pacing_rate , ulval ) ;
2013-09-24 19:20:52 +04:00
break ;
2019-03-01 02:17:27 +03:00
}
2015-10-09 05:33:21 +03:00
case SO_INCOMING_CPU :
2019-10-30 23:00:04 +03:00
WRITE_ONCE ( sk - > sk_incoming_cpu , val ) ;
2015-10-09 05:33:21 +03:00
break ;
2016-02-24 21:02:52 +03:00
case SO_CNX_ADVICE :
if ( val = = 1 )
dst_negative_advice ( sk ) ;
break ;
2017-08-03 23:29:40 +03:00
case SO_ZEROCOPY :
2018-02-15 21:49:34 +03:00
if ( sk - > sk_family = = PF_INET | | sk - > sk_family = = PF_INET6 ) {
2021-11-15 22:02:33 +03:00
if ( ! ( sk_is_tcp ( sk ) | |
2018-11-30 23:32:39 +03:00
( sk - > sk_type = = SOCK_DGRAM & &
sk - > sk_protocol = = IPPROTO_UDP ) ) )
2018-02-15 21:49:34 +03:00
ret = - ENOTSUPP ;
} else if ( sk - > sk_family ! = PF_RDS ) {
2017-08-03 23:29:40 +03:00
ret = - ENOTSUPP ;
2018-02-15 21:49:34 +03:00
}
if ( ! ret ) {
if ( val < 0 | | val > 1 )
ret = - EINVAL ;
else
sock_valbool_flag ( sk , SOCK_ZEROCOPY , valbool ) ;
}
2018-03-07 20:40:57 +03:00
break ;
2018-07-04 01:42:48 +03:00
case SO_TXTIME :
2020-05-07 20:05:39 +03:00
if ( optlen ! = sizeof ( struct sock_txtime ) ) {
2018-07-04 01:42:48 +03:00
ret = - EINVAL ;
2020-05-07 20:05:39 +03:00
break ;
2020-07-23 09:08:50 +03:00
} else if ( copy_from_sockptr ( & sk_txtime , optval ,
2018-07-04 01:42:48 +03:00
sizeof ( struct sock_txtime ) ) ) {
ret = - EFAULT ;
2020-05-07 20:05:39 +03:00
break ;
2018-07-04 01:42:48 +03:00
} else if ( sk_txtime . flags & ~ SOF_TXTIME_FLAGS_MASK ) {
ret = - EINVAL ;
2020-05-07 20:05:39 +03:00
break ;
}
/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
* scheduler has enough safe guards .
*/
if ( sk_txtime . clockid ! = CLOCK_MONOTONIC & &
! ns_capable ( sock_net ( sk ) - > user_ns , CAP_NET_ADMIN ) ) {
ret = - EPERM ;
break ;
2018-07-04 01:42:48 +03:00
}
2020-05-07 20:05:39 +03:00
sock_valbool_flag ( sk , SOCK_TXTIME , true ) ;
sk - > sk_clockid = sk_txtime . clockid ;
sk - > sk_txtime_deadline_mode =
! ! ( sk_txtime . flags & SOF_TXTIME_DEADLINE_MODE ) ;
sk - > sk_txtime_report_errors =
! ! ( sk_txtime . flags & SOF_TXTIME_REPORT_ERRORS ) ;
2018-07-04 01:42:48 +03:00
break ;
net: introduce SO_BINDTOIFINDEX sockopt
This introduces a new generic SOL_SOCKET-level socket option called
SO_BINDTOIFINDEX. It behaves similar to SO_BINDTODEVICE, but takes a
network interface index as argument, rather than the network interface
name.
User-space often refers to network-interfaces via their index, but has
to temporarily resolve it to a name for a call into SO_BINDTODEVICE.
This might pose problems when the network-device is renamed
asynchronously by other parts of the system. When this happens, the
SO_BINDTODEVICE might either fail, or worse, it might bind to the wrong
device.
In most cases user-space only ever operates on devices which they
either manage themselves, or otherwise have a guarantee that the device
name will not change (e.g., devices that are UP cannot be renamed).
However, particularly in libraries this guarantee is non-obvious and it
would be nice if that race-condition would simply not exist. It would
make it easier for those libraries to operate even in situations where
the device-name might change under the hood.
A real use-case that we recently hit is trying to start the network
stack early in the initrd but make it survive into the real system.
Existing distributions rename network-interfaces during the transition
from initrd into the real system. This, obviously, cannot affect
devices that are up and running (unless you also consider moving them
between network-namespaces). However, the network manager now has to
make sure its management engine for dormant devices will not run in
parallel to these renames. Particularly, when you offload operations
like DHCP into separate processes, these might setup their sockets
early, and thus have to resolve the device-name possibly running into
this race-condition.
By avoiding a call to resolve the device-name, we no longer depend on
the name and can run network setup of dormant devices in parallel to
the transition off the initrd. The SO_BINDTOIFINDEX ioctl plugs this
race.
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-01-15 16:42:14 +03:00
case SO_BINDTOIFINDEX :
2020-05-28 08:12:13 +03:00
ret = sock_bindtoindex_locked ( sk , val ) ;
net: introduce SO_BINDTOIFINDEX sockopt
This introduces a new generic SOL_SOCKET-level socket option called
SO_BINDTOIFINDEX. It behaves similar to SO_BINDTODEVICE, but takes a
network interface index as argument, rather than the network interface
name.
User-space often refers to network-interfaces via their index, but has
to temporarily resolve it to a name for a call into SO_BINDTODEVICE.
This might pose problems when the network-device is renamed
asynchronously by other parts of the system. When this happens, the
SO_BINDTODEVICE might either fail, or worse, it might bind to the wrong
device.
In most cases user-space only ever operates on devices which they
either manage themselves, or otherwise have a guarantee that the device
name will not change (e.g., devices that are UP cannot be renamed).
However, particularly in libraries this guarantee is non-obvious and it
would be nice if that race-condition would simply not exist. It would
make it easier for those libraries to operate even in situations where
the device-name might change under the hood.
A real use-case that we recently hit is trying to start the network
stack early in the initrd but make it survive into the real system.
Existing distributions rename network-interfaces during the transition
from initrd into the real system. This, obviously, cannot affect
devices that are up and running (unless you also consider moving them
between network-namespaces). However, the network manager now has to
make sure its management engine for dormant devices will not run in
parallel to these renames. Particularly, when you offload operations
like DHCP into separate processes, these might setup their sockets
early, and thus have to resolve the device-name possibly running into
this race-condition.
By avoiding a call to resolve the device-name, we no longer depend on
the name and can run network setup of dormant devices in parallel to
the transition off the initrd. The SO_BINDTOIFINDEX ioctl plugs this
race.
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-01-15 16:42:14 +03:00
break ;
2021-08-04 10:55:56 +03:00
case SO_BUF_LOCK :
if ( val & ~ SOCK_BUF_LOCK_MASK ) {
ret = - EINVAL ;
break ;
}
sk - > sk_userlocks = val | ( sk - > sk_userlocks &
~ SOCK_BUF_LOCK_MASK ) ;
break ;
2021-09-29 20:25:11 +03:00
case SO_RESERVE_MEM :
{
int delta ;
if ( val < 0 ) {
ret = - EINVAL ;
break ;
}
delta = val - sk - > sk_reserved_mem ;
if ( delta < 0 )
sock_release_reserved_memory ( sk , - delta ) ;
else
ret = sock_reserve_memory ( sk , delta ) ;
break ;
}
2007-04-11 07:10:33 +04:00
default :
ret = - ENOPROTOOPT ;
break ;
2007-02-09 17:24:36 +03:00
}
2005-04-17 02:20:36 +04:00
release_sock ( sk ) ;
return ret ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_setsockopt ) ;
2005-04-17 02:20:36 +04:00
2021-09-30 01:57:50 +03:00
static const struct cred * sk_get_peer_cred ( struct sock * sk )
{
const struct cred * cred ;
spin_lock ( & sk - > sk_peer_lock ) ;
cred = get_cred ( sk - > sk_peer_cred ) ;
spin_unlock ( & sk - > sk_peer_lock ) ;
return cred ;
}
2005-04-17 02:20:36 +04:00
2014-01-03 21:17:14 +04:00
/*
 * Fill the user-visible @ucred from a kernel @pid / @cred pair.
 *
 * @pid:   translated with pid_vnr() and stored in ucred->pid.
 * @cred:  may be NULL; when set, its euid/egid are translated with
 *         from_kuid_munged()/from_kgid_munged() against current_user_ns().
 * @ucred: output structure; all three fields (pid, uid, gid) are written.
 */
static void cred_to_ucred ( struct pid * pid , const struct cred * cred ,
struct ucred * ucred )
2010-06-13 07:28:59 +04:00
{
ucred - > pid = pid_vnr ( pid ) ;
/* Default both ids to -1; they stay that way when no cred is supplied. */
ucred - > uid = ucred - > gid = - 1 ;
if ( cred ) {
/* Ids are reported relative to the caller's user namespace. */
struct user_namespace * current_ns = current_user_ns ( ) ;
2012-05-24 02:39:45 +04:00
ucred - > uid = from_kuid_munged ( current_ns , cred - > euid ) ;
ucred - > gid = from_kgid_munged ( current_ns , cred - > egid ) ;
2010-06-13 07:28:59 +04:00
}
}
net: introduce SO_PEERGROUPS getsockopt
This adds the new getsockopt(2) option SO_PEERGROUPS on SOL_SOCKET to
retrieve the auxiliary groups of the remote peer. It is designed to
naturally extend SO_PEERCRED. That is, the underlying data is from the
same credentials. Regarding its syntax, it is based on SO_PEERSEC. That
is, if the provided buffer is too small, ERANGE is returned and @optlen
is updated. Otherwise, the information is copied, @optlen is set to the
actual size, and 0 is returned.
While SO_PEERCRED (and thus `struct ucred') already returns the primary
group, it lacks the auxiliary group vector. However, nearly all access
controls (including kernel side VFS and SYSVIPC, but also user-space
polkit, DBus, ...) consider the entire set of groups, rather than just
the primary group. But this is currently not possible with pure
SO_PEERCRED. Instead, user-space has to work around this and query the
system database for the auxiliary groups of a UID retrieved via
SO_PEERCRED.
Unfortunately, there is no race-free way to query the auxiliary groups
of the PID/UID retrieved via SO_PEERCRED. Hence, the current user-space
solution is to use getgrouplist(3p), which itself falls back to NSS and
whatever is configured in nsswitch.conf(3). This effectively checks
which groups we *would* assign to the user if it logged in *now*. On
normal systems it is as easy as reading /etc/group, but with NSS it can
resort to querying network databases (e.g., LDAP), using IPC or network
communication.
Long story short: Whenever we want to use auxiliary groups for access
checks on IPC, we need further IPC to talk to the user/group databases,
rather than just relying on SO_PEERCRED and the incoming socket. This
is unfortunate, and might even result in dead-locks if the database
query uses the same IPC as the original request.
So far, those recursions / dead-locks have been avoided by using
primitive IPC for all crucial NSS modules. However, we want to avoid
re-inventing the wheel for each NSS module that might be involved in
user/group queries. Hence, we would preferably make DBus (and other IPC
that supports access-management based on groups) work without resorting
to the user/group database. This new SO_PEERGROUPS ioctl would allow us
to make dbus-daemon work without ever calling into NSS.
Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Simon McVittie <simon.mcvittie@collabora.co.uk>
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-21 11:47:15 +03:00
/*
 * Copy the group list in @src out to the user-space array @dst.
 *
 * Each src->gid[] entry is translated with from_kgid_munged() against
 * current_user_ns() and stored at the matching index of @dst via put_user().
 *
 * Returns 0 once all src->ngroups entries have been written, or -EFAULT as
 * soon as a put_user() fails (entries written before the failure are left
 * in place).
 */
static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *ns = current_user_ns();
	int idx = 0;

	while (idx < src->ngroups) {
		gid_t mapped = from_kgid_munged(ns, src->gid[idx]);

		if (put_user(mapped, dst + idx))
			return -EFAULT;
		idx++;
	}

	return 0;
}
2005-04-17 02:20:36 +04:00
int sock_getsockopt ( struct socket * sock , int level , int optname ,
char __user * optval , int __user * optlen )
{
struct sock * sk = sock - > sk ;
2007-02-09 17:24:36 +03:00
2007-04-11 07:10:33 +04:00
union {
2007-02-09 17:24:36 +03:00
int val ;
2017-04-06 05:00:55 +03:00
u64 val64 ;
2019-03-01 02:17:28 +03:00
unsigned long ulval ;
2007-02-09 17:24:36 +03:00
struct linger ling ;
2019-02-02 18:34:44 +03:00
struct old_timeval32 tm32 ;
struct __kernel_old_timeval tm ;
2019-02-02 18:34:54 +03:00
struct __kernel_sock_timeval stm ;
2018-07-04 01:42:48 +03:00
struct sock_txtime txtime ;
2021-06-30 11:11:59 +03:00
struct so_timestamping timestamping ;
2005-04-17 02:20:36 +04:00
} v ;
2007-02-09 17:24:36 +03:00
2010-01-15 12:08:58 +03:00
int lv = sizeof ( int ) ;
2005-04-17 02:20:36 +04:00
int len ;
2007-02-09 17:24:36 +03:00
2007-04-11 07:10:33 +04:00
if ( get_user ( len , optlen ) )
2007-02-09 17:24:36 +03:00
return - EFAULT ;
2007-04-11 07:10:33 +04:00
if ( len < 0 )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2007-02-09 17:24:36 +03:00
2009-02-24 02:38:41 +03:00
memset ( & v , 0 , sizeof ( v ) ) ;
net: 4 bytes kernel memory disclosure in SO_BSDCOMPAT gsopt try #2
In function sock_getsockopt() located in net/core/sock.c, optval v.val
is not correctly initialized and directly returned in userland in case
we have SO_BSDCOMPAT option set.
This dummy code should trigger the bug:
int main(void)
{
unsigned char buf[4] = { 0, 0, 0, 0 };
int len;
int sock;
sock = socket(33, 2, 2);
getsockopt(sock, 1, SO_BSDCOMPAT, &buf, &len);
printf("%x%x%x%x\n", buf[0], buf[1], buf[2], buf[3]);
close(sock);
}
Here is a patch that fixes this bug by initializing v.val just after its
declaration.
Signed-off-by: Clément Lecigne <clement.lecigne@netasq.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-02-13 03:59:09 +03:00
2009-05-27 15:30:05 +04:00
switch ( optname ) {
2007-04-11 07:10:33 +04:00
case SO_DEBUG :
v . val = sock_flag ( sk , SOCK_DBG ) ;
break ;
case SO_DONTROUTE :
v . val = sock_flag ( sk , SOCK_LOCALROUTE ) ;
break ;
case SO_BROADCAST :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_BROADCAST ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_SNDBUF :
v . val = sk - > sk_sndbuf ;
break ;
case SO_RCVBUF :
v . val = sk - > sk_rcvbuf ;
break ;
case SO_REUSEADDR :
v . val = sk - > sk_reuse ;
break ;
2013-01-22 13:49:50 +04:00
case SO_REUSEPORT :
v . val = sk - > sk_reuseport ;
break ;
2007-04-11 07:10:33 +04:00
case SO_KEEPALIVE :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_KEEPOPEN ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_TYPE :
v . val = sk - > sk_type ;
break ;
2009-08-04 11:28:28 +04:00
case SO_PROTOCOL :
v . val = sk - > sk_protocol ;
break ;
2009-08-04 11:28:29 +04:00
case SO_DOMAIN :
v . val = sk - > sk_family ;
break ;
2007-04-11 07:10:33 +04:00
case SO_ERROR :
v . val = - sock_error ( sk ) ;
2009-05-27 15:30:05 +04:00
if ( v . val = = 0 )
2007-04-11 07:10:33 +04:00
v . val = xchg ( & sk - > sk_err_soft , 0 ) ;
break ;
case SO_OOBINLINE :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_URGINLINE ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_NO_CHECK :
2014-05-23 19:47:19 +04:00
v . val = sk - > sk_no_check_tx ;
2007-04-11 07:10:33 +04:00
break ;
case SO_PRIORITY :
v . val = sk - > sk_priority ;
break ;
case SO_LINGER :
lv = sizeof ( v . ling ) ;
2012-05-16 09:57:07 +04:00
v . ling . l_onoff = sock_flag ( sk , SOCK_LINGER ) ;
2007-04-11 07:10:33 +04:00
v . ling . l_linger = sk - > sk_lingertime / HZ ;
break ;
case SO_BSDCOMPAT :
break ;
2019-02-02 18:34:46 +03:00
case SO_TIMESTAMP_OLD :
2007-03-26 09:14:49 +04:00
v . val = sock_flag ( sk , SOCK_RCVTSTAMP ) & &
2019-02-02 18:34:50 +03:00
! sock_flag ( sk , SOCK_TSTAMP_NEW ) & &
2007-03-26 09:14:49 +04:00
! sock_flag ( sk , SOCK_RCVTSTAMPNS ) ;
break ;
2019-02-02 18:34:46 +03:00
case SO_TIMESTAMPNS_OLD :
2019-02-02 18:34:50 +03:00
v . val = sock_flag ( sk , SOCK_RCVTSTAMPNS ) & & ! sock_flag ( sk , SOCK_TSTAMP_NEW ) ;
break ;
case SO_TIMESTAMP_NEW :
v . val = sock_flag ( sk , SOCK_RCVTSTAMP ) & & sock_flag ( sk , SOCK_TSTAMP_NEW ) ;
break ;
case SO_TIMESTAMPNS_NEW :
v . val = sock_flag ( sk , SOCK_RCVTSTAMPNS ) & & sock_flag ( sk , SOCK_TSTAMP_NEW ) ;
2007-04-11 07:10:33 +04:00
break ;
2019-02-02 18:34:46 +03:00
case SO_TIMESTAMPING_OLD :
2021-06-30 11:11:59 +03:00
lv = sizeof ( v . timestamping ) ;
v . timestamping . flags = sk - > sk_tsflags ;
v . timestamping . bind_phc = sk - > sk_bind_phc ;
2009-02-12 08:03:38 +03:00
break ;
2019-02-02 18:34:54 +03:00
case SO_RCVTIMEO_OLD :
case SO_RCVTIMEO_NEW :
lv = sock_get_timeout ( sk - > sk_rcvtimeo , & v , SO_RCVTIMEO_OLD = = optname ) ;
2007-04-11 07:10:33 +04:00
break ;
2019-02-02 18:34:54 +03:00
case SO_SNDTIMEO_OLD :
case SO_SNDTIMEO_NEW :
lv = sock_get_timeout ( sk - > sk_sndtimeo , & v , SO_SNDTIMEO_OLD = = optname ) ;
2007-04-11 07:10:33 +04:00
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_RCVLOWAT :
v . val = sk - > sk_rcvlowat ;
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_SNDLOWAT :
2009-05-27 15:30:05 +04:00
v . val = 1 ;
2007-04-11 07:10:33 +04:00
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_PASSCRED :
2012-04-27 00:07:59 +04:00
v . val = ! ! test_bit ( SOCK_PASSCRED , & sock - > flags ) ;
2007-04-11 07:10:33 +04:00
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_PEERCRED :
2010-06-13 07:30:14 +04:00
{
struct ucred peercred ;
if ( len > sizeof ( peercred ) )
len = sizeof ( peercred ) ;
2021-09-30 01:57:50 +03:00
spin_lock ( & sk - > sk_peer_lock ) ;
2010-06-13 07:30:14 +04:00
cred_to_ucred ( sk - > sk_peer_pid , sk - > sk_peer_cred , & peercred ) ;
2021-09-30 01:57:50 +03:00
spin_unlock ( & sk - > sk_peer_lock ) ;
2010-06-13 07:30:14 +04:00
if ( copy_to_user ( optval , & peercred , len ) )
2007-04-11 07:10:33 +04:00
return - EFAULT ;
goto lenout ;
2010-06-13 07:30:14 +04:00
}
2005-04-17 02:20:36 +04:00
net: introduce SO_PEERGROUPS getsockopt
This adds the new getsockopt(2) option SO_PEERGROUPS on SOL_SOCKET to
retrieve the auxiliary groups of the remote peer. It is designed to
naturally extend SO_PEERCRED. That is, the underlying data is from the
same credentials. Regarding its syntax, it is based on SO_PEERSEC. That
is, if the provided buffer is too small, ERANGE is returned and @optlen
is updated. Otherwise, the information is copied, @optlen is set to the
actual size, and 0 is returned.
While SO_PEERCRED (and thus `struct ucred') already returns the primary
group, it lacks the auxiliary group vector. However, nearly all access
controls (including kernel side VFS and SYSVIPC, but also user-space
polkit, DBus, ...) consider the entire set of groups, rather than just
the primary group. But this is currently not possible with pure
SO_PEERCRED. Instead, user-space has to work around this and query the
system database for the auxiliary groups of a UID retrieved via
SO_PEERCRED.
Unfortunately, there is no race-free way to query the auxiliary groups
of the PID/UID retrieved via SO_PEERCRED. Hence, the current user-space
solution is to use getgrouplist(3p), which itself falls back to NSS and
whatever is configured in nsswitch.conf(3). This effectively checks
which groups we *would* assign to the user if it logged in *now*. On
normal systems it is as easy as reading /etc/group, but with NSS it can
resort to querying network databases (e.g., LDAP), using IPC or network
communication.
Long story short: Whenever we want to use auxiliary groups for access
checks on IPC, we need further IPC to talk to the user/group databases,
rather than just relying on SO_PEERCRED and the incoming socket. This
is unfortunate, and might even result in dead-locks if the database
query uses the same IPC as the original request.
So far, those recursions / dead-locks have been avoided by using
primitive IPC for all crucial NSS modules. However, we want to avoid
re-inventing the wheel for each NSS module that might be involved in
user/group queries. Hence, we would preferably make DBus (and other IPC
that supports access-management based on groups) work without resorting
to the user/group database. This new SO_PEERGROUPS ioctl would allow us
to make dbus-daemon work without ever calling into NSS.
Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Simon McVittie <simon.mcvittie@collabora.co.uk>
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-21 11:47:15 +03:00
case SO_PEERGROUPS :
{
2021-09-30 01:57:50 +03:00
const struct cred * cred ;
net: introduce SO_PEERGROUPS getsockopt
This adds the new getsockopt(2) option SO_PEERGROUPS on SOL_SOCKET to
retrieve the auxiliary groups of the remote peer. It is designed to
naturally extend SO_PEERCRED. That is, the underlying data is from the
same credentials. Regarding its syntax, it is based on SO_PEERSEC. That
is, if the provided buffer is too small, ERANGE is returned and @optlen
is updated. Otherwise, the information is copied, @optlen is set to the
actual size, and 0 is returned.
While SO_PEERCRED (and thus `struct ucred') already returns the primary
group, it lacks the auxiliary group vector. However, nearly all access
controls (including kernel side VFS and SYSVIPC, but also user-space
polkit, DBus, ...) consider the entire set of groups, rather than just
the primary group. But this is currently not possible with pure
SO_PEERCRED. Instead, user-space has to work around this and query the
system database for the auxiliary groups of a UID retrieved via
SO_PEERCRED.
Unfortunately, there is no race-free way to query the auxiliary groups
of the PID/UID retrieved via SO_PEERCRED. Hence, the current user-space
solution is to use getgrouplist(3p), which itself falls back to NSS and
whatever is configured in nsswitch.conf(3). This effectively checks
which groups we *would* assign to the user if it logged in *now*. On
normal systems it is as easy as reading /etc/group, but with NSS it can
resort to querying network databases (e.g., LDAP), using IPC or network
communication.
Long story short: Whenever we want to use auxiliary groups for access
checks on IPC, we need further IPC to talk to the user/group databases,
rather than just relying on SO_PEERCRED and the incoming socket. This
is unfortunate, and might even result in dead-locks if the database
query uses the same IPC as the original request.
So far, those recursions / dead-locks have been avoided by using
primitive IPC for all crucial NSS modules. However, we want to avoid
re-inventing the wheel for each NSS module that might be involved in
user/group queries. Hence, we would preferably make DBus (and other IPC
that supports access-management based on groups) work without resorting
to the user/group database. This new SO_PEERGROUPS ioctl would allow us
to make dbus-daemon work without ever calling into NSS.
Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Simon McVittie <simon.mcvittie@collabora.co.uk>
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-21 11:47:15 +03:00
int ret , n ;
2021-09-30 01:57:50 +03:00
cred = sk_get_peer_cred ( sk ) ;
if ( ! cred )
net: introduce SO_PEERGROUPS getsockopt
This adds the new getsockopt(2) option SO_PEERGROUPS on SOL_SOCKET to
retrieve the auxiliary groups of the remote peer. It is designed to
naturally extend SO_PEERCRED. That is, the underlying data is from the
same credentials. Regarding its syntax, it is based on SO_PEERSEC. That
is, if the provided buffer is too small, ERANGE is returned and @optlen
is updated. Otherwise, the information is copied, @optlen is set to the
actual size, and 0 is returned.
While SO_PEERCRED (and thus `struct ucred') already returns the primary
group, it lacks the auxiliary group vector. However, nearly all access
controls (including kernel side VFS and SYSVIPC, but also user-space
polkit, DBus, ...) consider the entire set of groups, rather than just
the primary group. But this is currently not possible with pure
SO_PEERCRED. Instead, user-space has to work around this and query the
system database for the auxiliary groups of a UID retrieved via
SO_PEERCRED.
Unfortunately, there is no race-free way to query the auxiliary groups
of the PID/UID retrieved via SO_PEERCRED. Hence, the current user-space
solution is to use getgrouplist(3p), which itself falls back to NSS and
whatever is configured in nsswitch.conf(3). This effectively checks
which groups we *would* assign to the user if it logged in *now*. On
normal systems it is as easy as reading /etc/group, but with NSS it can
resort to querying network databases (e.g., LDAP), using IPC or network
communication.
Long story short: Whenever we want to use auxiliary groups for access
checks on IPC, we need further IPC to talk to the user/group databases,
rather than just relying on SO_PEERCRED and the incoming socket. This
is unfortunate, and might even result in dead-locks if the database
query uses the same IPC as the original request.
So far, those recursions / dead-locks have been avoided by using
primitive IPC for all crucial NSS modules. However, we want to avoid
re-inventing the wheel for each NSS module that might be involved in
user/group queries. Hence, we would preferably make DBus (and other IPC
that supports access-management based on groups) work without resorting
to the user/group database. This new SO_PEERGROUPS ioctl would allow us
to make dbus-daemon work without ever calling into NSS.
Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Simon McVittie <simon.mcvittie@collabora.co.uk>
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-21 11:47:15 +03:00
return - ENODATA ;
2021-09-30 01:57:50 +03:00
n = cred - > group_info - > ngroups ;
net: introduce SO_PEERGROUPS getsockopt
This adds the new getsockopt(2) option SO_PEERGROUPS on SOL_SOCKET to
retrieve the auxiliary groups of the remote peer. It is designed to
naturally extend SO_PEERCRED. That is, the underlying data is from the
same credentials. Regarding its syntax, it is based on SO_PEERSEC. That
is, if the provided buffer is too small, ERANGE is returned and @optlen
is updated. Otherwise, the information is copied, @optlen is set to the
actual size, and 0 is returned.
While SO_PEERCRED (and thus `struct ucred') already returns the primary
group, it lacks the auxiliary group vector. However, nearly all access
controls (including kernel side VFS and SYSVIPC, but also user-space
polkit, DBus, ...) consider the entire set of groups, rather than just
the primary group. But this is currently not possible with pure
SO_PEERCRED. Instead, user-space has to work around this and query the
system database for the auxiliary groups of a UID retrieved via
SO_PEERCRED.
Unfortunately, there is no race-free way to query the auxiliary groups
of the PID/UID retrieved via SO_PEERCRED. Hence, the current user-space
solution is to use getgrouplist(3p), which itself falls back to NSS and
whatever is configured in nsswitch.conf(3). This effectively checks
which groups we *would* assign to the user if it logged in *now*. On
normal systems it is as easy as reading /etc/group, but with NSS it can
resort to querying network databases (e.g., LDAP), using IPC or network
communication.
Long story short: Whenever we want to use auxiliary groups for access
checks on IPC, we need further IPC to talk to the user/group databases,
rather than just relying on SO_PEERCRED and the incoming socket. This
is unfortunate, and might even result in dead-locks if the database
query uses the same IPC as the original request.
So far, those recursions / dead-locks have been avoided by using
primitive IPC for all crucial NSS modules. However, we want to avoid
re-inventing the wheel for each NSS module that might be involved in
user/group queries. Hence, we would preferably make DBus (and other IPC
that supports access-management based on groups) work without resorting
to the user/group database. This new SO_PEERGROUPS ioctl would allow us
to make dbus-daemon work without ever calling into NSS.
Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Simon McVittie <simon.mcvittie@collabora.co.uk>
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-21 11:47:15 +03:00
if ( len < n * sizeof ( gid_t ) ) {
len = n * sizeof ( gid_t ) ;
2021-09-30 01:57:50 +03:00
put_cred ( cred ) ;
net: introduce SO_PEERGROUPS getsockopt
This adds the new getsockopt(2) option SO_PEERGROUPS on SOL_SOCKET to
retrieve the auxiliary groups of the remote peer. It is designed to
naturally extend SO_PEERCRED. That is, the underlying data is from the
same credentials. Regarding its syntax, it is based on SO_PEERSEC. That
is, if the provided buffer is too small, ERANGE is returned and @optlen
is updated. Otherwise, the information is copied, @optlen is set to the
actual size, and 0 is returned.
While SO_PEERCRED (and thus `struct ucred') already returns the primary
group, it lacks the auxiliary group vector. However, nearly all access
controls (including kernel side VFS and SYSVIPC, but also user-space
polkit, DBus, ...) consider the entire set of groups, rather than just
the primary group. But this is currently not possible with pure
SO_PEERCRED. Instead, user-space has to work around this and query the
system database for the auxiliary groups of a UID retrieved via
SO_PEERCRED.
Unfortunately, there is no race-free way to query the auxiliary groups
of the PID/UID retrieved via SO_PEERCRED. Hence, the current user-space
solution is to use getgrouplist(3p), which itself falls back to NSS and
whatever is configured in nsswitch.conf(3). This effectively checks
which groups we *would* assign to the user if it logged in *now*. On
normal systems it is as easy as reading /etc/group, but with NSS it can
resort to querying network databases (e.g., LDAP), using IPC or network
communication.
Long story short: Whenever we want to use auxiliary groups for access
checks on IPC, we need further IPC to talk to the user/group databases,
rather than just relying on SO_PEERCRED and the incoming socket. This
is unfortunate, and might even result in dead-locks if the database
query uses the same IPC as the original request.
So far, those recursions / dead-locks have been avoided by using
primitive IPC for all crucial NSS modules. However, we want to avoid
re-inventing the wheel for each NSS module that might be involved in
user/group queries. Hence, we would preferably make DBus (and other IPC
that supports access-management based on groups) work without resorting
to the user/group database. This new SO_PEERGROUPS ioctl would allow us
to make dbus-daemon work without ever calling into NSS.
Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Simon McVittie <simon.mcvittie@collabora.co.uk>
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-21 11:47:15 +03:00
return put_user ( len , optlen ) ? - EFAULT : - ERANGE ;
}
len = n * sizeof ( gid_t ) ;
2021-09-30 01:57:50 +03:00
ret = groups_to_user ( ( gid_t __user * ) optval , cred - > group_info ) ;
put_cred ( cred ) ;
net: introduce SO_PEERGROUPS getsockopt
This adds the new getsockopt(2) option SO_PEERGROUPS on SOL_SOCKET to
retrieve the auxiliary groups of the remote peer. It is designed to
naturally extend SO_PEERCRED. That is, the underlying data is from the
same credentials. Regarding its syntax, it is based on SO_PEERSEC. That
is, if the provided buffer is too small, ERANGE is returned and @optlen
is updated. Otherwise, the information is copied, @optlen is set to the
actual size, and 0 is returned.
While SO_PEERCRED (and thus `struct ucred') already returns the primary
group, it lacks the auxiliary group vector. However, nearly all access
controls (including kernel side VFS and SYSVIPC, but also user-space
polkit, DBus, ...) consider the entire set of groups, rather than just
the primary group. But this is currently not possible with pure
SO_PEERCRED. Instead, user-space has to work around this and query the
system database for the auxiliary groups of a UID retrieved via
SO_PEERCRED.
Unfortunately, there is no race-free way to query the auxiliary groups
of the PID/UID retrieved via SO_PEERCRED. Hence, the current user-space
solution is to use getgrouplist(3p), which itself falls back to NSS and
whatever is configured in nsswitch.conf(3). This effectively checks
which groups we *would* assign to the user if it logged in *now*. On
normal systems it is as easy as reading /etc/group, but with NSS it can
resort to querying network databases (e.g., LDAP), using IPC or network
communication.
Long story short: Whenever we want to use auxiliary groups for access
checks on IPC, we need further IPC to talk to the user/group databases,
rather than just relying on SO_PEERCRED and the incoming socket. This
is unfortunate, and might even result in dead-locks if the database
query uses the same IPC as the original request.
So far, those recursions / dead-locks have been avoided by using
primitive IPC for all crucial NSS modules. However, we want to avoid
re-inventing the wheel for each NSS module that might be involved in
user/group queries. Hence, we would preferably make DBus (and other IPC
that supports access-management based on groups) work without resorting
to the user/group database. This new SO_PEERGROUPS ioctl would allow us
to make dbus-daemon work without ever calling into NSS.
Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Simon McVittie <simon.mcvittie@collabora.co.uk>
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-21 11:47:15 +03:00
if ( ret )
return ret ;
goto lenout ;
}
2007-04-11 07:10:33 +04:00
case SO_PEERNAME :
{
char address [ 128 ] ;
2018-02-12 22:00:20 +03:00
lv = sock - > ops - > getname ( sock , ( struct sockaddr * ) address , 2 ) ;
if ( lv < 0 )
2007-04-11 07:10:33 +04:00
return - ENOTCONN ;
if ( lv < len )
return - EINVAL ;
if ( copy_to_user ( optval , address , len ) )
return - EFAULT ;
goto lenout ;
}
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
/* Dubious BSD thing... Probably nobody even uses it, but
* the UNIX standard wants it for whatever reason . . . - DaveM
*/
case SO_ACCEPTCONN :
v . val = sk - > sk_state = = TCP_LISTEN ;
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_PASSSEC :
2012-04-27 00:07:59 +04:00
v . val = ! ! test_bit ( SOCK_PASSSEC , & sock - > flags ) ;
2007-04-11 07:10:33 +04:00
break ;
[AF_UNIX]: Datagram getpeersec
This patch implements an API whereby an application can determine the
label of its peer's Unix datagram sockets via the auxiliary data mechanism of
recvmsg.
Patch purpose:
This patch enables a security-aware application to retrieve the
security context of the peer of a Unix datagram socket. The application
can then use this security context to determine the security context for
processing on behalf of the peer who sent the packet.
Patch design and implementation:
The design and implementation is very similar to the UDP case for INET
sockets. Basically we build upon the existing Unix domain socket API for
retrieving user credentials. Linux offers the API for obtaining user
credentials via ancillary messages (i.e., out of band/control messages
that are bundled together with a normal message). To retrieve the security
context, the application first indicates to the kernel such desire by
setting the SO_PASSSEC option via setsockopt. Then the application
retrieves the security context using the auxiliary data mechanism.
An example server application for Unix datagram socket should look like this:
toggle = 1;
toggle_len = sizeof(toggle);
setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len);
recvmsg(sockfd, &msg_hdr, 0);
if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) {
cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr);
if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) &&
cmsg_hdr->cmsg_level == SOL_SOCKET &&
cmsg_hdr->cmsg_type == SCM_SECURITY) {
memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext));
}
}
sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow
a server socket to receive security context of the peer.
Testing:
We have tested the patch by setting up Unix datagram client and server
applications. We verified that the server can retrieve the security context
using the auxiliary data mechanism of recvmsg.
Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-29 23:27:47 +04:00
2007-04-11 07:10:33 +04:00
case SO_PEERSEC :
return security_socket_getpeersec_stream ( sock , optval , optlen , len ) ;
2005-04-17 02:20:36 +04:00
2008-01-31 06:08:16 +03:00
case SO_MARK :
v . val = sk - > sk_mark ;
break ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch. It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
case SO_RXQ_OVFL :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_RXQ_OVFL ) ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch. It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
break ;
2011-11-09 13:15:42 +04:00
case SO_WIFI_STATUS :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_WIFI_STATUS ) ;
2011-11-09 13:15:42 +04:00
break ;
2012-02-21 11:31:34 +04:00
case SO_PEEK_OFF :
if ( ! sock - > ops - > set_peek_off )
return - EOPNOTSUPP ;
v . val = sk - > sk_peek_off ;
break ;
2012-02-24 23:48:34 +04:00
case SO_NOFCS :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_NOFCS ) ;
2012-02-24 23:48:34 +04:00
break ;
2012-11-26 09:21:08 +04:00
2012-10-19 03:55:56 +04:00
case SO_BINDTODEVICE :
2012-11-26 09:21:08 +04:00
return sock_getbindtodevice ( sk , optval , optlen , len ) ;
sk-filter: Add ability to get socket filter program (v2)
The SO_ATTACH_FILTER option is set only. I propose to add the get
ability by using SO_ATTACH_FILTER in getsockopt. To be less
irritating to eyes the SO_GET_FILTER alias to it is declared. This
ability is required by checkpoint-restore project to be able to
save full state of a socket.
There are two issues with getting filter back.
First, kernel modifies the sock_filter->code on filter load, thus in
order to return the filter element back to user we have to decode it
into user-visible constants. Fortunately the modification in question
is interconvertible.
Second, the BPF_S_ALU_DIV_K code modifies the command argument k to
speed up the run-time division by doing kernel_k = reciprocal(user_k).
Bad news is that different user_k may result in same kernel_k, so we
can't get the original user_k back. Good news is that we don't have
to do it. What we need to do is calculate a user2_k so that
reciprocal(user2_k) == reciprocal(user_k) == kernel_k
i.e. if it's re-loaded back the compiled again value will be exactly
the same as it was. That said, the user2_k can be calculated like this
user2_k = reciprocal(kernel_k)
with an exception, that if kernel_k == 0, then user2_k == 1.
The optlen argument is treated like this -- when zero, kernel returns
the amount of sock_fprog elements in filter, otherwise it should be
large enough for the sock_fprog array.
changes since v1:
* Declared SO_GET_FILTER in all arch headers
* Added decode of vlan-tag codes
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-01 06:01:48 +04:00
case SO_GET_FILTER :
len = sk_get_filter ( sk , ( struct sock_filter __user * ) optval , len ) ;
if ( len < 0 )
return len ;
goto lenout ;
2012-11-26 09:21:08 +04:00
2013-01-17 01:55:49 +04:00
case SO_LOCK_FILTER :
v . val = sock_flag ( sk , SOCK_FILTER_LOCKED ) ;
break ;
2014-01-17 20:09:45 +04:00
case SO_BPF_EXTENSIONS :
v . val = bpf_tell_extensions ( ) ;
break ;
2013-03-28 15:19:25 +04:00
case SO_SELECT_ERR_QUEUE :
v . val = sock_flag ( sk , SOCK_SELECT_ERR_QUEUE ) ;
break ;
2013-08-01 07:10:25 +04:00
# ifdef CONFIG_NET_RX_BUSY_POLL
2013-07-10 18:13:36 +04:00
case SO_BUSY_POLL :
2013-06-14 17:33:57 +04:00
v . val = sk - > sk_ll_usec ;
break ;
net: Introduce preferred busy-polling
The existing busy-polling mode, enabled by the SO_BUSY_POLL socket
option or system-wide using the /proc/sys/net/core/busy_read knob, is
opportunistic. That means that if the NAPI context is not
scheduled, it will poll it. If, after busy-polling, the budget is
exceeded the busy-polling logic will schedule the NAPI onto the
regular softirq handling.
One implication of the behavior above is that a busy/heavy loaded NAPI
context will never enter/allow for busy-polling. Some applications
prefer that most NAPI processing would be done by busy-polling.
This series adds a new socket option, SO_PREFER_BUSY_POLL, that works
in concert with the napi_defer_hard_irqs and gro_flush_timeout
knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were
introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral
feature"), and allows for a user to defer interrupts to be enabled and
instead schedule the NAPI context from a watchdog timer. When a user
enables the SO_PREFER_BUSY_POLL, again with the other knobs enabled,
and the NAPI context is being processed by a softirq, the softirq NAPI
processing will exit early to allow the busy-polling to be performed.
If the application stops performing busy-polling via a system call,
the watchdog timer defined by gro_flush_timeout will timeout, and
regular softirq handling will resume.
In summary; Heavy traffic applications that prefer busy-polling over
softirq processing should use this option.
Example usage:
$ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs
$ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout
Note that the timeout should be larger than the userspace processing
window, otherwise the watchdog will timeout and fall back to regular
softirq processing.
Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com
2020-11-30 21:51:56 +03:00
case SO_PREFER_BUSY_POLL :
v . val = READ_ONCE ( sk - > sk_prefer_busy_poll ) ;
break ;
2013-06-14 17:33:57 +04:00
# endif
2013-09-24 19:20:52 +04:00
case SO_MAX_PACING_RATE :
2019-03-01 02:17:28 +03:00
if ( sizeof ( v . ulval ) ! = sizeof ( v . val ) & & len > = sizeof ( v . ulval ) ) {
lv = sizeof ( v . ulval ) ;
v . ulval = sk - > sk_max_pacing_rate ;
} else {
/* 32bit version */
v . val = min_t ( unsigned long , sk - > sk_max_pacing_rate , ~ 0U ) ;
}
2013-09-24 19:20:52 +04:00
break ;
net: introduce SO_INCOMING_CPU
Alternative to RPS/RFS is to use hardware support for multiple
queues.
Then split a set of millions of sockets into worker threads, each
one using epoll() to manage events on its own socket pool.
Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
know after accept() or connect() on which queue/cpu a socket is managed.
We normally use one cpu per RX queue (IRQ smp_affinity being properly
set), so remembering on socket structure which cpu delivered last packet
is enough to solve the problem.
After accept(), connect(), or even file descriptor passing around
processes, applications can use :
int cpu;
socklen_t len = sizeof(cpu);
getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
And use this information to put the socket into the right silo
for optimal performance, as all networking stack should run
on the appropriate cpu, without need to send IPI (RPS/RFS).
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-11 16:54:28 +03:00
case SO_INCOMING_CPU :
2019-10-30 23:00:04 +03:00
v . val = READ_ONCE ( sk - > sk_incoming_cpu ) ;
net: introduce SO_INCOMING_CPU
Alternative to RPS/RFS is to use hardware support for multiple
queues.
Then split a set of million of sockets into worker threads, each
one using epoll() to manage events on its own socket pool.
Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
know after accept() or connect() on which queue/cpu a socket is managed.
We normally use one cpu per RX queue (IRQ smp_affinity being properly
set), so remembering on socket structure which cpu delivered last packet
is enough to solve the problem.
After accept(), connect(), or even file descriptor passing around
processes, applications can use :
int cpu;
socklen_t len = sizeof(cpu);
getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
And use this information to put the socket into the right silo
for optimal performance, as all networking stack should run
on the appropriate cpu, without need to send IPI (RPS/RFS).
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-11 16:54:28 +03:00
break ;
2017-03-20 22:22:03 +03:00
case SO_MEMINFO :
{
u32 meminfo [ SK_MEMINFO_VARS ] ;
sk_get_meminfo ( sk , meminfo ) ;
len = min_t ( unsigned int , len , sizeof ( meminfo ) ) ;
if ( copy_to_user ( optval , & meminfo , len ) )
return - EFAULT ;
goto lenout ;
}
2017-03-24 20:08:36 +03:00
# ifdef CONFIG_NET_RX_BUSY_POLL
case SO_INCOMING_NAPI_ID :
v . val = READ_ONCE ( sk - > sk_napi_id ) ;
/* aggregate non-NAPI IDs down to 0 */
if ( v . val < MIN_NAPI_ID )
v . val = 0 ;
break ;
# endif
2017-04-06 05:00:55 +03:00
case SO_COOKIE :
lv = sizeof ( u64 ) ;
if ( len < lv )
return - EINVAL ;
v . val64 = sock_gen_cookie ( sk ) ;
break ;
2017-08-03 23:29:40 +03:00
case SO_ZEROCOPY :
v . val = sock_flag ( sk , SOCK_ZEROCOPY ) ;
break ;
2018-07-04 01:42:48 +03:00
case SO_TXTIME :
lv = sizeof ( v . txtime ) ;
v . txtime . clockid = sk - > sk_clockid ;
v . txtime . flags | = sk - > sk_txtime_deadline_mode ?
SOF_TXTIME_DEADLINE_MODE : 0 ;
2018-07-04 01:43:00 +03:00
v . txtime . flags | = sk - > sk_txtime_report_errors ?
SOF_TXTIME_REPORT_ERRORS : 0 ;
2018-07-04 01:42:48 +03:00
break ;
net: introduce SO_BINDTOIFINDEX sockopt
This introduces a new generic SOL_SOCKET-level socket option called
SO_BINDTOIFINDEX. It behaves similar to SO_BINDTODEVICE, but takes a
network interface index as argument, rather than the network interface
name.
User-space often refers to network-interfaces via their index, but has
to temporarily resolve it to a name for a call into SO_BINDTODEVICE.
This might pose problems when the network-device is renamed
asynchronously by other parts of the system. When this happens, the
SO_BINDTODEVICE might either fail, or worse, it might bind to the wrong
device.
In most cases user-space only ever operates on devices which they
either manage themselves, or otherwise have a guarantee that the device
name will not change (e.g., devices that are UP cannot be renamed).
However, particularly in libraries this guarantee is non-obvious and it
would be nice if that race-condition would simply not exist. It would
make it easier for those libraries to operate even in situations where
the device-name might change under the hood.
A real use-case that we recently hit is trying to start the network
stack early in the initrd but make it survive into the real system.
Existing distributions rename network-interfaces during the transition
from initrd into the real system. This, obviously, cannot affect
devices that are up and running (unless you also consider moving them
between network-namespaces). However, the network manager now has to
make sure its management engine for dormant devices will not run in
parallel to these renames. Particularly, when you offload operations
like DHCP into separate processes, these might setup their sockets
early, and thus have to resolve the device-name possibly running into
this race-condition.
By avoiding a call to resolve the device-name, we no longer depend on
the name and can run network setup of dormant devices in parallel to
the transition off the initrd. The SO_BINDTOIFINDEX ioctl plugs this
race.
Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-01-15 16:42:14 +03:00
case SO_BINDTOIFINDEX :
v . val = sk - > sk_bound_dev_if ;
break ;
2021-06-23 16:56:45 +03:00
case SO_NETNS_COOKIE :
lv = sizeof ( u64 ) ;
if ( len ! = lv )
return - EINVAL ;
v . val64 = sock_net ( sk ) - > net_cookie ;
break ;
2021-08-04 10:55:56 +03:00
case SO_BUF_LOCK :
v . val = sk - > sk_userlocks & SOCK_BUF_LOCK_MASK ;
break ;
2021-09-29 20:25:11 +03:00
case SO_RESERVE_MEM :
v . val = sk - > sk_reserved_mem ;
break ;
2007-04-11 07:10:33 +04:00
default :
2015-03-23 12:04:13 +03:00
/* We implement the SO_SNDLOWAT etc to not be settable
* ( 1003.1 g 7 ) .
*/
2007-04-11 07:10:33 +04:00
return - ENOPROTOOPT ;
2005-04-17 02:20:36 +04:00
}
2007-04-11 07:10:33 +04:00
2005-04-17 02:20:36 +04:00
if ( len > lv )
len = lv ;
if ( copy_to_user ( optval , & v , len ) )
return - EFAULT ;
lenout :
2007-02-09 17:24:36 +03:00
if ( put_user ( len , optlen ) )
return - EFAULT ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
2006-07-03 11:25:35 +04:00
/*
* Initialize an sk_lock .
*
* ( We also register the sk_lock with the lock validator . )
*
* The lock class keys and names handed to sock_lock_init_class_and_name ( )
* are looked up by sk - > sk_family . Sockets with sk - > sk_kern_sock set use
* the af_family_kern_* tables , all other sockets use the af_family_* tables ,
* so both sk_family and sk_kern_sock must be set before this is called .
*/
2007-03-22 22:27:49 +03:00
static inline void sock_lock_init ( struct sock * sk )
2006-07-03 11:25:35 +04:00
{
2017-03-09 11:09:05 +03:00
/* Kernel sockets get their own set of lock classes . */
if ( sk - > sk_kern_sock )
sock_lock_init_class_and_name (
sk ,
af_family_kern_slock_key_strings [ sk - > sk_family ] ,
af_family_kern_slock_keys + sk - > sk_family ,
af_family_kern_key_strings [ sk - > sk_family ] ,
af_family_kern_keys + sk - > sk_family ) ;
else
/* Non-kernel sockets : same lookup , regular per-family tables . */
sock_lock_init_class_and_name (
sk ,
2006-12-07 07:35:24 +03:00
af_family_slock_key_strings [ sk - > sk_family ] ,
af_family_slock_keys + sk - > sk_family ,
af_family_key_strings [ sk - > sk_family ] ,
af_family_keys + sk - > sk_family ) ;
2006-07-03 11:25:35 +04:00
}
2009-07-16 03:13:10 +04:00
/*
* Copy all fields from osk to nsk but nsk - > sk_refcnt must not change yet ,
* even temporarily , because of RCU lookups . sk_node should also be left as is .
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
* We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2009-07-16 03:13:10 +04:00
*/
2007-11-01 10:29:45 +03:00
static void sock_copy ( struct sock * nsk , const struct sock * osk )
{
2020-02-18 20:10:13 +03:00
/* obj_size of the source protocol bounds the second memcpy ( ) below . */
const struct proto * prot = READ_ONCE ( osk - > sk_prot ) ;
2007-11-01 10:29:45 +03:00
# ifdef CONFIG_SECURITY_NETWORK
/* Remember nsk ' s own security pointer : the copy below overwrites
* nsk - > sk_security with osk ' s value and it is restored afterwards .
*/
void * sptr = nsk - > sk_security ;
# endif
2021-01-28 18:02:17 +03:00
/* If we move sk_tx_queue_mapping out of the private section,
* we must check if sk_tx_queue_clear ( ) is called after
* sock_copy ( ) in sk_clone_lock ( ) .
*/
BUILD_BUG_ON ( offsetof ( struct sock , sk_tx_queue_mapping ) <
offsetof ( struct sock , sk_dontcopy_begin ) | |
offsetof ( struct sock , sk_tx_queue_mapping ) > =
offsetof ( struct sock , sk_dontcopy_end ) ) ;
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
/* First range : everything that precedes sk_dontcopy_begin . */
memcpy ( nsk , osk , offsetof ( struct sock , sk_dontcopy_begin ) ) ;
/* Second range : from sk_dontcopy_end up to the end of the
* protocol-sized object ( prot - > obj_size bytes in total ) . The bytes
* in [ sk_dontcopy_begin , sk_dontcopy_end ) are never touched .
*/
memcpy ( & nsk - > sk_dontcopy_end , & osk - > sk_dontcopy_end ,
2020-02-18 20:10:13 +03:00
prot - > obj_size - offsetof ( struct sock , sk_dontcopy_end ) ) ;
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
2007-11-01 10:29:45 +03:00
# ifdef CONFIG_SECURITY_NETWORK
/* Put nsk ' s own security pointer back , then let the security layer
* clone osk ' s state into it .
*/
nsk - > sk_security = sptr ;
security_sk_clone ( osk , nsk ) ;
# endif
}
2007-11-01 10:36:26 +03:00
/*
* Allocate the memory for one socket of protocol @prot .
*
* The object comes from prot - > slab when the protocol has one , otherwise
* from kmalloc ( ) with prot - > obj_size bytes . On success the security blob
* has been set up through security_sk_alloc ( ) and a reference on
* prot - > owner has been taken with try_module_get ( ) .
*
* Returns the new object , or NULL if the allocation , security_sk_alloc ( )
* or try_module_get ( ) fails ; on those failures everything acquired here is
* released again before returning .
*/
static struct sock * sk_prot_alloc ( struct proto * prot , gfp_t priority ,
int family )
2007-11-01 10:33:50 +03:00
{
struct sock * sk ;
struct kmem_cache * slab ;
slab = prot - > slab ;
2009-07-08 23:36:05 +04:00
if ( slab ! = NULL ) {
/* __GFP_ZERO is masked off for the slab allocator ; clearing , when
* requested , is done by sk_prot_clear_nulls ( ) below instead .
* NOTE(review): presumably so that fields which must survive for
* lockless lookups are not wiped - confirm against
* sk_prot_clear_nulls ( ) .
*/
sk = kmem_cache_alloc ( slab , priority & ~ __GFP_ZERO ) ;
if ( ! sk )
return sk ;
mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options
Patch series "add init_on_alloc/init_on_free boot options", v10.
Provide init_on_alloc and init_on_free boot options.
These are aimed at preventing possible information leaks and making the
control-flow bugs that depend on uninitialized values more deterministic.
Enabling either of the options guarantees that the memory returned by the
page allocator and SL[AU]B is initialized with zeroes. SLOB allocator
isn't supported at the moment, as its emulation of kmem caches complicates
handling of SLAB_TYPESAFE_BY_RCU caches correctly.
Enabling init_on_free also guarantees that pages and heap objects are
initialized right after they're freed, so it won't be possible to access
stale data by using a dangling pointer.
As suggested by Michal Hocko, right now we don't let the heap users to
disable initialization for certain allocations. There's not enough
evidence that doing so can speed up real-life cases, and introducing ways
to opt-out may result in things going out of control.
This patch (of 2):
The new options are needed to prevent possible information leaks and make
control-flow bugs that depend on uninitialized values more deterministic.
This is expected to be on-by-default on Android and Chrome OS. And it
gives the opportunity for anyone else to use it under distros too via the
boot args. (The init_on_free feature is regularly requested by folks
where memory forensics is included in their threat models.)
init_on_alloc=1 makes the kernel initialize newly allocated pages and heap
objects with zeroes. Initialization is done at allocation time at the
places where checks for __GFP_ZERO are performed.
init_on_free=1 makes the kernel initialize freed pages and heap objects
with zeroes upon their deletion. This helps to ensure sensitive data
doesn't leak via use-after-free accesses.
Both init_on_alloc=1 and init_on_free=1 guarantee that the allocator
returns zeroed memory. The two exceptions are slab caches with
constructors and SLAB_TYPESAFE_BY_RCU flag. Those are never
zero-initialized to preserve their semantics.
Both init_on_alloc and init_on_free default to zero, but those defaults
can be overridden with CONFIG_INIT_ON_ALLOC_DEFAULT_ON and
CONFIG_INIT_ON_FREE_DEFAULT_ON.
If either SLUB poisoning or page poisoning is enabled, those options take
precedence over init_on_alloc and init_on_free: initialization is only
applied to unpoisoned allocations.
Slowdown for the new features compared to init_on_free=0, init_on_alloc=0:
hackbench, init_on_free=1: +7.62% sys time (st.err 0.74%)
hackbench, init_on_alloc=1: +7.75% sys time (st.err 2.14%)
Linux build with -j12, init_on_free=1: +8.38% wall time (st.err 0.39%)
Linux build with -j12, init_on_free=1: +24.42% sys time (st.err 0.52%)
Linux build with -j12, init_on_alloc=1: -0.13% wall time (st.err 0.42%)
Linux build with -j12, init_on_alloc=1: +0.57% sys time (st.err 0.40%)
The slowdown for init_on_free=0, init_on_alloc=0 compared to the baseline
is within the standard error.
The new features are also going to pave the way for hardware memory
tagging (e.g. arm64's MTE), which will require both on_alloc and on_free
hooks to set the tags for heap objects. With MTE, tagging will have the
same cost as memory initialization.
Although init_on_free is rather costly, there are paranoid use-cases where
in-memory data lifetime is desired to be minimized. There are various
arguments for/against the realism of the associated threat models, but
given that we'll need the infrastructure for MTE anyway, and there are
people who want wipe-on-free behavior no matter what the performance cost,
it seems reasonable to include it in this series.
[glider@google.com: v8]
Link: http://lkml.kernel.org/r/20190626121943.131390-2-glider@google.com
[glider@google.com: v9]
Link: http://lkml.kernel.org/r/20190627130316.254309-2-glider@google.com
[glider@google.com: v10]
Link: http://lkml.kernel.org/r/20190628093131.199499-2-glider@google.com
Link: http://lkml.kernel.org/r/20190617151050.92663-2-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Michal Hocko <mhocko@suse.cz> [page and dmapool parts
Acked-by: James Morris <jamorris@linux.microsoft.com>]
Cc: Christoph Lameter <cl@linux.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Sandeep Patil <sspatil@android.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Jann Horn <jannh@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 06:59:19 +03:00
/* The decision uses the caller ' s original flags , i.e. with
* __GFP_ZERO still present if the caller asked for it .
*/
if ( want_init_on_alloc ( priority ) )
2016-08-23 21:39:29 +03:00
sk_prot_clear_nulls ( sk , prot - > obj_size ) ;
2010-12-17 01:26:56 +03:00
} else
2007-11-01 10:33:50 +03:00
/* No slab cache : plain kmalloc ( ) with the flags passed unchanged . */
sk = kmalloc ( prot - > obj_size , priority ) ;
2007-11-01 10:36:26 +03:00
if ( sk ! = NULL ) {
if ( security_sk_alloc ( sk , family , priority ) )
goto out_free ;
if ( ! try_module_get ( prot - > owner ) )
goto out_free_sec ;
}
2007-11-01 10:33:50 +03:00
return sk ;
2007-11-01 10:36:26 +03:00
/* Error unwind : release in the reverse order of acquisition . */
out_free_sec :
security_sk_free ( sk ) ;
out_free :
/* Hand the object back to whichever allocator it came from . */
if ( slab ! = NULL )
kmem_cache_free ( slab , sk ) ;
else
kfree ( sk ) ;
return NULL ;
2007-11-01 10:33:50 +03:00
}
/*
* Release a socket object that was obtained from sk_prot_alloc ( ) .
*
* Frees the cgroup , memory-cgroup and security state attached to @sk ,
* returns the object to prot - > slab ( or kfree ( ) when the protocol has no
* slab ) and finally drops the reference on prot - > owner that
* sk_prot_alloc ( ) took with try_module_get ( ) .
*/
static void sk_prot_free ( struct proto * prot , struct sock * sk )
{
struct kmem_cache * slab ;
2007-11-01 10:36:26 +03:00
struct module * owner ;
2007-11-01 10:33:50 +03:00
2007-11-01 10:36:26 +03:00
/* Both values are read up front ; module_put ( ) below is the last
* step and uses the saved owner rather than dereferencing prot again .
*/
owner = prot - > owner ;
2007-11-01 10:33:50 +03:00
slab = prot - > slab ;
2007-11-01 10:36:26 +03:00
sock, cgroup: add sock->sk_cgroup
In cgroup v1, dealing with cgroup membership was difficult because the
number of membership associations was unbound. As a result, cgroup v1
grew several controllers whose primary purpose is either tagging
membership or pull in configuration knobs from other subsystems so
that cgroup membership test can be avoided.
net_cls and net_prio controllers are examples of the latter. They
allow configuring network-specific attributes from cgroup side so that
network subsystem can avoid testing cgroup membership; unfortunately,
these are not only cumbersome but also problematic.
Both net_cls and net_prio aren't properly hierarchical. Both inherit
configuration from the parent on creation but there's no interaction
afterwards. An ancestor doesn't restrict the behavior in its subtree
in anyway and configuration changes aren't propagated downwards.
Especially when combined with cgroup delegation, this is problematic
because delegatees can mess up whatever network configuration
implemented at the system level. net_prio would allow the delegatees
to set whatever priority value regardless of CAP_NET_ADMIN and net_cls
the same for classid.
While it is possible to solve these issues from controller side by
implementing hierarchical allowable ranges in both controllers, it
would involve quite a bit of complexity in the controllers and further
obfuscate network configuration as it becomes even more difficult to
tell what's actually being configured looking from the network side.
While not much can be done for v1 at this point, as membership
handling is sane on cgroup v2, it'd be better to make cgroup matching
behave like other network matches and classifiers than introducing
further complications.
In preparation, this patch updates sock->sk_cgrp_data handling so that
it points to the v2 cgroup that sock was created in until either
net_prio or net_cls is used. Once either of the two is used,
sock->sk_cgrp_data reverts to its previous role of carrying prioidx
and classid. This is to avoid adding yet another cgroup related field
to struct sock.
As the mode switching can happen at most once per boot, the switching
mechanism is aimed at lowering hot path overhead. It may leak a
finite, likely small, number of cgroup refs and report spurious
prioidx or classid on switching; however, dynamic updates of prioidx
and classid have always been racy and lossy - socks between creation
and fd installation are never updated, config changes don't update
existing sockets at all, and prioidx may index with dead and recycled
cgroup IDs. Non-critical inaccuracies from small race windows won't
make any noticeable difference.
This patch doesn't make use of the pointer yet. The following patch
will implement netfilter match for cgroup2 membership.
v2: Use sock_cgroup_data to avoid inflating struct sock w/ another
cgroup specific field.
v3: Add comments explaining why sock_data_prioidx() and
sock_data_classid() use different fallback values.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Wagner <daniel.wagner@bmw-carit.de>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-08 01:38:53 +03:00
/* Tear down the per-socket state while sk is still valid memory . */
cgroup_sk_free ( & sk - > sk_cgrp_data ) ;
2016-10-08 03:00:58 +03:00
mem_cgroup_sk_free ( sk ) ;
2007-11-01 10:36:26 +03:00
security_sk_free ( sk ) ;
2007-11-01 10:33:50 +03:00
/* Same allocator choice as in sk_prot_alloc ( ) : slab if present . */
if ( slab ! = NULL )
kmem_cache_free ( slab , sk ) ;
else
kfree ( sk ) ;
2007-11-01 10:36:26 +03:00
module_put ( owner ) ;
2007-11-01 10:33:50 +03:00
}
2005-04-17 02:20:36 +04:00
/**
* sk_alloc - All socket objects are allocated here
2007-10-13 08:17:49 +04:00
* @ net : the applicable net namespace
2005-05-01 19:59:25 +04:00
* @ family : protocol family
* @ priority : for allocation ( % GFP_KERNEL , % GFP_ATOMIC , etc )
* @ prot : struct proto associated with this new sock instance
2015-05-09 05:09:13 +03:00
* @ kern : is this to be a kernel socket ?
2005-04-17 02:20:36 +04:00
*
* Allocates the object through sk_prot_alloc ( ) with __GFP_ZERO added to
* @priority , then fills in the family , protocol and kernel-socket fields ,
* initializes the socket lock , binds the socket to @net and sets up its
* cgroup data . Only non-kernel sockets take a reference on @net and are
* counted via sock_inuse_add ( ) .
*
* Return : the new socket , or NULL if sk_prot_alloc ( ) failed .
*/
2007-10-09 10:24:22 +04:00
struct sock * sk_alloc ( struct net * net , int family , gfp_t priority ,
2015-05-09 05:09:13 +03:00
struct proto * prot , int kern )
2005-04-17 02:20:36 +04:00
{
2007-11-01 10:33:50 +03:00
struct sock * sk ;
2005-04-17 02:20:36 +04:00
2007-11-01 10:38:43 +03:00
sk = sk_prot_alloc ( prot , priority | __GFP_ZERO , family ) ;
2005-04-17 02:20:36 +04:00
if ( sk ) {
2007-11-01 10:38:43 +03:00
sk - > sk_family = family ;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator - acme
*/
sk - > sk_prot = sk - > sk_prot_creator = prot ;
2017-03-09 11:09:05 +03:00
sk - > sk_kern_sock = kern ;
2007-11-01 10:38:43 +03:00
/* sock_lock_init ( ) reads sk_family and sk_kern_sock , both set above . */
sock_lock_init ( sk ) ;
2015-05-09 05:10:31 +03:00
/* Kernel sockets ( kern ! = 0 ) do not hold a reference on the netns . */
sk - > sk_net_refcnt = kern ? 0 : 1 ;
2017-12-14 16:51:58 +03:00
if ( likely ( sk - > sk_net_refcnt ) ) {
2015-05-09 05:10:31 +03:00
get_net ( net ) ;
2017-12-14 16:51:58 +03:00
sock_inuse_add ( net , 1 ) ;
}
2015-05-09 05:10:31 +03:00
sock_net_set ( sk , net ) ;
2017-06-30 13:08:00 +03:00
/* sk_wmem_alloc starts at one , not zero . */
refcount_set ( & sk - > sk_wmem_alloc , 1 ) ;
cls_cgroup: Store classid in struct sock
Up until now cls_cgroup has relied on fetching the classid out of
the current executing thread. This runs into trouble when a packet
processing is delayed in which case it may execute out of another
thread's context.
Furthermore, even when a packet is not delayed we may fail to
classify it if soft IRQs have been disabled, because this scenario
is indistinguishable from one where a packet unrelated to the
current thread is processed by a real soft IRQ.
In fact, the current semantics is inherently broken, as a single
skb may be constructed out of the writes of two different tasks.
A different manifestation of this problem is when the TCP stack
transmits in response of an incoming ACK. This is currently
unclassified.
As we already have a concept of packet ownership for accounting
purposes in the skb->sk pointer, this is a natural place to store
the classid in a persistent manner.
This patch adds the cls_cgroup classid in struct sock, filling up
an existing hole on 64-bit :)
The value is set at socket creation time. So all sockets created
via socket(2) automatically gains the ID of the thread creating it.
Whenever another process touches the socket by either reading or
writing to it, we will change the socket classid to that of the
process if it has a valid (non-zero) classid.
For sockets created on inbound connections through accept(2), we
inherit the classid of the original listening socket through
sk_clone, possibly preceding the actual accept(2) call.
In order to minimise risks, I have not made this the authoritative
classid. For now it is only used as a backup when we execute
with soft IRQs disabled. Once we're completely happy with its
semantics we can use it as the sole classid.
Footnote: I have rearranged the error path on cls_group module
creation. If we didn't do this, then there is a window where
someone could create a tc rule using cls_group before the cgroup
subsystem has been registered.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-24 11:12:34 +04:00
2016-10-08 03:00:58 +03:00
/* Memory-cgroup and cgroup state , then classid / netprio index . */
mem_cgroup_sk_alloc ( sk ) ;
2016-09-20 00:44:38 +03:00
cgroup_sk_alloc ( & sk - > sk_cgrp_data ) ;
2015-12-08 01:38:52 +03:00
sock_update_classid ( & sk - > sk_cgrp_data ) ;
sock_update_netprioidx ( & sk - > sk_cgrp_data ) ;
2020-06-22 23:26:04 +03:00
sk_tx_queue_clear ( sk ) ;
2005-04-17 02:20:36 +04:00
}
2005-09-28 02:23:38 +04:00
2007-11-01 10:36:26 +03:00
return sk ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sk_alloc ) ;
2005-04-17 02:20:36 +04:00
2016-04-01 18:52:12 +03:00
/* Final teardown of a socket, entered through the rcu_head embedded in it.
 *
 * Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 *
 * In order, this function: runs the sk_destruct hook if one is set,
 * uncharges and clears the attached socket filter, disables timestamping,
 * frees the BPF sk-local storage (CONFIG_BPF_SYSCALL builds only), reports
 * any non-zero sk_omem_alloc via pr_debug, drops the sk_frag page, the peer
 * credentials and peer pid, drops the netns reference when sk_net_refcnt is
 * set, and finally hands the object to sk_prot_free() using sk_prot_creator.
 */
static void __sk_destruct ( struct rcu_head * head )
2005-04-17 02:20:36 +04:00
{
2016-04-01 18:52:12 +03:00
/* head is the sk_rcu member of the sock being destroyed */
struct sock * sk = container_of ( head , struct sock , sk_rcu ) ;
2005-04-17 02:20:36 +04:00
struct sk_filter * filter ;
if ( sk - > sk_destruct )
sk - > sk_destruct ( sk ) ;
2010-02-23 04:04:49 +03:00
filter = rcu_dereference_check ( sk - > sk_filter ,
2017-06-30 13:08:00 +03:00
refcount_read ( & sk - > sk_wmem_alloc ) = = 0 ) ;
2005-04-17 02:20:36 +04:00
if ( filter ) {
2007-10-18 08:21:51 +04:00
sk_filter_uncharge ( sk , filter ) ;
2011-08-01 20:19:00 +04:00
RCU_INIT_POINTER ( sk - > sk_filter , NULL ) ;
2005-04-17 02:20:36 +04:00
}
2011-11-28 16:04:18 +04:00
sock_disable_timestamp ( sk , SK_FLAGS_TIMESTAMP ) ;
2005-04-17 02:20:36 +04:00
bpf: Introduce bpf sk local storage
After allowing a bpf prog to
- directly read the skb->sk ptr
- get the fullsock bpf_sock by "bpf_sk_fullsock()"
- get the bpf_tcp_sock by "bpf_tcp_sock()"
- get the listener sock by "bpf_get_listener_sock()"
- avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock"
into different bpf running context.
this patch is another effort to make bpf's network programming
more intuitive to do (together with memory and performance benefit).
When bpf prog needs to store data for a sk, the current practice is to
define a map with the usual 4-tuples (src/dst ip/port) as the key.
If multiple bpf progs require to store different sk data, multiple maps
have to be defined. Hence, wasting memory to store the duplicated
keys (i.e. 4 tuples here) in each of the bpf map.
[ The smallest key could be the sk pointer itself which requires
some enhancement in the verifier and it is a separate topic. ]
Also, the bpf prog needs to clean up the elem when sk is freed.
Otherwise, the bpf map will become full and un-usable quickly.
The sk-free tracking currently could be done during sk state
transition (e.g. BPF_SOCK_OPS_STATE_CB).
The size of the map needs to be predefined which then usually ended-up
with an over-provisioned map in production. Even the map was re-sizable,
while the sk naturally come and go away already, this potential re-size
operation is arguably redundant if the data can be directly connected
to the sk itself instead of proxy-ing through a bpf map.
This patch introduces sk->sk_bpf_storage to provide local storage space
at sk for bpf prog to use. The space will be allocated when the first bpf
prog has created data for this particular sk.
The design optimizes the bpf prog's lookup (and then optionally followed by
an inline update). bpf_spin_lock should be used if the inline update needs
to be protected.
BPF_MAP_TYPE_SK_STORAGE:
-----------------------
To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in
this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE maps can
be created to fit different bpf progs' needs. The map enforces
BTF to allow printing the sk-local-storage during a system-wise
sk dump (e.g. "ss -ta") in the future.
The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete
a "sk-local-storage" data from a particular sk.
Think of the map as a meta-data (or "type") of a "sk-local-storage". This
particular "type" of "sk-local-storage" data can then be stored in any sk.
The main purposes of this map are mostly:
1. Define the size of a "sk-local-storage" type.
2. Provide a similar syscall userspace API as the map (e.g. lookup/update,
map-id, map-btf...etc.)
3. Keep track of all sk's storages of this "type" and clean them up
when the map is freed.
sk->sk_bpf_storage:
------------------
The main lookup/update/delete is done on sk->sk_bpf_storage (which
is a "struct bpf_sk_storage"). When doing a lookup,
the "map" pointer is now used as the "key" to search on the
sk_storage->list. The "map" pointer is actually serving
as the "type" of the "sk-local-storage" that is being
requested.
To allow very fast lookup, it should be as fast as looking up an
array at a stable-offset. At the same time, it is not ideal to
set a hard limit on the number of sk-local-storage "type" that the
system can have. Hence, this patch takes a cache approach.
The last search result from sk_storage->list is cached in
sk_storage->cache[] which is a stable sized array. Each
"sk-local-storage" type has a stable offset to the cache[] array.
In the future, a map's flag could be introduced to do cache
opt-out/enforcement if it became necessary.
The cache size is 16 (i.e. 16 types of "sk-local-storage").
Programs can share map. On the program side, having a few bpf_progs
running in the networking hotpath is already a lot. The bpf_prog
should have already consolidated the existing sock-key-ed map usage
to minimize the map lookup penalty. 16 has enough runway to grow.
All sk-local-storage data will be removed from sk->sk_bpf_storage
during sk destruction.
bpf_sk_storage_get() and bpf_sk_storage_delete():
------------------------------------------------
Instead of using bpf_map_(lookup|update|delete)_elem(),
the bpf prog needs to use the new helper bpf_sk_storage_get() and
bpf_sk_storage_delete(). The verifier can then enforce the
ARG_PTR_TO_SOCKET argument. The bpf_sk_storage_get() also allows to
"create" new elem if one does not exist in the sk. It is done by
the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value can also be
provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE.
The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock. Together,
it has eliminated the potential use cases for an equivalent
bpf_map_update_elem() API (for bpf_prog) in this patch.
Misc notes:
----------
1. map_get_next_key is not supported. From the userspace syscall
perspective, the map has the socket fd as the key while the map
can be shared by pinned-file or map-id.
Since btf is enforced, the existing "ss" could be enhanced to pretty
print the local-storage.
Supporting a kernel defined btf with 4 tuples as the return key could
be explored later also.
2. The sk->sk_lock cannot be acquired. Atomic operations is used instead.
e.g. cmpxchg is done on the sk->sk_bpf_storage ptr.
Please refer to the source code comments for the details in
synchronization cases and considerations.
3. The mem is charged to the sk->sk_omem_alloc as the sk filter does.
Benchmark:
---------
Here is the benchmark data collected by turning on
the "kernel.bpf_stats_enabled" sysctl.
Two bpf progs are tested:
One bpf prog with the usual bpf hashmap (max_entries = 8192) with the
sk ptr as the key. (verifier is modified to support sk ptr as the key
That should have shortened the key lookup time.)
Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE.
Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for
each egress skb and then bump the cnt. netperf is used to drive
data with 4096 connected UDP sockets.
BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run)
27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633
loaded_at 2019-04-15T13:46:39-0700 uid 0
xlated 344B jited 258B memlock 4096B map_ids 16
btf_id 5
BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run)
30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739
loaded_at 2019-04-15T13:47:54-0700 uid 0
xlated 168B jited 156B memlock 4096B map_ids 17
btf_id 6
Here is a high-level picture on how are the objects organized:
sk
┌──────┐
│ │
│ │
│ │
│*sk_bpf_storage─────▶ bpf_sk_storage
└──────┘ ┌───────┐
┌───────────┤ list │
│ │ │
│ │ │
│ │ │
│ └───────┘
│
│ elem
│ ┌────────┐
├─▶│ snode │
│ ├────────┤
│ │ data │ bpf_map
│ ├────────┤ ┌─────────┐
│ │map_node│◀─┬─────┤ list │
│ └────────┘ │ │ │
│ │ │ │
│ elem │ │ │
│ ┌────────┐ │ └─────────┘
└─▶│ snode │ │
├────────┤ │
bpf_map │ data │ │
┌─────────┐ ├────────┤ │
│ list ├───────▶│map_node│ │
│ │ └────────┘ │
│ │ │
│ │ elem │
└─────────┘ ┌────────┐ │
┌─▶│ snode │ │
│ ├────────┤ │
│ │ data │ │
│ ├────────┤ │
│ │map_node│◀─┘
│ └────────┘
│
│
│ ┌───────┐
sk └──────────│ list │
┌──────┐ │ │
│ │ │ │
│ │ │ │
│ │ └───────┘
│*sk_bpf_storage───────▶bpf_sk_storage
└──────┘
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2019-04-27 02:39:39 +03:00
# ifdef CONFIG_BPF_SYSCALL
bpf_sk_storage_free ( sk ) ;
# endif
2005-04-17 02:20:36 +04:00
if ( atomic_read ( & sk - > sk_omem_alloc ) )
2012-05-16 23:58:40 +04:00
pr_debug ( " %s: optmem leakage (%d bytes) detected \n " ,
__func__ , atomic_read ( & sk - > sk_omem_alloc ) ) ;
2005-04-17 02:20:36 +04:00
2017-03-15 23:21:28 +03:00
if ( sk - > sk_frag . page ) {
put_page ( sk - > sk_frag . page ) ;
sk - > sk_frag . page = NULL ;
}
2021-09-30 01:57:50 +03:00
/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
put_cred ( sk - > sk_peer_cred ) ;
2010-06-13 07:30:14 +04:00
put_pid ( sk - > sk_peer_pid ) ;
2021-09-30 01:57:50 +03:00
2015-05-09 05:10:31 +03:00
if ( likely ( sk - > sk_net_refcnt ) )
put_net ( sock_net ( sk ) ) ;
2007-11-01 10:33:50 +03:00
sk_prot_free ( sk - > sk_prot_creator , sk ) ;
2005-04-17 02:20:36 +04:00
}
net: No more expensive sock_hold()/sock_put() on each tx
One of the problems with sock memory accounting is that it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement the initial offset and atomically check if any packets
are in flight.
skb_set_owner_w() doesn't call sock_hold() anymore
sock_wfree() doesn't call sock_put() anymore, but checks if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
2016-04-01 18:52:12 +03:00
/* Run the final socket teardown, either right away or after an RCU grace
 * period.
 *
 * The deferred path (call_rcu) is taken when SOCK_RCU_FREE is set on the
 * socket, or when the socket still had a reuseport callback attached; in
 * the latter case the socket is detached from its reuseport group first.
 * Otherwise __sk_destruct() is called synchronously.
 */
void sk_destruct ( struct sock * sk )
{
2019-09-28 02:00:31 +03:00
bool use_call_rcu = sock_flag ( sk , SOCK_RCU_FREE ) ;
if ( rcu_access_pointer ( sk - > sk_reuseport_cb ) ) {
reuseport_detach_sock ( sk ) ;
/* a detached reuseport socket always goes through call_rcu() */
use_call_rcu = true ;
}
if ( use_call_rcu )
2016-04-01 18:52:12 +03:00
call_rcu ( & sk - > sk_rcu , __sk_destruct ) ;
else
__sk_destruct ( & sk - > sk_rcu ) ;
}
2015-06-15 18:26:18 +03:00
/* Free path taken by sk_free() once sk_wmem_alloc has dropped to zero.
 *
 * For sockets that hold a netns reference (sk_net_refcnt set) the per-netns
 * in-use counter is decremented. If such a socket also has sock_diag
 * destroy listeners, the destruction is announced through
 * sock_diag_broadcast_destroy(); otherwise sk_destruct() is called directly.
 * NOTE(review): presumably sock_diag_broadcast_destroy() ends up destroying
 * the socket itself - confirm against its implementation.
 */
static void __sk_free ( struct sock * sk )
{
2017-12-14 16:51:58 +03:00
if ( likely ( sk - > sk_net_refcnt ) )
sock_inuse_add ( sock_net ( sk ) , - 1 ) ;
2018-05-18 14:47:55 +03:00
if ( unlikely ( sk - > sk_net_refcnt & & sock_diag_has_destroy_listeners ( sk ) ) )
2015-06-15 18:26:18 +03:00
sock_diag_broadcast_destroy ( sk ) ;
else
sk_destruct ( sk ) ;
}
net: No more expensive sock_hold()/sock_put() on each tx
One of the problems with sock memory accounting is that it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement the initial offset and atomically check if any packets
are in flight.
skb_set_owner_w() doesn't call sock_hold() anymore
sock_wfree() doesn't call sock_put() anymore, but checks if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
/* Drop one unit of sk_wmem_alloc and free the socket if that was the last.
 *
 * When the count does not reach zero nothing else happens here; the
 * comment in the body says sock_wfree() will call __sk_free() later.
 */
void sk_free ( struct sock * sk )
{
/*
2011-03-31 05:57:33 +04:00
* We subtract one from sk_wmem_alloc and can know if
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
* some packets are still in some tx queue .
* If not null , sock_wfree ( ) will call __sk_free ( sk ) later
*/
2017-06-30 13:08:00 +03:00
if ( refcount_dec_and_test ( & sk - > sk_wmem_alloc ) )
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
__sk_free ( sk ) ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sk_free ) ;
2005-04-17 02:20:36 +04:00
net/socket: use per af lockdep classes for sk queues
Currently the sock queue's spin locks get their lockdep
classes by the default init_spin_lock() initializer:
all socket families get - usually, see below - a single
class for rx, another specific class for tx, etc.
This can lead to false positive lockdep splats, as
reported by Andrey.
Moreover there are two separate initialization points
for the sock queues, one in sk_clone_lock() and one
in sock_init_data(), so that e.g. the rx queue lock
can get one of two possible, different classes, depending
on the socket being cloned or not.
This change tries to address the above, setting explicitly
a per address family lockdep class for each queue's
spinlock. Also, move the duplicated initialization code to a
single location.
v1 -> v2:
- renamed the init helper
rfc -> v1:
- no changes, tested with several different workloads
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 15:54:08 +03:00
/* Initialise the queue heads and callback lock of a socket.
 *
 * Sets up the receive, write and error skb queues and sk_callback_lock,
 * then gives each of the four locks a lockdep class and name selected by
 * the socket's address family (sk->sk_family indexes the af_*_keys and
 * af_family_*_key_strings tables).
 */
static void sk_init_common ( struct sock * sk )
{
skb_queue_head_init ( & sk - > sk_receive_queue ) ;
skb_queue_head_init ( & sk - > sk_write_queue ) ;
skb_queue_head_init ( & sk - > sk_error_queue ) ;
rwlock_init ( & sk - > sk_callback_lock ) ;
/* receive queue lock */
lockdep_set_class_and_name ( & sk - > sk_receive_queue . lock ,
af_rlock_keys + sk - > sk_family ,
af_family_rlock_key_strings [ sk - > sk_family ] ) ;
/* write queue lock */
lockdep_set_class_and_name ( & sk - > sk_write_queue . lock ,
af_wlock_keys + sk - > sk_family ,
af_family_wlock_key_strings [ sk - > sk_family ] ) ;
/* error queue lock */
lockdep_set_class_and_name ( & sk - > sk_error_queue . lock ,
af_elock_keys + sk - > sk_family ,
af_family_elock_key_strings [ sk - > sk_family ] ) ;
/* callback rwlock */
lockdep_set_class_and_name ( & sk - > sk_callback_lock ,
af_callback_keys + sk - > sk_family ,
af_family_clock_key_strings [ sk - > sk_family ] ) ;
}
2011-11-09 02:07:07 +04:00
/**
 * sk_clone_lock - clone a socket, and lock its clone
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * The clone is allocated from the parent's current sk_prot, copied from
 * @sk with sock_copy(), and then has its per-socket state reset (queues,
 * memory accounting, backlog, error fields, sk_wq, socket back-pointer).
 * The parent's filter is charged to the clone, and BPF sk storage is
 * cloned as well.
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 * Return: the new socket, still locked with bh_lock_sock() and with
 * sk_refcnt set to 2, or NULL if allocation, filter charging, xfrm policy
 * cloning or BPF storage cloning failed. On those later failures the
 * partially built clone is released with sk_free_unlock_clone().
 */
struct sock * sk_clone_lock ( const struct sock * sk , const gfp_t priority )
2005-08-10 07:10:12 +04:00
{
2020-02-18 20:10:13 +03:00
struct proto * prot = READ_ONCE ( sk - > sk_prot ) ;
2021-01-27 18:27:31 +03:00
struct sk_filter * filter ;
2014-07-31 07:34:12 +04:00
bool is_charged = true ;
2021-01-27 18:27:31 +03:00
struct sock * newsk ;
2005-08-10 07:10:12 +04:00
2020-02-18 20:10:13 +03:00
newsk = sk_prot_alloc ( prot , priority , sk - > sk_family ) ;
2021-01-27 18:27:31 +03:00
if ( ! newsk )
goto out ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
sock_copy ( newsk , sk ) ;
net: Set sk_prot_creator when cloning sockets to the right proto
sk->sk_prot and sk->sk_prot_creator can differ when the app uses
IPV6_ADDRFORM (transforming an IPv6-socket to an IPv4-one).
Which is why sk_prot_creator is there to make sure that sk_prot_free()
does the kmem_cache_free() on the right kmem_cache slab.
Now, if such a socket gets transformed back to a listening socket (using
connect() with AF_UNSPEC) we will allocate an IPv4 tcp_sock through
sk_clone_lock() when a new connection comes in. But sk_prot_creator will
still point to the IPv6 kmem_cache (as everything got copied in
sk_clone_lock()). When freeing, we will thus put this
memory back into the IPv6 kmem_cache although it was allocated in the
IPv4 cache. I have seen memory corruption happening because of this.
With slub-debugging and MEMCG_KMEM enabled this gives the warning
"cache_from_obj: Wrong slab cache. TCPv6 but object is from TCP"
A C-program to trigger this:
void main(void)
{
int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP);
int new_fd, newest_fd, client_fd;
struct sockaddr_in6 bind_addr;
struct sockaddr_in bind_addr4, client_addr1, client_addr2;
struct sockaddr unsp;
int val;
memset(&bind_addr, 0, sizeof(bind_addr));
bind_addr.sin6_family = AF_INET6;
bind_addr.sin6_port = ntohs(42424);
memset(&client_addr1, 0, sizeof(client_addr1));
client_addr1.sin_family = AF_INET;
client_addr1.sin_port = ntohs(42424);
client_addr1.sin_addr.s_addr = inet_addr("127.0.0.1");
memset(&client_addr2, 0, sizeof(client_addr2));
client_addr2.sin_family = AF_INET;
client_addr2.sin_port = ntohs(42421);
client_addr2.sin_addr.s_addr = inet_addr("127.0.0.1");
memset(&unsp, 0, sizeof(unsp));
unsp.sa_family = AF_UNSPEC;
bind(fd, (struct sockaddr *)&bind_addr, sizeof(bind_addr));
listen(fd, 5);
client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
connect(client_fd, (struct sockaddr *)&client_addr1, sizeof(client_addr1));
new_fd = accept(fd, NULL, NULL);
close(fd);
val = AF_INET;
setsockopt(new_fd, SOL_IPV6, IPV6_ADDRFORM, &val, sizeof(val));
connect(new_fd, &unsp, sizeof(unsp));
memset(&bind_addr4, 0, sizeof(bind_addr4));
bind_addr4.sin_family = AF_INET;
bind_addr4.sin_port = ntohs(42421);
bind(new_fd, (struct sockaddr *)&bind_addr4, sizeof(bind_addr4));
listen(new_fd, 5);
client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
connect(client_fd, (struct sockaddr *)&client_addr2, sizeof(client_addr2));
newest_fd = accept(new_fd, NULL, NULL);
close(new_fd);
close(client_fd);
close(new_fd);
}
As far as I can see, this bug has been there since the beginning of the
git-days.
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-27 03:38:50 +03:00
2021-01-27 18:27:31 +03:00
newsk - > sk_prot_creator = prot ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
/* SANITY */
2021-11-15 13:16:56 +03:00
if ( likely ( newsk - > sk_net_refcnt ) ) {
2021-01-27 18:27:31 +03:00
get_net ( sock_net ( newsk ) ) ;
2021-11-15 13:16:56 +03:00
sock_inuse_add ( sock_net ( newsk ) , 1 ) ;
}
2021-01-27 18:27:31 +03:00
sk_node_init ( & newsk - > sk_node ) ;
sock_lock_init ( newsk ) ;
bh_lock_sock ( newsk ) ;
newsk - > sk_backlog . head = newsk - > sk_backlog . tail = NULL ;
newsk - > sk_backlog . len = 0 ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
atomic_set ( & newsk - > sk_rmem_alloc , 0 ) ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
refcount_set ( & newsk - > sk_wmem_alloc , 1 ) ;
2020-03-10 08:16:06 +03:00
2021-01-27 18:27:31 +03:00
atomic_set ( & newsk - > sk_omem_alloc , 0 ) ;
sk_init_common ( newsk ) ;
2020-03-10 08:16:06 +03:00
2021-01-27 18:27:31 +03:00
newsk - > sk_dst_cache = NULL ;
newsk - > sk_dst_pending_confirm = 0 ;
newsk - > sk_wmem_queued = 0 ;
newsk - > sk_forward_alloc = 0 ;
2021-09-29 20:25:11 +03:00
newsk - > sk_reserved_mem = 0 ;
2021-01-27 18:27:31 +03:00
atomic_set ( & newsk - > sk_drops , 0 ) ;
newsk - > sk_send_head = NULL ;
newsk - > sk_userlocks = sk - > sk_userlocks & ~ SOCK_BINDPORT_LOCK ;
atomic_set ( & newsk - > sk_zckey , 0 ) ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
sock_reset_flag ( newsk , SOCK_DONE ) ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
/* sk->sk_memcg will be populated at accept() time */
newsk - > sk_memcg = NULL ;
2019-08-14 20:37:49 +03:00
2021-01-27 18:27:31 +03:00
cgroup_sk_clone ( & newsk - > sk_cgrp_data ) ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
rcu_read_lock ( ) ;
filter = rcu_dereference ( sk - > sk_filter ) ;
if ( filter ! = NULL )
/* though it's an empty new sock, the charging may fail
* if sysctl_optmem_max was changed between creation of
* original socket and cloning
*/
is_charged = sk_filter_charge ( newsk , filter ) ;
RCU_INIT_POINTER ( newsk - > sk_filter , filter ) ;
rcu_read_unlock ( ) ;
if ( unlikely ( ! is_charged | | xfrm_sk_clone_policy ( newsk , sk ) ) ) {
/* We need to make sure that we don't uncharge the new
* socket if we couldn ' t charge it in the first place
* as otherwise we uncharge the parent ' s filter .
net, sk_msg: Clear sk_user_data pointer on clone if tagged
sk_user_data can hold a pointer to an object that is not intended to be
shared between the parent socket and the child that gets a pointer copy on
clone. This is the case when sk_user_data points at reference-counted
object, like struct sk_psock.
One way to resolve it is to tag the pointer with a no-copy flag by
repurposing its lowest bit. Based on the bit-flag value we clear the child
sk_user_data pointer after cloning the parent socket.
The no-copy flag is stored in the pointer itself as opposed to externally,
say in socket flags, to guarantee that the pointer and the flag are copied
from parent to child socket in an atomic fashion. Parent socket state is
subject to change while copying, we don't hold any locks at that time.
This approach relies on an assumption that sk_user_data holds a pointer to
an object aligned at least 2 bytes. A manual audit of existing users of
rcu_dereference_sk_user_data helper confirms our assumption.
Also, an RCU-protected sk_user_data is not likely to hold a pointer to a
char value or a pathological case of "struct { char c; }". To be safe, warn
when the flag-bit is set when setting sk_user_data to catch any future
misuses.
It is worth considering why clearing sk_user_data unconditionally is not an
option. There exist users, DRBD, NVMe, and Xen drivers being among them,
that rely on the pointer being copied when cloning the listening socket.
Potentially we could distinguish these users by checking if the listening
socket has been created in kernel-space via sock_create_kern, and hence has
sk_kern_sock flag set. However, this is not the case for NVMe and Xen
drivers, which create sockets without marking them as belonging to the
kernel.
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200218171023.844439-3-jakub@cloudflare.com
2020-02-18 20:10:14 +03:00
*/
2021-01-27 18:27:31 +03:00
if ( ! is_charged )
RCU_INIT_POINTER ( newsk - > sk_filter , NULL ) ;
sk_free_unlock_clone ( newsk ) ;
newsk = NULL ;
goto out ;
}
RCU_INIT_POINTER ( newsk - > sk_reuseport_cb , NULL ) ;
net, sk_msg: Clear sk_user_data pointer on clone if tagged
sk_user_data can hold a pointer to an object that is not intended to be
shared between the parent socket and the child that gets a pointer copy on
clone. This is the case when sk_user_data points at reference-counted
object, like struct sk_psock.
One way to resolve it is to tag the pointer with a no-copy flag by
repurposing its lowest bit. Based on the bit-flag value we clear the child
sk_user_data pointer after cloning the parent socket.
The no-copy flag is stored in the pointer itself as opposed to externally,
say in socket flags, to guarantee that the pointer and the flag are copied
from parent to child socket in an atomic fashion. Parent socket state is
subject to change while copying, we don't hold any locks at that time.
This approach relies on an assumption that sk_user_data holds a pointer to
an object aligned at least 2 bytes. A manual audit of existing users of
rcu_dereference_sk_user_data helper confirms our assumption.
Also, an RCU-protected sk_user_data is not likely to hold a pointer to a
char value or a pathological case of "struct { char c; }". To be safe, warn
when the flag-bit is set when setting sk_user_data to catch any future
misuses.
It is worth considering why clearing sk_user_data unconditionally is not an
option. There exist users, DRBD, NVMe, and Xen drivers being among them,
that rely on the pointer being copied when cloning the listening socket.
Potentially we could distinguish these users by checking if the listening
socket has been created in kernel-space via sock_create_kern, and hence has
sk_kern_sock flag set. However, this is not the case for NVMe and Xen
drivers, which create sockets without marking them as belonging to the
kernel.
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200218171023.844439-3-jakub@cloudflare.com
2020-02-18 20:10:14 +03:00
2021-01-27 18:27:31 +03:00
if ( bpf_sk_storage_clone ( sk , newsk ) ) {
sk_free_unlock_clone ( newsk ) ;
newsk = NULL ;
goto out ;
}
2016-09-20 00:44:38 +03:00
2021-01-27 18:27:31 +03:00
/* Clear sk_user_data if parent had the pointer tagged
* as not suitable for copying when cloning .
*/
if ( sk_user_data_is_nocopy ( newsk ) )
newsk - > sk_user_data = NULL ;
newsk - > sk_err = 0 ;
newsk - > sk_err_soft = 0 ;
newsk - > sk_priority = 0 ;
newsk - > sk_incoming_cpu = raw_smp_processor_id ( ) ;
/* Before updating sk_refcnt, we must commit prior changes to memory
* ( Documentation / RCU / rculist_nulls . rst for details )
*/
smp_wmb ( ) ;
refcount_set ( & newsk - > sk_refcnt , 2 ) ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
/* Increment the counter in the same struct proto as the master
* sock ( sk_refcnt_debug_inc uses newsk - > sk_prot - > socks , that
* is the same as sk - > sk_prot - > socks , as this field was copied
* with memcpy ) .
*
* This _changes_ the previous behaviour , where
* tcp_create_openreq_child always was incrementing the
* equivalent to tcp_prot - > socks ( inet_sock_nr ) , so this have
* to be taken into account in all callers . - acme
*/
sk_refcnt_debug_inc ( newsk ) ;
sk_set_socket ( newsk , NULL ) ;
sk_tx_queue_clear ( newsk ) ;
RCU_INIT_POINTER ( newsk - > sk_wq , NULL ) ;
2005-08-10 07:10:12 +04:00
2021-01-27 18:27:31 +03:00
if ( newsk - > sk_prot - > sockets_allocated )
sk_sockets_allocated_inc ( newsk ) ;
2010-01-08 11:00:09 +03:00
2021-01-27 18:27:31 +03:00
if ( sock_needs_netstamp ( sk ) & & newsk - > sk_flags & SK_FLAGS_TIMESTAMP )
net_enable_timestamp ( ) ;
2005-08-10 07:10:12 +04:00
out :
return newsk ;
}
2011-11-09 02:07:07 +04:00
EXPORT_SYMBOL_GPL ( sk_clone_lock ) ;
2005-08-10 07:10:12 +04:00
2017-03-01 22:35:08 +03:00
/* Release a clone that is being abandoned while still locked.
 *
 * Clears the destructor hook, drops the bh lock and then frees the socket
 * through sk_free(). The argument must currently be locked with
 * bh_lock_sock(), since it is unlocked here.
 */
void sk_free_unlock_clone ( struct sock * sk )
{
/* It is still raw copy of parent, so invalidate
 * destructor and make plain sk_free() */
sk - > sk_destruct = NULL ;
bh_unlock_sock ( sk ) ;
sk_free ( sk ) ;
}
EXPORT_SYMBOL_GPL ( sk_free_unlock_clone ) ;
2007-04-21 04:12:43 +04:00
void sk_setup_caps ( struct sock * sk , struct dst_entry * dst )
{
2015-05-26 18:55:28 +03:00
u32 max_segs = 1 ;
2015-12-03 08:53:57 +03:00
sk_dst_set ( sk , dst ) ;
2021-11-15 22:02:34 +03:00
sk - > sk_route_caps = dst - > dev - > features ;
if ( sk_is_tcp ( sk ) )
sk - > sk_route_caps | = NETIF_F_GSO ;
2007-04-21 04:12:43 +04:00
if ( sk - > sk_route_caps & NETIF_F_GSO )
2007-06-01 09:15:50 +04:00
sk - > sk_route_caps | = NETIF_F_GSO_SOFTWARE ;
2021-11-15 22:02:35 +03:00
if ( unlikely ( sk - > sk_gso_disabled ) )
sk - > sk_route_caps & = ~ NETIF_F_GSO_MASK ;
2007-04-21 04:12:43 +04:00
if ( sk_can_gso ( sk ) ) {
2017-08-01 12:49:10 +03:00
if ( dst - > header_len & & ! xfrm_dst_offload_ok ( dst ) ) {
2007-04-21 04:12:43 +04:00
sk - > sk_route_caps & = ~ NETIF_F_GSO_MASK ;
[NET]: Add per-connection option to set max TSO frame size
Update: My mailer ate one of Jarek's feedback mails... Fixed the
parameter in netif_set_gso_max_size() to be u32, not u16. Fixed the
whitespace issue due to a patch import botch. Changed the types from
u32 to unsigned int to be more consistent with other variables in the
area. Also brought the patch up to the latest net-2.6.26 tree.
Update: Made gso_max_size container 32 bits, not 16. Moved the
location of gso_max_size within netdev to be less hotpath. Made more
consistent names between the sock and netdev layers, and added a
define for the max GSO size.
Update: Respun for net-2.6.26 tree.
Update: changed max_gso_frame_size and sk_gso_max_size from signed to
unsigned - thanks Stephen!
This patch adds the ability for device drivers to control the size of
the TSO frames being sent to them, per TCP connection. By setting the
netdevice's gso_max_size value, the socket layer will set the GSO
frame size based on that value. This will propogate into the TCP
layer, and send TSO's of that size to the hardware.
This can be desirable to help tune the bursty nature of TSO on a
per-adapter basis, where one may have 1 GbE and 10 GbE devices
coexisting in a system, one running multiqueue and the other not, etc.
This can also be desirable for devices that cannot support full 64 KB
TSO's, but still want to benefit from some level of segmentation
offloading.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-03-21 13:43:19 +03:00
} else {
2007-04-21 04:12:43 +04:00
sk - > sk_route_caps | = NETIF_F_SG | NETIF_F_HW_CSUM ;
2021-11-19 18:43:31 +03:00
/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
sk - > sk_gso_max_size = READ_ONCE ( dst - > dev - > gso_max_size ) ;
2021-11-19 18:43:32 +03:00
/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
max_segs = max_t ( u32 , READ_ONCE ( dst - > dev - > gso_max_segs ) , 1 ) ;
[NET]: Add per-connection option to set max TSO frame size
Update: My mailer ate one of Jarek's feedback mails... Fixed the
parameter in netif_set_gso_max_size() to be u32, not u16. Fixed the
whitespace issue due to a patch import botch. Changed the types from
u32 to unsigned int to be more consistent with other variables in the
area. Also brought the patch up to the latest net-2.6.26 tree.
Update: Made gso_max_size container 32 bits, not 16. Moved the
location of gso_max_size within netdev to be less hotpath. Made more
consistent names between the sock and netdev layers, and added a
define for the max GSO size.
Update: Respun for net-2.6.26 tree.
Update: changed max_gso_frame_size and sk_gso_max_size from signed to
unsigned - thanks Stephen!
This patch adds the ability for device drivers to control the size of
the TSO frames being sent to them, per TCP connection. By setting the
netdevice's gso_max_size value, the socket layer will set the GSO
frame size based on that value. This will propogate into the TCP
layer, and send TSO's of that size to the hardware.
This can be desirable to help tune the bursty nature of TSO on a
per-adapter basis, where one may have 1 GbE and 10 GbE devices
coexisting in a system, one running multiqueue and the other not, etc.
This can also be desirable for devices that cannot support full 64 KB
TSO's, but still want to benefit from some level of segmentation
offloading.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-03-21 13:43:19 +03:00
}
2007-04-21 04:12:43 +04:00
}
2015-05-26 18:55:28 +03:00
sk - > sk_gso_max_segs = max_segs ;
2007-04-21 04:12:43 +04:00
}
EXPORT_SYMBOL_GPL ( sk_setup_caps ) ;
2005-04-17 02:20:36 +04:00
/*
* Simple resource managers for sockets .
*/
2007-02-09 17:24:36 +03:00
/*
* Write buffer destructor automatically called from kfree_skb .
2005-04-17 02:20:36 +04:00
*/
void sock_wfree ( struct sk_buff * skb )
{
struct sock * sk = skb - > sk ;
2009-09-24 14:49:24 +04:00
unsigned int len = skb - > truesize ;
2005-04-17 02:20:36 +04:00
2009-09-24 14:49:24 +04:00
if ( ! sock_flag ( sk , SOCK_USE_WRITE_QUEUE ) ) {
/*
* Keep a reference on sk_wmem_alloc , this will be released
* after sk_write_space ( ) call
*/
2017-06-30 13:08:00 +03:00
WARN_ON ( refcount_sub_and_test ( len - 1 , & sk - > sk_wmem_alloc ) ) ;
2005-04-17 02:20:36 +04:00
sk - > sk_write_space ( sk ) ;
2009-09-24 14:49:24 +04:00
len = 1 ;
}
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
/*
2009-09-24 14:49:24 +04:00
* if sk_wmem_alloc reaches 0 , we must finish what sk_free ( )
* could not do because of in - flight packets
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
*/
2017-06-30 13:08:00 +03:00
if ( refcount_sub_and_test ( len , & sk - > sk_wmem_alloc ) )
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
__sk_free ( sk ) ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_wfree ) ;
2005-04-17 02:20:36 +04:00
2016-05-02 20:56:27 +03:00
/* This variant of sock_wfree() is used by TCP,
* since it sets SOCK_USE_WRITE_QUEUE .
*/
void __sock_wfree ( struct sk_buff * skb )
{
struct sock * sk = skb - > sk ;
2017-06-30 13:08:00 +03:00
if ( refcount_sub_and_test ( skb - > truesize , & sk - > sk_wmem_alloc ) )
2016-05-02 20:56:27 +03:00
__sk_free ( sk ) ;
}
2015-11-02 02:36:55 +03:00
void skb_set_owner_w ( struct sk_buff * skb , struct sock * sk )
{
skb_orphan ( skb ) ;
skb - > sk = sk ;
# ifdef CONFIG_INET
if ( unlikely ( ! sk_fullsock ( sk ) ) ) {
skb - > destructor = sock_edemux ;
sock_hold ( sk ) ;
return ;
}
# endif
skb - > destructor = sock_wfree ;
skb_set_hash_from_sk ( skb , sk ) ;
/*
* We used to take a refcount on sk , but following operation
* is enough to guarantee sk_free ( ) wont free this sock until
* all in - flight packets are completed
*/
2017-06-30 13:08:00 +03:00
refcount_add ( skb - > truesize , & sk - > sk_wmem_alloc ) ;
2015-11-02 02:36:55 +03:00
}
EXPORT_SYMBOL ( skb_set_owner_w ) ;
2019-08-08 03:03:59 +03:00
static bool can_skb_orphan_partial ( const struct sk_buff * skb )
{
# ifdef CONFIG_TLS_DEVICE
/* Drivers depend on in-order delivery for crypto offload,
* partial orphan breaks out - of - order - OK logic .
*/
if ( skb - > decrypted )
return false ;
# endif
return ( skb - > destructor = = sock_wfree | |
( IS_ENABLED ( CONFIG_INET ) & & skb - > destructor = = tcp_wfree ) ) ;
}
2016-05-02 20:56:27 +03:00
/* This helper is used by netem, as it can hold packets in its
* delay queue . We want to allow the owner socket to send more
* packets , as if they were already TX completed by a typical driver .
* But we also want to keep skb - > sk set because some packet schedulers
2017-05-12 01:24:41 +03:00
* rely on it ( sch_fq for example ) .
2016-05-02 20:56:27 +03:00
*/
2013-07-31 04:55:08 +04:00
void skb_orphan_partial ( struct sk_buff * skb )
{
2017-05-12 01:24:41 +03:00
if ( skb_is_tcp_pure_ack ( skb ) )
2016-05-02 20:56:27 +03:00
return ;
2021-05-11 11:35:21 +03:00
if ( can_skb_orphan_partial ( skb ) & & skb_set_owner_sk_safe ( skb , skb - > sk ) )
return ;
skb_orphan ( skb ) ;
2013-07-31 04:55:08 +04:00
}
EXPORT_SYMBOL ( skb_orphan_partial ) ;
2007-02-09 17:24:36 +03:00
/*
* Read buffer destructor automatically called from kfree_skb .
2005-04-17 02:20:36 +04:00
*/
void sock_rfree ( struct sk_buff * skb )
{
struct sock * sk = skb - > sk ;
2010-07-11 02:45:17 +04:00
unsigned int len = skb - > truesize ;
2005-04-17 02:20:36 +04:00
2010-07-11 02:45:17 +04:00
atomic_sub ( len , & sk - > sk_rmem_alloc ) ;
sk_mem_uncharge ( sk , len ) ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_rfree ) ;
2005-04-17 02:20:36 +04:00
2015-03-10 21:03:46 +03:00
/*
* Buffer destructor for skbs that are not used directly in read or write
* path , e . g . for error handler skbs . Automatically called from kfree_skb .
*/
2014-09-04 21:31:35 +04:00
void sock_efree ( struct sk_buff * skb )
{
sock_put ( skb - > sk ) ;
}
EXPORT_SYMBOL ( sock_efree ) ;
2020-03-30 01:53:38 +03:00
/* Destructor for the prefetch/receive path, where a reference count may
 * not be held on the socket (e.g. for listen sockets).  The reference is
 * dropped with sock_gen_put() only when sk_is_refcounted() says one is
 * held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2012-05-24 03:16:53 +04:00
kuid_t sock_i_uid ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
2012-05-24 03:16:53 +04:00
kuid_t uid ;
2005-04-17 02:20:36 +04:00
2010-09-22 16:43:39 +04:00
read_lock_bh ( & sk - > sk_callback_lock ) ;
2012-05-24 03:16:53 +04:00
uid = sk - > sk_socket ? SOCK_INODE ( sk - > sk_socket ) - > i_uid : GLOBAL_ROOT_UID ;
2010-09-22 16:43:39 +04:00
read_unlock_bh ( & sk - > sk_callback_lock ) ;
2005-04-17 02:20:36 +04:00
return uid ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_i_uid ) ;
2005-04-17 02:20:36 +04:00
unsigned long sock_i_ino ( struct sock * sk )
{
unsigned long ino ;
2010-09-22 16:43:39 +04:00
read_lock_bh ( & sk - > sk_callback_lock ) ;
2005-04-17 02:20:36 +04:00
ino = sk - > sk_socket ? SOCK_INODE ( sk - > sk_socket ) - > i_ino : 0 ;
2010-09-22 16:43:39 +04:00
read_unlock_bh ( & sk - > sk_callback_lock ) ;
2005-04-17 02:20:36 +04:00
return ino ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_i_ino ) ;
2005-04-17 02:20:36 +04:00
/*
* Allocate a skb from the socket ' s send buffer .
*/
2005-07-09 01:57:47 +04:00
struct sk_buff * sock_wmalloc ( struct sock * sk , unsigned long size , int force ,
2005-10-07 10:46:04 +04:00
gfp_t priority )
2005-04-17 02:20:36 +04:00
{
2019-10-11 06:17:45 +03:00
if ( force | |
refcount_read ( & sk - > sk_wmem_alloc ) < READ_ONCE ( sk - > sk_sndbuf ) ) {
2009-05-27 15:30:05 +04:00
struct sk_buff * skb = alloc_skb ( size , priority ) ;
2019-10-11 06:17:45 +03:00
2005-04-17 02:20:36 +04:00
if ( skb ) {
skb_set_owner_w ( skb , sk ) ;
return skb ;
}
}
return NULL ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_wmalloc ) ;
2005-04-17 02:20:36 +04:00
2017-08-03 23:29:37 +03:00
static void sock_ofree ( struct sk_buff * skb )
{
struct sock * sk = skb - > sk ;
atomic_sub ( skb - > truesize , & sk - > sk_omem_alloc ) ;
}
/**
 * sock_omalloc - allocate an skb charged to the socket's option memory
 * @sk: socket whose sk_omem_alloc counter is charged
 * @size: requested data size, passed to alloc_skb()
 * @priority: allocation flags
 *
 * The request is refused when the current sk_omem_alloc plus
 * SKB_TRUESIZE(@size) would exceed sysctl_optmem_max.  On success the
 * skb's truesize is added to sk_omem_alloc and sock_ofree is installed as
 * destructor so the charge is returned when the skb is freed.
 *
 * Return: the new skb, or NULL if the limit is hit or allocation fails.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/*
	 * small safe race: SKB_TRUESIZE may differ from final skb->truesize.
	 * sysctl_optmem_max can be written concurrently, hence the marked
	 * lockless read.
	 */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    READ_ONCE(sysctl_optmem_max))
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}
2007-02-09 17:24:36 +03:00
/*
2005-04-17 02:20:36 +04:00
* Allocate a memory block from the socket ' s option memory buffer .
2007-02-09 17:24:36 +03:00
*/
2005-10-07 10:46:04 +04:00
void * sock_kmalloc ( struct sock * sk , int size , gfp_t priority )
2005-04-17 02:20:36 +04:00
{
2012-04-15 09:58:06 +04:00
if ( ( unsigned int ) size < = sysctl_optmem_max & &
2005-04-17 02:20:36 +04:00
atomic_read ( & sk - > sk_omem_alloc ) + size < sysctl_optmem_max ) {
void * mem ;
/* First do the add, to avoid the race if kmalloc
2007-02-09 17:24:36 +03:00
* might sleep .
2005-04-17 02:20:36 +04:00
*/
atomic_add ( size , & sk - > sk_omem_alloc ) ;
mem = kmalloc ( size , priority ) ;
if ( mem )
return mem ;
atomic_sub ( size , & sk - > sk_omem_alloc ) ;
}
return NULL ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_kmalloc ) ;
2005-04-17 02:20:36 +04:00
2014-11-19 19:13:11 +03:00
/* Free an option memory block. Note, we actually want the inline
* here as this allows gcc to detect the nullify and fold away the
* condition entirely .
2005-04-17 02:20:36 +04:00
*/
2014-11-19 19:13:11 +03:00
static inline void __sock_kfree_s ( struct sock * sk , void * mem , int size ,
const bool nullify )
2005-04-17 02:20:36 +04:00
{
2014-10-15 01:02:37 +04:00
if ( WARN_ON_ONCE ( ! mem ) )
return ;
2014-11-19 19:13:11 +03:00
if ( nullify )
2020-08-07 09:18:13 +03:00
kfree_sensitive ( mem ) ;
2014-11-19 19:13:11 +03:00
else
kfree ( mem ) ;
2005-04-17 02:20:36 +04:00
atomic_sub ( size , & sk - > sk_omem_alloc ) ;
}
2014-11-19 19:13:11 +03:00
void sock_kfree_s ( struct sock * sk , void * mem , int size )
{
__sock_kfree_s ( sk , mem , size , false ) ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_kfree_s ) ;
2005-04-17 02:20:36 +04:00
2014-11-19 19:13:11 +03:00
void sock_kzfree_s ( struct sock * sk , void * mem , int size )
{
__sock_kfree_s ( sk , mem , size , true ) ;
}
EXPORT_SYMBOL ( sock_kzfree_s ) ;
2005-04-17 02:20:36 +04:00
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
I think , these locks should be removed for datagram sockets .
*/
2009-05-27 15:30:05 +04:00
static long sock_wait_for_wmem ( struct sock * sk , long timeo )
2005-04-17 02:20:36 +04:00
{
DEFINE_WAIT ( wait ) ;
2015-11-30 07:03:10 +03:00
sk_clear_bit ( SOCKWQ_ASYNC_NOSPACE , sk ) ;
2005-04-17 02:20:36 +04:00
for ( ; ; ) {
if ( ! timeo )
break ;
if ( signal_pending ( current ) )
break ;
set_bit ( SOCK_NOSPACE , & sk - > sk_socket - > flags ) ;
2010-04-20 17:03:51 +04:00
prepare_to_wait ( sk_sleep ( sk ) , & wait , TASK_INTERRUPTIBLE ) ;
2019-10-11 06:17:45 +03:00
if ( refcount_read ( & sk - > sk_wmem_alloc ) < READ_ONCE ( sk - > sk_sndbuf ) )
2005-04-17 02:20:36 +04:00
break ;
if ( sk - > sk_shutdown & SEND_SHUTDOWN )
break ;
if ( sk - > sk_err )
break ;
timeo = schedule_timeout ( timeo ) ;
}
2010-04-20 17:03:51 +04:00
finish_wait ( sk_sleep ( sk ) , & wait ) ;
2005-04-17 02:20:36 +04:00
return timeo ;
}
/*
* Generic send / receive buffer handlers
*/
2009-02-05 03:55:54 +03:00
struct sk_buff * sock_alloc_send_pskb ( struct sock * sk , unsigned long header_len ,
unsigned long data_len , int noblock ,
2013-08-09 01:38:47 +04:00
int * errcode , int max_page_order )
2005-04-17 02:20:36 +04:00
{
2014-09-17 15:49:49 +04:00
struct sk_buff * skb ;
2005-04-17 02:20:36 +04:00
long timeo ;
int err ;
timeo = sock_sndtimeo ( sk , noblock ) ;
2014-09-17 15:49:49 +04:00
for ( ; ; ) {
2005-04-17 02:20:36 +04:00
err = sock_error ( sk ) ;
if ( err ! = 0 )
goto failure ;
err = - EPIPE ;
if ( sk - > sk_shutdown & SEND_SHUTDOWN )
goto failure ;
2019-10-11 06:17:45 +03:00
if ( sk_wmem_alloc_get ( sk ) < READ_ONCE ( sk - > sk_sndbuf ) )
2014-09-17 15:49:49 +04:00
break ;
2013-08-09 01:38:47 +04:00
2015-11-30 07:03:10 +03:00
sk_set_bit ( SOCKWQ_ASYNC_NOSPACE , sk ) ;
2014-09-17 15:49:49 +04:00
set_bit ( SOCK_NOSPACE , & sk - > sk_socket - > flags ) ;
err = - EAGAIN ;
if ( ! timeo )
2005-04-17 02:20:36 +04:00
goto failure ;
2014-09-17 15:49:49 +04:00
if ( signal_pending ( current ) )
goto interrupted ;
timeo = sock_wait_for_wmem ( sk , timeo ) ;
2005-04-17 02:20:36 +04:00
}
2014-09-17 15:49:49 +04:00
skb = alloc_skb_with_frags ( header_len , data_len , max_page_order ,
errcode , sk - > sk_allocation ) ;
if ( skb )
skb_set_owner_w ( skb , sk ) ;
2005-04-17 02:20:36 +04:00
return skb ;
interrupted :
err = sock_intr_errno ( timeo ) ;
failure :
* errcode = err ;
return NULL ;
}
2009-02-05 03:55:54 +03:00
EXPORT_SYMBOL ( sock_alloc_send_pskb ) ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:36 +03:00
/**
 * sock_alloc_send_skb - allocate a linear send skb
 * @sk: socket the skb is charged to
 * @size: header length handed to sock_alloc_send_pskb()
 * @noblock: passed through to sock_alloc_send_pskb()
 * @errcode: receives the error code on failure
 *
 * Shorthand for sock_alloc_send_pskb() with a data length of 0 and a
 * maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_alloc_send_skb ) ;
2005-04-17 02:20:36 +04:00
2016-04-03 06:08:06 +03:00
int __sock_cmsg_send ( struct sock * sk , struct msghdr * msg , struct cmsghdr * cmsg ,
struct sockcm_cookie * sockc )
{
2016-04-03 06:08:09 +03:00
u32 tsflags ;
2016-04-03 06:08:06 +03:00
switch ( cmsg - > cmsg_type ) {
case SO_MARK :
if ( ! ns_capable ( sock_net ( sk ) - > user_ns , CAP_NET_ADMIN ) )
return - EPERM ;
if ( cmsg - > cmsg_len ! = CMSG_LEN ( sizeof ( u32 ) ) )
return - EINVAL ;
sockc - > mark = * ( u32 * ) CMSG_DATA ( cmsg ) ;
break ;
2019-02-02 18:34:46 +03:00
case SO_TIMESTAMPING_OLD :
2016-04-03 06:08:09 +03:00
if ( cmsg - > cmsg_len ! = CMSG_LEN ( sizeof ( u32 ) ) )
return - EINVAL ;
tsflags = * ( u32 * ) CMSG_DATA ( cmsg ) ;
if ( tsflags & ~ SOF_TIMESTAMPING_TX_RECORD_MASK )
return - EINVAL ;
sockc - > tsflags & = ~ SOF_TIMESTAMPING_TX_RECORD_MASK ;
sockc - > tsflags | = tsflags ;
break ;
2018-07-04 01:42:48 +03:00
case SCM_TXTIME :
if ( ! sock_flag ( sk , SOCK_TXTIME ) )
return - EINVAL ;
if ( cmsg - > cmsg_len ! = CMSG_LEN ( sizeof ( u64 ) ) )
return - EINVAL ;
sockc - > transmit_time = get_unaligned ( ( u64 * ) CMSG_DATA ( cmsg ) ) ;
break ;
2016-07-11 23:51:26 +03:00
/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
case SCM_RIGHTS :
case SCM_CREDENTIALS :
break ;
2016-04-03 06:08:06 +03:00
default :
return - EINVAL ;
}
return 0 ;
}
EXPORT_SYMBOL ( __sock_cmsg_send ) ;
2015-10-09 00:56:48 +03:00
int sock_cmsg_send ( struct sock * sk , struct msghdr * msg ,
struct sockcm_cookie * sockc )
{
struct cmsghdr * cmsg ;
2016-04-03 06:08:06 +03:00
int ret ;
2015-10-09 00:56:48 +03:00
for_each_cmsghdr ( cmsg , msg ) {
if ( ! CMSG_OK ( msg , cmsg ) )
return - EINVAL ;
if ( cmsg - > cmsg_level ! = SOL_SOCKET )
continue ;
2016-04-03 06:08:06 +03:00
ret = __sock_cmsg_send ( sk , msg , cmsg , sockc ) ;
if ( ret )
return ret ;
2015-10-09 00:56:48 +03:00
}
return 0 ;
}
EXPORT_SYMBOL ( sock_cmsg_send ) ;
2017-06-07 23:29:12 +03:00
static void sk_enter_memory_pressure ( struct sock * sk )
{
if ( ! sk - > sk_prot - > enter_memory_pressure )
return ;
sk - > sk_prot - > enter_memory_pressure ( sk ) ;
}
static void sk_leave_memory_pressure ( struct sock * sk )
{
if ( sk - > sk_prot - > leave_memory_pressure ) {
sk - > sk_prot - > leave_memory_pressure ( sk ) ;
} else {
unsigned long * memory_pressure = sk - > sk_prot - > memory_pressure ;
2019-10-09 22:55:53 +03:00
if ( memory_pressure & & READ_ONCE ( * memory_pressure ) )
WRITE_ONCE ( * memory_pressure , 0 ) ;
2017-06-07 23:29:12 +03:00
}
}
2019-06-15 02:22:21 +03:00
DEFINE_STATIC_KEY_FALSE ( net_high_order_alloc_disable_key ) ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
2013-10-18 03:27:07 +04:00
/**
* skb_page_frag_refill - check that a page_frag contains enough room
* @ sz : minimum size of the fragment we want to get
* @ pfrag : pointer to page_frag
2014-09-08 15:00:00 +04:00
* @ gfp : priority for memory allocation
2013-10-18 03:27:07 +04:00
*
* Note : While this allocator tries to use high order pages , there is
* no guarantee that allocations succeed . Therefore , @ sz MUST be
* less or equal than PAGE_SIZE .
*/
2014-08-28 07:49:34 +04:00
bool skb_page_frag_refill ( unsigned int sz , struct page_frag * pfrag , gfp_t gfp )
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
{
if ( pfrag - > page ) {
2016-03-18 00:19:26 +03:00
if ( page_ref_count ( pfrag - > page ) = = 1 ) {
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
pfrag - > offset = 0 ;
return true ;
}
2013-10-18 03:27:07 +04:00
if ( pfrag - > offset + sz < = pfrag - > size )
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
return true ;
put_page ( pfrag - > page ) ;
}
2014-08-28 07:49:34 +04:00
pfrag - > offset = 0 ;
2019-06-15 02:22:21 +03:00
if ( SKB_FRAG_PAGE_ORDER & &
! static_branch_unlikely ( & net_high_order_alloc_disable_key ) ) {
2015-11-07 03:28:21 +03:00
/* Avoid direct reclaim but allow kswapd to wake */
pfrag - > page = alloc_pages ( ( gfp & ~ __GFP_DIRECT_RECLAIM ) |
__GFP_COMP | __GFP_NOWARN |
__GFP_NORETRY ,
2014-08-28 07:49:34 +04:00
SKB_FRAG_PAGE_ORDER ) ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
if ( likely ( pfrag - > page ) ) {
2014-08-28 07:49:34 +04:00
pfrag - > size = PAGE_SIZE < < SKB_FRAG_PAGE_ORDER ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
return true ;
}
2014-08-28 07:49:34 +04:00
}
pfrag - > page = alloc_page ( gfp ) ;
if ( likely ( pfrag - > page ) ) {
pfrag - > size = PAGE_SIZE ;
return true ;
}
2013-10-18 03:27:07 +04:00
return false ;
}
EXPORT_SYMBOL ( skb_page_frag_refill ) ;
/*
 * Refill the page fragment @pfrag on behalf of socket @sk.
 *
 * Delegates to skb_page_frag_refill() with sk->sk_allocation as the
 * allocation mask. 32U is passed as its first argument (presumably the
 * minimum room wanted in the fragment - TODO confirm against that helper).
 *
 * Returns true when the helper succeeds. On failure the socket enters
 * memory pressure, its send buffer is moderated and false is returned.
 */
bool sk_page_frag_refill ( struct sock * sk , struct page_frag * pfrag )
{
if ( likely ( skb_page_frag_refill ( 32U , pfrag , sk - > sk_allocation ) ) )
return true ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
/* Refill failed: flag memory pressure and moderate sndbuf before reporting it. */
sk_enter_memory_pressure ( sk ) ;
sk_stream_moderate_sndbuf ( sk ) ;
return false ;
}
EXPORT_SYMBOL ( sk_page_frag_refill ) ;
2020-11-27 13:10:22 +03:00
/*
 * Wait until the socket is no longer owned by a user context.
 *
 * Entered and left with sk->sk_lock.slock held (see the __releases and
 * __acquires annotations). The spinlock is dropped only around schedule().
 * The task waits exclusively on sk->sk_lock.wq in TASK_UNINTERRUPTIBLE state.
 */
void __lock_sock ( struct sock * sk )
2010-09-08 07:48:48 +04:00
__releases ( & sk - > sk_lock . slock )
__acquires ( & sk - > sk_lock . slock )
2005-04-17 02:20:36 +04:00
{
DEFINE_WAIT ( wait ) ;
2007-04-11 07:10:33 +04:00
for ( ; ; ) {
2005-04-17 02:20:36 +04:00
/* Queue on the wait queue before dropping the lock, then sleep. */
prepare_to_wait_exclusive ( & sk - > sk_lock . wq , & wait ,
TASK_UNINTERRUPTIBLE ) ;
spin_unlock_bh ( & sk - > sk_lock . slock ) ;
schedule ( ) ;
spin_lock_bh ( & sk - > sk_lock . slock ) ;
2007-04-11 07:10:33 +04:00
/* Ownership is re-checked with the spinlock held again. */
if ( ! sock_owned_by_user ( sk ) )
2005-04-17 02:20:36 +04:00
break ;
}
finish_wait ( & sk - > sk_lock . wq , & wait ) ;
}
2018-10-02 09:24:26 +03:00
/*
 * Process every skb queued on the socket backlog.
 *
 * Entered and left with sk->sk_lock.slock held. Each pass detaches the whole
 * backlog list, drops the spinlock, feeds the skbs one by one to
 * sk_backlog_rcv() and retakes the spinlock. The outer loop repeats while
 * new skbs were queued in the meantime.
 */
void __release_sock ( struct sock * sk )
2010-09-08 07:48:48 +04:00
__releases ( & sk - > sk_lock . slock )
__acquires ( & sk - > sk_lock . slock )
2005-04-17 02:20:36 +04:00
{
2016-04-30 00:16:52 +03:00
struct sk_buff * skb , * next ;
2005-04-17 02:20:36 +04:00
2016-04-30 00:16:52 +03:00
while ( ( skb = sk - > sk_backlog . head ) ! = NULL ) {
2005-04-17 02:20:36 +04:00
/* Detach the current list so it can be walked without the spinlock. */
sk - > sk_backlog . head = sk - > sk_backlog . tail = NULL ;
2016-04-30 00:16:52 +03:00
spin_unlock_bh ( & sk - > sk_lock . slock ) ;
2005-04-17 02:20:36 +04:00
2016-04-30 00:16:52 +03:00
do {
/* Save the successor first: the skb is unlinked before it is handed off. */
next = skb - > next ;
2012-04-30 20:07:09 +04:00
prefetch ( next ) ;
2010-05-12 03:19:48 +04:00
WARN_ON_ONCE ( skb_dst_is_noref ( skb ) ) ;
2018-07-30 06:42:53 +03:00
skb_mark_not_on_list ( skb ) ;
2008-10-08 01:18:42 +04:00
sk_backlog_rcv ( sk , skb ) ;
2005-04-17 02:20:36 +04:00
2016-04-30 00:16:52 +03:00
/* Offer a reschedule point between skbs. */
cond_resched ( ) ;
2005-04-17 02:20:36 +04:00
skb = next ;
} while ( skb ! = NULL ) ;
2016-04-30 00:16:52 +03:00
spin_lock_bh ( & sk - > sk_lock . slock ) ;
}
2010-03-04 21:01:40 +03:00
/*
 * Doing the zeroing here guarantees we cannot loop forever
 * while a wild producer attempts to flood us.
 */
sk - > sk_backlog . len = 0 ;
2005-04-17 02:20:36 +04:00
}
2016-04-30 00:16:53 +03:00
void __sk_flush_backlog ( struct sock * sk )
{
spin_lock_bh ( & sk - > sk_lock . slock ) ;
__release_sock ( sk ) ;
spin_unlock_bh ( & sk - > sk_lock . slock ) ;
}
2005-04-17 02:20:36 +04:00
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
2005-05-01 19:59:25 +04:00
* @ sk : sock to wait on
* @ timeo : for how long
2015-07-24 19:19:25 +03:00
* @ skb : last skb seen on sk_receive_queue
2005-04-17 02:20:36 +04:00
*
* Now socket state including sk - > sk_err is changed only under lock ,
* hence we may omit checks after joining wait queue .
* We check receive queue before schedule ( ) only as optimization ;
* it is very likely that release_sock ( ) added new data .
*/
2015-07-24 19:19:25 +03:00
int sk_wait_data ( struct sock * sk , long * timeo , const struct sk_buff * skb )
2005-04-17 02:20:36 +04:00
{
2016-11-11 21:20:50 +03:00
/* Wait entry whose wake callback is woken_wake_function. */
DEFINE_WAIT_FUNC ( wait , woken_wake_function ) ;
2005-04-17 02:20:36 +04:00
int rc ;
2016-11-11 21:20:50 +03:00
add_wait_queue ( sk_sleep ( sk ) , & wait ) ;
2015-11-30 07:03:10 +03:00
/* SOCKWQ_ASYNC_WAITDATA stays set only for the duration of the wait. */
sk_set_bit ( SOCKWQ_ASYNC_WAITDATA , sk ) ;
2016-11-11 21:20:50 +03:00
/* Wait condition: the tail of sk_receive_queue differs from @skb. */
rc = sk_wait_event ( sk , timeo , skb_peek_tail ( & sk - > sk_receive_queue ) ! = skb , & wait ) ;
2015-11-30 07:03:10 +03:00
sk_clear_bit ( SOCKWQ_ASYNC_WAITDATA , sk ) ;
2016-11-11 21:20:50 +03:00
remove_wait_queue ( sk_sleep ( sk ) , & wait ) ;
2005-04-17 02:20:36 +04:00
/* Result of sk_wait_event() is passed through unchanged. */
return rc ;
}
EXPORT_SYMBOL ( sk_wait_data ) ;
2007-12-31 11:11:19 +03:00
/**
2016-10-21 14:55:45 +03:00
* __sk_mem_raise_allocated - increase memory_allocated
2007-12-31 11:11:19 +03:00
* @ sk : socket
* @ size : memory size to allocate
2016-10-21 14:55:45 +03:00
* @ amt : pages to allocate
2007-12-31 11:11:19 +03:00
* @ kind : allocation type
*
2016-10-21 14:55:45 +03:00
* Similar to __sk_mem_schedule ( ) , but does not update sk_forward_alloc
2007-12-31 11:11:19 +03:00
*/
2016-10-21 14:55:45 +03:00
int __sk_mem_raise_allocated ( struct sock * sk , int size , int amt , int kind )
2007-12-31 11:11:19 +03:00
{
/*
 * Overview: @amt is added to the protocol accounting up front and, when
 * socket memcg accounting applies, charged to sk->sk_memcg. Returns 1 when
 * the allocation is accepted. Returns 0 after undoing both steps when it
 * is suppressed.
 */
struct proto * prot = sk - > sk_prot ;
2016-10-21 14:55:45 +03:00
long allocated = sk_memory_allocated_add ( sk , amt ) ;
2021-08-17 22:40:03 +03:00
bool memcg_charge = mem_cgroup_sockets_enabled & & sk - > sk_memcg ;
2018-07-01 18:31:30 +03:00
/* Stays true when no memcg charge is attempted at all. */
bool charged = true ;
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-15 02:21:14 +03:00
2021-08-17 22:40:03 +03:00
/* A failed memcg charge goes straight to the suppression path. */
if ( memcg_charge & &
! ( charged = mem_cgroup_charge_skmem ( sk - > sk_memcg , amt ,
gfp_memcg_charge ( ) ) ) )
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-15 02:21:14 +03:00
goto suppress_allocation ;
2007-12-31 11:11:19 +03:00
/* Under limit. */
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-15 02:21:14 +03:00
if ( allocated < = sk_prot_mem_limits ( sk , 0 ) ) {
2011-12-12 01:47:02 +04:00
sk_leave_memory_pressure ( sk ) ;
2007-12-31 11:11:19 +03:00
return 1 ;
}
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-15 02:21:14 +03:00
/* Under pressure. */
if ( allocated > sk_prot_mem_limits ( sk , 1 ) )
2011-12-12 01:47:02 +04:00
sk_enter_memory_pressure ( sk ) ;
2007-12-31 11:11:19 +03:00
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-15 02:21:14 +03:00
/* Over hard limit. */
if ( allocated > sk_prot_mem_limits ( sk , 2 ) )
2007-12-31 11:11:19 +03:00
goto suppress_allocation ;
/* guarantee minimum buffer size under pressure */
if ( kind = = SK_MEM_RECV ) {
2017-11-07 11:29:27 +03:00
if ( atomic_read ( & sk - > sk_rmem_alloc ) < sk_get_rmem0 ( sk , prot ) )
2007-12-31 11:11:19 +03:00
return 1 ;
2011-12-12 01:47:02 +04:00
2007-12-31 11:11:19 +03:00
} else { /* SK_MEM_SEND */
2017-11-07 11:29:27 +03:00
int wmem0 = sk_get_wmem0 ( sk , prot ) ;
2007-12-31 11:11:19 +03:00
/* Stream sockets are measured by sk_wmem_queued, all others by sk_wmem_alloc. */
if ( sk - > sk_type = = SOCK_STREAM ) {
2017-11-07 11:29:27 +03:00
if ( sk - > sk_wmem_queued < wmem0 )
2007-12-31 11:11:19 +03:00
return 1 ;
2017-11-07 11:29:27 +03:00
} else if ( refcount_read ( & sk - > sk_wmem_alloc ) < wmem0 ) {
2007-12-31 11:11:19 +03:00
return 1 ;
2017-11-07 11:29:27 +03:00
}
2007-12-31 11:11:19 +03:00
}
2011-12-12 01:47:02 +04:00
/*
 * Protocols with a memory pressure state: accept when not under pressure,
 * or when the hard limit exceeds alloc times the pages this socket holds
 * (sk_wmem_queued + sk_rmem_alloc + sk_forward_alloc). alloc is presumably
 * the number of allocated sockets - TODO confirm against the helper.
 */
if ( sk_has_memory_pressure ( sk ) ) {
2019-02-12 23:26:27 +03:00
u64 alloc ;
2008-11-26 08:16:35 +03:00
2011-12-12 01:47:02 +04:00
if ( ! sk_under_memory_pressure ( sk ) )
2008-11-26 08:16:35 +03:00
return 1 ;
2011-12-12 01:47:02 +04:00
alloc = sk_sockets_allocated_read_positive ( sk ) ;
if ( sk_prot_mem_limits ( sk , 2 ) > alloc *
2007-12-31 11:11:19 +03:00
sk_mem_pages ( sk - > sk_wmem_queued +
atomic_read ( & sk - > sk_rmem_alloc ) +
sk - > sk_forward_alloc ) )
return 1 ;
}
suppress_allocation :
if ( kind = = SK_MEM_SEND & & sk - > sk_type = = SOCK_STREAM ) {
sk_stream_moderate_sndbuf ( sk ) ;
/* Fail only if socket is _under_ its sndbuf.
* In this case we cannot block , so that we have to fail .
*/
2021-08-17 22:40:03 +03:00
if ( sk - > sk_wmem_queued + size > = sk - > sk_sndbuf ) {
/* Force charge with __GFP_NOFAIL */
if ( memcg_charge & & ! charged ) {
mem_cgroup_charge_skmem ( sk - > sk_memcg , amt ,
gfp_memcg_charge ( ) | __GFP_NOFAIL ) ;
}
2007-12-31 11:11:19 +03:00
return 1 ;
2021-08-17 22:40:03 +03:00
}
2007-12-31 11:11:19 +03:00
}
2018-07-01 18:31:30 +03:00
/* Traced for every send failure, and for receive failures only when the memcg charge did not fail. */
if ( kind = = SK_MEM_SEND | | ( kind = = SK_MEM_RECV & & charged ) )
trace_sock_exceed_buf_limit ( sk , prot , allocated , kind ) ;
2011-06-17 16:00:03 +04:00
net: introduce res_counter_charge_nofail() for socket allocations
There is a case in __sk_mem_schedule(), where an allocation
is beyond the maximum, but yet we are allowed to proceed.
It happens under the following condition:
sk->sk_wmem_queued + size >= sk->sk_sndbuf
The network code won't revert the allocation in this case,
meaning that at some point later it'll try to do it. Since
this is never communicated to the underlying res_counter
code, there is an inbalance in res_counter uncharge operation.
I see two ways of fixing this:
1) storing the information about those allocations somewhere
in memcg, and then deducting from that first, before
we start draining the res_counter,
2) providing a slightly different allocation function for
the res_counter, that matches the original behavior of
the network code more closely.
I decided to go for #2 here, believing it to be more elegant,
since #1 would require us to do basically that, but in a more
obscure way.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizf@cn.fujitsu.com>
CC: Laurent Chavey <chavey@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-01-20 08:57:16 +04:00
/* Undo the up-front add done when allocated was initialised. */
sk_memory_allocated_sub ( sk , amt ) ;
2011-12-12 01:47:02 +04:00
2021-08-17 22:40:03 +03:00
/* Only a charge that actually succeeded is uncharged. */
if ( memcg_charge & & charged )
2016-01-15 02:21:17 +03:00
mem_cgroup_uncharge_skmem ( sk - > sk_memcg , amt ) ;
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-15 02:21:14 +03:00
2007-12-31 11:11:19 +03:00
return 0 ;
}
2016-10-21 14:55:45 +03:00
EXPORT_SYMBOL ( __sk_mem_raise_allocated ) ;
/**
* __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
* @ sk : socket
* @ size : memory size to allocate
* @ kind : allocation type
*
* If kind is SK_MEM_SEND , it means wmem allocation . Otherwise it means
* rmem allocation . This function assumes that protocols which have
* memory_pressure use sk_wmem_queued as write buffer accounting .
*/
int __sk_mem_schedule ( struct sock * sk , int size , int kind )
{
int ret , amt = sk_mem_pages ( size ) ;
sk - > sk_forward_alloc + = amt < < SK_MEM_QUANTUM_SHIFT ;
ret = __sk_mem_raise_allocated ( sk , size , amt , kind ) ;
if ( ! ret )
sk - > sk_forward_alloc - = amt < < SK_MEM_QUANTUM_SHIFT ;
return ret ;
}
2007-12-31 11:11:19 +03:00
EXPORT_SYMBOL ( __sk_mem_schedule ) ;
/**
2016-10-21 14:55:45 +03:00
* __sk_mem_reduce_allocated - reclaim memory_allocated
2007-12-31 11:11:19 +03:00
* @ sk : socket
2016-10-21 14:55:45 +03:00
* @ amount : number of quanta
*
* Similar to __sk_mem_reclaim ( ) , but does not update sk_forward_alloc
2007-12-31 11:11:19 +03:00
*/
2016-10-21 14:55:45 +03:00
void __sk_mem_reduce_allocated ( struct sock * sk , int amount )
2007-12-31 11:11:19 +03:00
{
2015-05-15 22:39:25 +03:00
/* Give @amount back to the protocol accounting. */
sk_memory_allocated_sub ( sk , amount ) ;
2007-12-31 11:11:19 +03:00
2016-01-15 02:21:17 +03:00
/* Mirror the release in the socket memcg when memcg accounting applies. */
if ( mem_cgroup_sockets_enabled & & sk - > sk_memcg )
mem_cgroup_uncharge_skmem ( sk - > sk_memcg , amount ) ;
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-15 02:21:14 +03:00
2011-12-12 01:47:02 +04:00
/* Leave memory pressure once usage is back below limit index 0. */
if ( sk_under_memory_pressure ( sk ) & &
( sk_memory_allocated ( sk ) < sk_prot_mem_limits ( sk , 0 ) ) )
sk_leave_memory_pressure ( sk ) ;
2007-12-31 11:11:19 +03:00
}
2016-10-21 14:55:45 +03:00
EXPORT_SYMBOL ( __sk_mem_reduce_allocated ) ;
/**
* __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
* @ sk : socket
* @ amount : number of bytes ( rounded down to a SK_MEM_QUANTUM multiple )
*/
void __sk_mem_reclaim ( struct sock * sk , int amount )
{
amount > > = SK_MEM_QUANTUM_SHIFT ;
sk - > sk_forward_alloc - = amount < < SK_MEM_QUANTUM_SHIFT ;
__sk_mem_reduce_allocated ( sk , amount ) ;
}
2007-12-31 11:11:19 +03:00
EXPORT_SYMBOL ( __sk_mem_reclaim ) ;
2016-04-05 19:41:16 +03:00
int sk_set_peek_off ( struct sock * sk , int val )
{
sk - > sk_peek_off = val ;
return 0 ;
}
EXPORT_SYMBOL_GPL ( sk_set_peek_off ) ;
2007-12-31 11:11:19 +03:00
2005-04-17 02:20:36 +04:00
/*
* Set of default routines for initialising struct proto_ops when
* the protocol does not support a particular function . In certain
* cases where it makes no sense for a protocol to have a " do nothing "
* function , some default processing is provided .
*/
/* Default bind() for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_bind ) ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:36 +03:00
/* Default connect() for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect ( struct socket * sock , struct sockaddr * saddr ,
2005-04-17 02:20:36 +04:00
int len , int flags )
{
return - EOPNOTSUPP ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_connect ) ;
2005-04-17 02:20:36 +04:00
/* Default socketpair() for protocols without pair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_socketpair ) ;
2005-04-17 02:20:36 +04:00
2017-03-09 11:09:05 +03:00
/* Default accept() for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept ( struct socket * sock , struct socket * newsock , int flags ,
bool kern )
2005-04-17 02:20:36 +04:00
{
return - EOPNOTSUPP ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_accept ) ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:36 +03:00
/* Default getname() for protocols without name lookup support: always -EOPNOTSUPP. */
int sock_no_getname ( struct socket * sock , struct sockaddr * saddr ,
2018-02-12 22:00:20 +03:00
int peer )
2005-04-17 02:20:36 +04:00
{
return - EOPNOTSUPP ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_getname ) ;
2005-04-17 02:20:36 +04:00
/* Default ioctl() for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_ioctl ) ;
2005-04-17 02:20:36 +04:00
/* Default listen() for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_listen ) ;
2005-04-17 02:20:36 +04:00
/* Default shutdown() for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_shutdown ) ;
2005-04-17 02:20:36 +04:00
2015-03-02 10:37:48 +03:00
/* Default sendmsg() for protocols without send support: always -EOPNOTSUPP. */
int sock_no_sendmsg ( struct socket * sock , struct msghdr * m , size_t len )
2005-04-17 02:20:36 +04:00
{
return - EOPNOTSUPP ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_sendmsg ) ;
2005-04-17 02:20:36 +04:00
2017-07-29 02:22:41 +03:00
/*
 * Default sendmsg_locked handler, taking a struct sock instead of a
 * struct socket: always -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL ( sock_no_sendmsg_locked ) ;
2015-03-02 10:37:48 +03:00
/* Default recvmsg() for protocols without receive support: always -EOPNOTSUPP. */
int sock_no_recvmsg ( struct socket * sock , struct msghdr * m , size_t len ,
int flags )
2005-04-17 02:20:36 +04:00
{
return - EOPNOTSUPP ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_recvmsg ) ;
2005-04-17 02:20:36 +04:00
/*
 * Default mmap() for protocols without mmap support. Unlike the other
 * stubs this one reports -ENODEV.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_mmap ) ;
2005-04-17 02:20:36 +04:00
2020-06-10 02:11:29 +03:00
/*
* When a file is received ( via SCM_RIGHTS , etc ) , we must bump the
* various sock - based usage counts .
*/
void __receive_sock ( struct file * file )
{
struct socket * sock ;
2020-12-04 14:36:04 +03:00
/* A NULL result (presumably: @file is not a socket - TODO confirm) is skipped. */
sock = sock_from_file ( file ) ;
2020-06-10 02:11:29 +03:00
if ( sock ) {
/* Both updates operate on the cgroup data of the underlying struct sock. */
sock_update_netprioidx ( & sock - > sk - > sk_cgrp_data ) ;
sock_update_classid ( & sock - > sk - > sk_cgrp_data ) ;
}
}
2005-04-17 02:20:36 +04:00
/*
 * Fallback sendpage: map @page, describe @size bytes starting at @offset
 * with a single kvec and send it through kernel_sendmsg() using @flags as
 * msg_flags. Returns whatever kernel_sendmsg() returns.
 */
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset,
			 size_t size, int flags)
{
	struct msghdr hdr = { .msg_flags = flags };
	char *mapped = kmap(page);
	struct kvec vec = {
		.iov_base = mapped + offset,
		.iov_len = size,
	};
	ssize_t sent;

	sent = kernel_sendmsg(sock, &hdr, &vec, 1, size);
	/* The mapping is released once the send call has returned. */
	kunmap(page);

	return sent;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_sendpage ) ;
2005-04-17 02:20:36 +04:00
2017-07-29 02:22:41 +03:00
/*
 * Fallback sendpage_locked: same as sock_no_sendpage() but operates on a
 * struct sock and sends through kernel_sendmsg_locked(). Returns whatever
 * kernel_sendmsg_locked() returns.
 */
ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
				int offset, size_t size, int flags)
{
	struct msghdr hdr = { .msg_flags = flags };
	char *mapped = kmap(page);
	struct kvec vec = {
		.iov_base = mapped + offset,
		.iov_len = size,
	};
	ssize_t sent;

	sent = kernel_sendmsg_locked(sk, &hdr, &vec, 1, size);
	/* The mapping is released once the send call has returned. */
	kunmap(page);

	return sent;
}
EXPORT_SYMBOL ( sock_no_sendpage_locked ) ;
2005-04-17 02:20:36 +04:00
/*
* Default Socket Callbacks
*/
/*
 * Default wakeup callback: wake all interruptible waiters on the socket
 * wait queue, if skwq_has_sleeper() reports any.
 */
static void sock_def_wakeup ( struct sock * sk )
{
2010-04-29 15:01:49 +04:00
struct socket_wq * wq ;
/* sk->sk_wq is fetched with rcu_dereference(), hence the RCU read section. */
rcu_read_lock ( ) ;
wq = rcu_dereference ( sk - > sk_wq ) ;
2015-11-26 08:55:39 +03:00
if ( skwq_has_sleeper ( wq ) )
2010-04-29 15:01:49 +04:00
wake_up_interruptible_all ( & wq - > wait ) ;
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * sock_def_error_report - installed as sk->sk_error_report by
 * sock_init_data().
 *
 * Inside an RCU read-side section: if skwq_has_sleeper() reports a
 * sleeper on sk->sk_wq, wakes it with EPOLLERR as the poll key. The
 * sk_wake_async() call (SOCK_WAKE_IO, POLL_ERR) is not guarded by that
 * check and is always made.
 */
static void sock_def_error_report ( struct sock * sk )
{
2010-04-29 15:01:49 +04:00
struct socket_wq * wq ;
rcu_read_lock ( ) ;
wq = rcu_dereference ( sk - > sk_wq ) ;
2015-11-26 08:55:39 +03:00
if ( skwq_has_sleeper ( wq ) )
2018-02-12 01:34:03 +03:00
wake_up_interruptible_poll ( & wq - > wait , EPOLLERR ) ;
2007-11-26 15:10:50 +03:00
sk_wake_async ( sk , SOCK_WAKE_IO , POLL_ERR ) ;
2010-04-29 15:01:49 +04:00
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
2020-01-20 12:29:17 +03:00
/*
 * sock_def_readable - installed as sk->sk_data_ready by sock_init_data().
 *
 * Inside an RCU read-side section: if skwq_has_sleeper() reports a
 * sleeper on sk->sk_wq, wakes it via wake_up_interruptible_sync_poll()
 * with EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND. The
 * sk_wake_async() call (SOCK_WAKE_WAITD, POLL_IN) is not guarded by that
 * check and is always made.
 */
void sock_def_readable ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
2010-04-29 15:01:49 +04:00
struct socket_wq * wq ;
rcu_read_lock ( ) ;
wq = rcu_dereference ( sk - > sk_wq ) ;
2015-11-26 08:55:39 +03:00
if ( skwq_has_sleeper ( wq ) )
2018-02-12 01:34:03 +03:00
wake_up_interruptible_sync_poll ( & wq - > wait , EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND ) ;
2007-11-26 15:10:50 +03:00
sk_wake_async ( sk , SOCK_WAKE_WAITD , POLL_IN ) ;
2010-04-29 15:01:49 +04:00
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * sock_def_write_space - installed as sk->sk_write_space by
 * sock_init_data().
 *
 * Does nothing beyond the RCU lock/unlock pair unless twice the current
 * sk_wmem_alloc count is at most sk_sndbuf. When that holds, a sleeper
 * on sk->sk_wq (if any) is woken with EPOLLOUT | EPOLLWRNORM |
 * EPOLLWRBAND, and sk_wake_async() (SOCK_WAKE_SPACE, POLL_OUT) is called
 * if sock_writeable() is true.
 */
static void sock_def_write_space ( struct sock * sk )
{
2010-04-29 15:01:49 +04:00
struct socket_wq * wq ;
rcu_read_lock ( ) ;
2005-04-17 02:20:36 +04:00
/* Do not wake up a writer until he can make "significant"
* progress . - - DaveM
*/
2019-10-11 06:17:45 +03:00
if ( ( refcount_read ( & sk - > sk_wmem_alloc ) < < 1 ) < = READ_ONCE ( sk - > sk_sndbuf ) ) {
2010-04-29 15:01:49 +04:00
wq = rcu_dereference ( sk - > sk_wq ) ;
2015-11-26 08:55:39 +03:00
if ( skwq_has_sleeper ( wq ) )
2018-02-12 01:34:03 +03:00
wake_up_interruptible_sync_poll ( & wq - > wait , EPOLLOUT |
EPOLLWRNORM | EPOLLWRBAND ) ;
2005-04-17 02:20:36 +04:00
/* Should agree with poll, otherwise some programs break */
if ( sock_writeable ( sk ) )
2007-11-26 15:10:50 +03:00
sk_wake_async ( sk , SOCK_WAKE_SPACE , POLL_OUT ) ;
2005-04-17 02:20:36 +04:00
}
2010-04-29 15:01:49 +04:00
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * sock_def_destruct - installed as sk->sk_destruct by sock_init_data().
 * @sk: unused
 *
 * Intentionally empty: there is nothing to tear down by default.
 */
static void sock_def_destruct(struct sock *sk)
{
}
/*
 * sk_send_sigurg - deliver SIGURG handling for @sk.
 *
 * Only acts when both sk->sk_socket and sk->sk_socket->file are set.
 * In that case send_sigurg() is called on the file's f_owner, and if it
 * returns non-zero sk_wake_async() is invoked with SOCK_WAKE_URG /
 * POLL_PRI.
 */
void sk_send_sigurg ( struct sock * sk )
{
if ( sk - > sk_socket & & sk - > sk_socket - > file )
if ( send_sigurg ( & sk - > sk_socket - > file - > f_owner ) )
2007-11-26 15:10:50 +03:00
sk_wake_async ( sk , SOCK_WAKE_URG , POLL_PRI ) ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sk_send_sigurg ) ;
2005-04-17 02:20:36 +04:00
/*
 * sk_reset_timer - (re)arm a timer associated with a sock
 * @sk:      sock that gains a reference when mod_timer() returns zero
 * @timer:   timer passed to mod_timer()
 * @expires: expiry value passed to mod_timer()
 *
 * A reference on @sk is taken with sock_hold() only when mod_timer()
 * returns zero; a non-zero return leaves the reference count untouched.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL ( sk_reset_timer ) ;
/*
 * sk_stop_timer - cancel a timer associated with a sock.
 *
 * Calls del_timer() on @timer; when it returns non-zero a reference on
 * @sk is dropped with __sock_put().
 */
void sk_stop_timer ( struct sock * sk , struct timer_list * timer )
{
2013-02-04 00:32:57 +04:00
if ( del_timer ( timer ) )
2005-04-17 02:20:36 +04:00
__sock_put ( sk ) ;
}
EXPORT_SYMBOL ( sk_stop_timer ) ;
2020-09-24 03:30:01 +03:00
/*
 * sk_stop_timer_sync - cancel a timer associated with a sock using
 *                      del_timer_sync()
 * @sk:    sock whose reference is dropped when del_timer_sync()
 *         returns non-zero
 * @timer: timer passed to del_timer_sync()
 *
 * When del_timer_sync() returns zero nothing else is done; otherwise
 * one reference on @sk is released with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL ( sk_stop_timer_sync ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_init_data - set the generic fields of @sk to their defaults.
 * @sock: socket to link with @sk; may be NULL
 * @sk:   sock being initialised
 *
 * Runs sk_init_common(), sets up sk_timer, the default buffer sizes,
 * TCP_CLOSE state, SOCK_ZAPPED, the sk_callback_lock (with a lockdep
 * class chosen by sk_kern_sock and sk_family), the sock_def_* callbacks,
 * timeouts and pacing fields.
 *
 * With a non-NULL @sock the two objects are linked (sk_type, sk_wq,
 * sock->sk) and sk_uid is copied from the socket inode. With a NULL
 * @sock, sk_wq is NULL and sk_uid is kuid 0 mapped through the user
 * namespace of sock_net(sk).
 *
 * sk_refcnt is set to 1 only after an smp_wmb(), so the preceding stores
 * are committed first; sk_drops is cleared last.
 */
void sock_init_data ( struct socket * sock , struct sock * sk )
{
net/socket: use per af lockdep classes for sk queues
Currently the sock queue's spin locks get their lockdep
classes by the default init_spin_lock() initializer:
all socket families get - usually, see below - a single
class for rx, another specific class for tx, etc.
This can lead to false positive lockdep splat, as
reported by Andrey.
Moreover there are two separate initialization points
for the sock queues, one in sk_clone_lock() and one
in sock_init_data(), so that e.g. the rx queue lock
can get one of two possible, different classes, depending
on the socket being cloned or not.
This change tries to address the above, setting explicitly
a per address family lockdep class for each queue's
spinlock. Also, move the duplicated initialization code to a
single location.
v1 -> v2:
- renamed the init helper
rfc -> v1:
- no changes, tested with several different workload
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 15:54:08 +03:00
sk_init_common ( sk ) ;
2005-04-17 02:20:36 +04:00
sk - > sk_send_head = NULL ;
2017-10-17 03:29:36 +03:00
timer_setup ( & sk - > sk_timer , NULL , 0 ) ;
2007-02-09 17:24:36 +03:00
2005-04-17 02:20:36 +04:00
sk - > sk_allocation = GFP_KERNEL ;
sk - > sk_rcvbuf = sysctl_rmem_default ;
sk - > sk_sndbuf = sysctl_wmem_default ;
sk - > sk_state = TCP_CLOSE ;
2008-06-18 09:41:38 +04:00
sk_set_socket ( sk , sock ) ;
2005-04-17 02:20:36 +04:00
sock_set_flag ( sk , SOCK_ZAPPED ) ;
2007-04-11 07:10:33 +04:00
/* @sock is optional: link both objects only when one was supplied. */
if ( sock ) {
2005-04-17 02:20:36 +04:00
sk - > sk_type = sock - > type ;
2019-07-05 22:14:16 +03:00
RCU_INIT_POINTER ( sk - > sk_wq , & sock - > wq ) ;
2005-04-17 02:20:36 +04:00
sock - > sk = sk ;
net: core: Add a UID field to struct sock.
Protocol sockets (struct sock) don't have UIDs, but most of the
time, they map 1:1 to userspace sockets (struct socket) which do.
Various operations such as the iptables xt_owner match need
access to the "UID of a socket", and do so by following the
backpointer to the struct socket. This involves taking
sk_callback_lock and doesn't work when there is no socket
because userspace has already called close().
Simplify this by adding a sk_uid field to struct sock whose value
matches the UID of the corresponding struct socket. The semantics
are as follows:
1. Whenever sk_socket is non-null: sk_uid is the same as the UID
in sk_socket, i.e., matches the return value of sock_i_uid.
Specifically, the UID is set when userspace calls socket(),
fchown(), or accept().
2. When sk_socket is NULL, sk_uid is defined as follows:
- For a socket that no longer has a sk_socket because
userspace has called close(): the previous UID.
- For a cloned socket (e.g., an incoming connection that is
established but on which userspace has not yet called
accept): the UID of the socket it was cloned from.
- For a socket that has never had an sk_socket: UID 0 inside
the user namespace corresponding to the network namespace
the socket belongs to.
Kernel sockets created by sock_create_kern are a special case
of #1 and sk_uid is the user that created them. For kernel
sockets created at network namespace creation time, such as the
per-processor ICMP and TCP sockets, this is the user that created
the network namespace.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-03 20:23:41 +03:00
sk - > sk_uid = SOCK_INODE ( sock ) - > i_uid ;
} else {
2019-02-22 12:08:22 +03:00
RCU_INIT_POINTER ( sk - > sk_wq , NULL ) ;
net: core: Add a UID field to struct sock.
Protocol sockets (struct sock) don't have UIDs, but most of the
time, they map 1:1 to userspace sockets (struct socket) which do.
Various operations such as the iptables xt_owner match need
access to the "UID of a socket", and do so by following the
backpointer to the struct socket. This involves taking
sk_callback_lock and doesn't work when there is no socket
because userspace has already called close().
Simplify this by adding a sk_uid field to struct sock whose value
matches the UID of the corresponding struct socket. The semantics
are as follows:
1. Whenever sk_socket is non-null: sk_uid is the same as the UID
in sk_socket, i.e., matches the return value of sock_i_uid.
Specifically, the UID is set when userspace calls socket(),
fchown(), or accept().
2. When sk_socket is NULL, sk_uid is defined as follows:
- For a socket that no longer has a sk_socket because
userspace has called close(): the previous UID.
- For a cloned socket (e.g., an incoming connection that is
established but on which userspace has not yet called
accept): the UID of the socket it was cloned from.
- For a socket that has never had an sk_socket: UID 0 inside
the user namespace corresponding to the network namespace
the socket belongs to.
Kernel sockets created by sock_create_kern are a special case
of #1 and sk_uid is the user that created them. For kernel
sockets created at network namespace creation time, such as the
per-processor ICMP and TCP sockets, this is the user that created
the network namespace.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-03 20:23:41 +03:00
sk - > sk_uid = make_kuid ( sock_net ( sk ) - > user_ns , 0 ) ;
}
2005-04-17 02:20:36 +04:00
rwlock_init ( & sk - > sk_callback_lock ) ;
2017-03-09 11:09:05 +03:00
/* Lockdep class/name for sk_callback_lock: kernel sockets use the af_kern_* tables, all others the af_* tables, both indexed by sk_family. */
if ( sk - > sk_kern_sock )
lockdep_set_class_and_name (
& sk - > sk_callback_lock ,
af_kern_callback_keys + sk - > sk_family ,
af_family_kern_clock_key_strings [ sk - > sk_family ] ) ;
else
lockdep_set_class_and_name (
& sk - > sk_callback_lock ,
2007-07-19 12:49:00 +04:00
af_callback_keys + sk - > sk_family ,
af_family_clock_key_strings [ sk - > sk_family ] ) ;
2005-04-17 02:20:36 +04:00
/* Install the default callbacks defined earlier in this file. */
sk - > sk_state_change = sock_def_wakeup ;
sk - > sk_data_ready = sock_def_readable ;
sk - > sk_write_space = sock_def_write_space ;
sk - > sk_error_report = sock_def_error_report ;
sk - > sk_destruct = sock_def_destruct ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
sk - > sk_frag . page = NULL ;
sk - > sk_frag . offset = 0 ;
2012-02-21 11:31:34 +04:00
sk - > sk_peek_off = - 1 ;
2005-04-17 02:20:36 +04:00
2010-06-13 07:30:14 +04:00
sk - > sk_peer_pid = NULL ;
sk - > sk_peer_cred = NULL ;
2021-09-30 01:57:50 +03:00
spin_lock_init ( & sk - > sk_peer_lock ) ;
2005-04-17 02:20:36 +04:00
sk - > sk_write_pending = 0 ;
sk - > sk_rcvlowat = 1 ;
sk - > sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT ;
sk - > sk_sndtimeo = MAX_SCHEDULE_TIMEOUT ;
2017-03-30 15:03:06 +03:00
sk - > sk_stamp = SK_DEFAULT_STAMP ;
2018-12-28 05:55:09 +03:00
# if BITS_PER_LONG==32
seqlock_init ( & sk - > sk_stamp_seq ) ;
# endif
2017-08-03 23:29:39 +03:00
atomic_set ( & sk - > sk_zckey , 0 ) ;
2005-04-17 02:20:36 +04:00
2013-08-01 07:10:25 +04:00
# ifdef CONFIG_NET_RX_BUSY_POLL
2013-06-10 12:39:50 +04:00
sk - > sk_napi_id = 0 ;
2013-07-10 18:13:36 +04:00
sk - > sk_ll_usec = sysctl_net_busy_read ;
2013-06-10 12:39:50 +04:00
# endif
net: extend sk_pacing_rate to unsigned long
sk_pacing_rate has beed introduced as a u32 field in 2013,
effectively limiting per flow pacing to 34Gbit.
We believe it is time to allow TCP to pace high speed flows
on 64bit hosts, as we now can reach 100Gbit on one TCP flow.
This patch adds no cost for 32bit kernels.
The tcpi_pacing_rate and tcpi_max_pacing_rate were already
exported as 64bit, so iproute2/ss command require no changes.
Unfortunately the SO_MAX_PACING_RATE socket option will stay
32bit and we will need to add a new option to let applications
control high pacing rates.
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 1787144 10.246.9.76:49992 10.246.9.77:36741
timer:(on,003ms,0) ino:91863 sk:2 <->
skmem:(r0,rb540000,t66440,tb2363904,f605944,w1822984,o0,bl0,d0)
ts sack bbr wscale:8,8 rto:201 rtt:0.057/0.006 mss:1448
rcvmss:536 advmss:1448
cwnd:138 ssthresh:178 bytes_acked:256699822585 segs_out:177279177
segs_in:3916318 data_segs_out:177279175
bbr:(bw:31276.8Mbps,mrtt:0,pacing_gain:1.25,cwnd_gain:2)
send 28045.5Mbps lastrcv:73333
pacing_rate 38705.0Mbps delivery_rate 22997.6Mbps
busy:73333ms unacked:135 retrans:0/157 rcv_space:14480
notsent:2085120 minrtt:0.013
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-10-15 19:37:53 +03:00
sk - > sk_max_pacing_rate = ~ 0UL ;
sk - > sk_pacing_rate = ~ 0UL ;
2019-12-17 05:51:03 +03:00
WRITE_ONCE ( sk - > sk_pacing_shift , 10 ) ;
2015-10-09 05:33:21 +03:00
sk - > sk_incoming_cpu = - 1 ;
2018-06-30 07:26:57 +03:00
sk_rx_queue_clear ( sk ) ;
2009-07-16 03:13:10 +04:00
/*
* Before updating sk_refcnt , we must commit prior changes to memory
2020-04-21 20:04:05 +03:00
* ( Documentation / RCU / rculist_nulls . rst for details )
2009-07-16 03:13:10 +04:00
*/
smp_wmb ( ) ;
2017-06-30 13:08:01 +03:00
refcount_set ( & sk - > sk_refcnt , 1 ) ;
2007-11-14 07:30:01 +03:00
atomic_set ( & sk - > sk_drops , 0 ) ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_init_data ) ;
2005-04-17 02:20:36 +04:00
2008-02-14 02:03:16 +03:00
/*
 * lock_sock_nested - take ownership of the socket lock.
 * @sk:       socket to lock
 * @subclass: subclass value passed to the mutex_acquire() annotation
 *
 * The lockdep annotation (mutex_acquire) is issued before the actual
 * acquisition, followed by might_sleep(). Then, with sk_lock.slock held
 * via spin_lock_bh(), __lock_sock() is called if the lock is already
 * owned, sk_lock.owned is set to 1, and the spinlock is released with
 * spin_unlock_bh().
 */
void lock_sock_nested ( struct sock * sk , int subclass )
2005-04-17 02:20:36 +04:00
{
net: core: Correct the sock::sk_lock.owned lockdep annotations
lock_sock_fast() and lock_sock_nested() contain lockdep annotations for the
sock::sk_lock.owned 'mutex'. sock::sk_lock.owned is not a regular mutex. It
is just lockdep wise equivalent. In fact it's an open coded trivial mutex
implementation with some interesting features.
sock::sk_lock.slock is a regular spinlock protecting the 'mutex'
representation sock::sk_lock.owned which is a plain boolean. If 'owned' is
true, then some other task holds the 'mutex', otherwise it is uncontended.
As this locking construct is obviously endangered by lock ordering issues as
any other locking primitive it got lockdep annotated via a dedicated
dependency map sock::sk_lock.dep_map which has to be updated at the lock
and unlock sites.
lock_sock_nested() is a straight forward 'mutex' lock operation:
might_sleep();
spin_lock_bh(sock::sk_lock.slock)
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
The lockdep annotation for sock::sk_lock.owned is for unknown reasons
_after_ the lock has been acquired, i.e. after the code block above and
after releasing sock::sk_lock.slock, but inside the bottom halves disabled
region:
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
The placement after the unlock is obvious because otherwise the
mutex_acquire() would nest into the spin lock held region.
But that's from the lockdep perspective still the wrong place:
1) The mutex_acquire() is issued _after_ the successful acquisition which
is pointless because in a dead lock scenario this point is never
reached which means that if the deadlock is the first instance of
exposing the wrong lock order lockdep does not have a chance to detect
it.
2) It only works because lockdep is rather lax on the context from which
the mutex_acquire() is issued. Acquiring a mutex inside a bottom halves
and therefore non-preemptible region is obviously invalid, except for a
trylock which is clearly not the case here.
This 'works' stops working on RT enabled kernels where the bottom halves
serialization is done via a local lock, which exposes this misplacement
because the 'mutex' and the local lock nest the wrong way around and
lockdep complains rightfully about a lock inversion.
The placement is wrong since the initial commit a5b5bb9a053a ("[PATCH]
lockdep: annotate sk_locks") which introduced this.
Fix it by moving the mutex_acquire() in front of the actual lock
acquisition, which is what the regular mutex_lock() operation does as well.
lock_sock_fast() is not that straight forward. It looks at the first glance
like a convoluted trylock operation:
spin_lock_bh(sock::sk_lock.slock)
if (!sock::sk_lock.owned)
return false;
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
return true;
But that's not the case: lock_sock_fast() is an interesting optimization
for short critical sections which can run with bottom halves disabled and
sock::sk_lock.slock held. This allows to shortcut the 'mutex' operation in
the non contended case by preventing other lockers to acquire
sock::sk_lock.owned because they are blocked on sock::sk_lock.slock, which
in turn avoids the overhead of doing the heavy processing in release_sock()
including waking up wait queue waiters.
In the contended case, i.e. when sock::sk_lock.owned == true the behavior
is the same as lock_sock_nested().
Semantically this shortcut means, that the task acquired the 'mutex' even
if it does not touch the sock::sk_lock.owned field in the non-contended
case. Not telling lockdep about this shortcut acquisition is hiding
potential lock ordering violations in the fast path.
As a consequence the same reasoning as for the above lock_sock_nested()
case vs. the placement of the lockdep annotation applies.
The current placement of the lockdep annotation was just copied from
the original lock_sock(), now renamed to lock_sock_nested(),
implementation.
Fix this by moving the mutex_acquire() in front of the actual lock
acquisition and adding the corresponding mutex_release() into
unlock_sock_fast(). Also document the fast path return case with a comment.
Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-18 15:42:35 +03:00
/* The sk_lock has mutex_lock() semantics here. */
mutex_acquire ( & sk - > sk_lock . dep_map , subclass , 0 , _RET_IP_ ) ;
2005-04-17 02:20:36 +04:00
might_sleep ( ) ;
2006-07-03 11:25:35 +04:00
spin_lock_bh ( & sk - > sk_lock . slock ) ;
2007-09-12 12:44:19 +04:00
/* Lock already owned: hand over to __lock_sock() before claiming it. */
if ( sk - > sk_lock . owned )
2005-04-17 02:20:36 +04:00
__lock_sock ( sk ) ;
2007-09-12 12:44:19 +04:00
sk - > sk_lock . owned = 1 ;
net: core: Correct the sock::sk_lock.owned lockdep annotations
lock_sock_fast() and lock_sock_nested() contain lockdep annotations for the
sock::sk_lock.owned 'mutex'. sock::sk_lock.owned is not a regular mutex. It
is just lockdep wise equivalent. In fact it's an open coded trivial mutex
implementation with some interesting features.
sock::sk_lock.slock is a regular spinlock protecting the 'mutex'
representation sock::sk_lock.owned which is a plain boolean. If 'owned' is
true, then some other task holds the 'mutex', otherwise it is uncontended.
As this locking construct is obviously endangered by lock ordering issues as
any other locking primitive it got lockdep annotated via a dedicated
dependency map sock::sk_lock.dep_map which has to be updated at the lock
and unlock sites.
lock_sock_nested() is a straight forward 'mutex' lock operation:
might_sleep();
spin_lock_bh(sock::sk_lock.slock)
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
The lockdep annotation for sock::sk_lock.owned is for unknown reasons
_after_ the lock has been acquired, i.e. after the code block above and
after releasing sock::sk_lock.slock, but inside the bottom halves disabled
region:
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
The placement after the unlock is obvious because otherwise the
mutex_acquire() would nest into the spin lock held region.
But that's from the lockdep perspective still the wrong place:
1) The mutex_acquire() is issued _after_ the successful acquisition which
is pointless because in a dead lock scenario this point is never
reached which means that if the deadlock is the first instance of
exposing the wrong lock order lockdep does not have a chance to detect
it.
2) It only works because lockdep is rather lax on the context from which
the mutex_acquire() is issued. Acquiring a mutex inside a bottom halves
and therefore non-preemptible region is obviously invalid, except for a
trylock which is clearly not the case here.
This 'works' stops working on RT enabled kernels where the bottom halves
serialization is done via a local lock, which exposes this misplacement
because the 'mutex' and the local lock nest the wrong way around and
lockdep complains rightfully about a lock inversion.
The placement is wrong since the initial commit a5b5bb9a053a ("[PATCH]
lockdep: annotate sk_locks") which introduced this.
Fix it by moving the mutex_acquire() in front of the actual lock
acquisition, which is what the regular mutex_lock() operation does as well.
lock_sock_fast() is not that straight forward. It looks at the first glance
like a convoluted trylock operation:
spin_lock_bh(sock::sk_lock.slock)
if (!sock::sk_lock.owned)
return false;
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
return true;
But that's not the case: lock_sock_fast() is an interesting optimization
for short critical sections which can run with bottom halves disabled and
sock::sk_lock.slock held. This allows to shortcut the 'mutex' operation in
the non contended case by preventing other lockers to acquire
sock::sk_lock.owned because they are blocked on sock::sk_lock.slock, which
in turn avoids the overhead of doing the heavy processing in release_sock()
including waking up wait queue waiters.
In the contended case, i.e. when sock::sk_lock.owned == true the behavior
is the same as lock_sock_nested().
Semantically this shortcut means, that the task acquired the 'mutex' even
if it does not touch the sock::sk_lock.owned field in the non-contended
case. Not telling lockdep about this shortcut acquisition is hiding
potential lock ordering violations in the fast path.
As a consequence the same reasoning as for the above lock_sock_nested()
case vs. the placement of the lockdep annotation applies.
The current placement of the lockdep annotation was just copied from
the original lock_sock(), now renamed to lock_sock_nested(),
implementation.
Fix this by moving the mutex_acquire() in front of the actual lock
acquisition and adding the corresponding mutex_release() into
unlock_sock_fast(). Also document the fast path return case with a comment.
Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-18 15:42:35 +03:00
spin_unlock_bh ( & sk - > sk_lock . slock ) ;
2005-04-17 02:20:36 +04:00
}
2006-11-09 09:44:35 +03:00
EXPORT_SYMBOL ( lock_sock_nested ) ;
2005-04-17 02:20:36 +04:00
2008-02-14 02:03:16 +03:00
void release_sock ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
2006-07-03 11:25:35 +04:00
spin_lock_bh ( & sk - > sk_lock . slock ) ;
2005-04-17 02:20:36 +04:00
if ( sk - > sk_backlog . tail )
__release_sock ( sk ) ;
tcp: TCP Small Queues
This introduce TSQ (TCP Small Queues)
TSQ goal is to reduce number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.
sk->sk_wmem_alloc not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a
given time.
TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.
As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2 : It can help to reduce
latencies of high prio packets, having smaller TSO packets.
This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.
Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.
Without reduction of nominal bandwidth, we have reduction of buffering
per bulk sender :
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)
I no longer have 4 MBytes backlogged in qdisc by a single netperf
session, and both side socket autotuning no longer use 4 Mbytes.
As skb destructor cannot restart xmit itself ( as qdisc lock might be
taken at this point ), we delegate the work to a tasklet. We use one
tasklet per cpu for performance reasons.
If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(),
to eventually send new segments.
[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
but some drivers call it in their start_xmit() handler.
These drivers should at least use BQL, or else a single TCP
session can still fill the whole NIC TX ring, since TSQ will
have no effect.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-11 09:50:31 +04:00
tcp: tcp_release_cb() should release socket ownership
Lars Persson reported following deadlock :
-000 |M:0x0:0x802B6AF8(asm) <-- arch_spin_lock
-001 |tcp_v4_rcv(skb = 0x8BD527A0) <-- sk = 0x8BE6B2A0
-002 |ip_local_deliver_finish(skb = 0x8BD527A0)
-003 |__netif_receive_skb_core(skb = 0x8BD527A0, ?)
-004 |netif_receive_skb(skb = 0x8BD527A0)
-005 |elk_poll(napi = 0x8C770500, budget = 64)
-006 |net_rx_action(?)
-007 |__do_softirq()
-008 |do_softirq()
-009 |local_bh_enable()
-010 |tcp_rcv_established(sk = 0x8BE6B2A0, skb = 0x87D3A9E0, th = 0x814EBE14, ?)
-011 |tcp_v4_do_rcv(sk = 0x8BE6B2A0, skb = 0x87D3A9E0)
-012 |tcp_delack_timer_handler(sk = 0x8BE6B2A0)
-013 |tcp_release_cb(sk = 0x8BE6B2A0)
-014 |release_sock(sk = 0x8BE6B2A0)
-015 |tcp_sendmsg(?, sk = 0x8BE6B2A0, ?, ?)
-016 |sock_sendmsg(sock = 0x8518C4C0, msg = 0x87D8DAA8, size = 4096)
-017 |kernel_sendmsg(?, ?, ?, ?, size = 4096)
-018 |smb_send_kvec()
-019 |smb_send_rqst(server = 0x87C4D400, rqst = 0x87D8DBA0)
-020 |cifs_call_async()
-021 |cifs_async_writev(wdata = 0x87FD6580)
-022 |cifs_writepages(mapping = 0x852096E4, wbc = 0x87D8DC88)
-023 |__writeback_single_inode(inode = 0x852095D0, wbc = 0x87D8DC88)
-024 |writeback_sb_inodes(sb = 0x87D6D800, wb = 0x87E4A9C0, work = 0x87D8DD88)
-025 |__writeback_inodes_wb(wb = 0x87E4A9C0, work = 0x87D8DD88)
-026 |wb_writeback(wb = 0x87E4A9C0, work = 0x87D8DD88)
-027 |wb_do_writeback(wb = 0x87E4A9C0, force_wait = 0)
-028 |bdi_writeback_workfn(work = 0x87E4A9CC)
-029 |process_one_work(worker = 0x8B045880, work = 0x87E4A9CC)
-030 |worker_thread(__worker = 0x8B045880)
-031 |kthread(_create = 0x87CADD90)
-032 |ret_from_kernel_thread(asm)
Bug occurs because __tcp_checksum_complete_user() enables BH, assuming
it is running from softirq context.
Lars trace involved a NIC without RX checksum support but other points
are problematic as well, like the prequeue stuff.
Problem is triggered by a timer, that found socket being owned by user.
tcp_release_cb() should call tcp_write_timer_handler() or
tcp_delack_timer_handler() in the appropriate context :
BH disabled and socket lock held, but 'owned' field cleared,
as if they were running from timer handlers.
Fixes: 6f458dfb4092 ("tcp: improve latencies of timer triggered events")
Reported-by: Lars Persson <lars.persson@axis.com>
Tested-by: Lars Persson <lars.persson@axis.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-10 20:50:11 +04:00
/* Warning : release_cb() might need to release sk ownership,
* ie call sock_release_ownership ( sk ) before us .
*/
tcp: TCP Small Queues
This introduce TSQ (TCP Small Queues)
TSQ goal is to reduce number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.
sk->sk_wmem_alloc not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a
given time.
TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.
As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2 : It can help to reduce
latencies of high prio packets, having smaller TSO packets.
This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.
Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.
Without reduction of nominal bandwidth, we have reduction of buffering
per bulk sender :
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)
I no longer have 4 MBytes backlogged in qdisc by a single netperf
session, and socket autotuning on both sides no longer uses 4 Mbytes.
As skb destructor cannot restart xmit itself ( as qdisc lock might be
taken at this point ), we delegate the work to a tasklet. We use one
tasklet per cpu for performance reasons.
If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(),
to eventually send new segments.
[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
but some drivers call it in their start_xmit() handler.
These drivers should at least use BQL, or else a single TCP
session can still fill the whole NIC TX ring, since TSQ will
have no effect.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-11 09:50:31 +04:00
if ( sk - > sk_prot - > release_cb )
sk - > sk_prot - > release_cb ( sk ) ;
tcp: tcp_release_cb() should release socket ownership
Lars Persson reported following deadlock :
-000 |M:0x0:0x802B6AF8(asm) <-- arch_spin_lock
-001 |tcp_v4_rcv(skb = 0x8BD527A0) <-- sk = 0x8BE6B2A0
-002 |ip_local_deliver_finish(skb = 0x8BD527A0)
-003 |__netif_receive_skb_core(skb = 0x8BD527A0, ?)
-004 |netif_receive_skb(skb = 0x8BD527A0)
-005 |elk_poll(napi = 0x8C770500, budget = 64)
-006 |net_rx_action(?)
-007 |__do_softirq()
-008 |do_softirq()
-009 |local_bh_enable()
-010 |tcp_rcv_established(sk = 0x8BE6B2A0, skb = 0x87D3A9E0, th = 0x814EBE14, ?)
-011 |tcp_v4_do_rcv(sk = 0x8BE6B2A0, skb = 0x87D3A9E0)
-012 |tcp_delack_timer_handler(sk = 0x8BE6B2A0)
-013 |tcp_release_cb(sk = 0x8BE6B2A0)
-014 |release_sock(sk = 0x8BE6B2A0)
-015 |tcp_sendmsg(?, sk = 0x8BE6B2A0, ?, ?)
-016 |sock_sendmsg(sock = 0x8518C4C0, msg = 0x87D8DAA8, size = 4096)
-017 |kernel_sendmsg(?, ?, ?, ?, size = 4096)
-018 |smb_send_kvec()
-019 |smb_send_rqst(server = 0x87C4D400, rqst = 0x87D8DBA0)
-020 |cifs_call_async()
-021 |cifs_async_writev(wdata = 0x87FD6580)
-022 |cifs_writepages(mapping = 0x852096E4, wbc = 0x87D8DC88)
-023 |__writeback_single_inode(inode = 0x852095D0, wbc = 0x87D8DC88)
-024 |writeback_sb_inodes(sb = 0x87D6D800, wb = 0x87E4A9C0, work = 0x87D8DD88)
-025 |__writeback_inodes_wb(wb = 0x87E4A9C0, work = 0x87D8DD88)
-026 |wb_writeback(wb = 0x87E4A9C0, work = 0x87D8DD88)
-027 |wb_do_writeback(wb = 0x87E4A9C0, force_wait = 0)
-028 |bdi_writeback_workfn(work = 0x87E4A9CC)
-029 |process_one_work(worker = 0x8B045880, work = 0x87E4A9CC)
-030 |worker_thread(__worker = 0x8B045880)
-031 |kthread(_create = 0x87CADD90)
-032 |ret_from_kernel_thread(asm)
Bug occurs because __tcp_checksum_complete_user() enables BH, assuming
it is running from softirq context.
Lars' trace involved a NIC without RX checksum support but other points
are problematic as well, like the prequeue stuff.
Problem is triggered by a timer, that found socket being owned by user.
tcp_release_cb() should call tcp_write_timer_handler() or
tcp_delack_timer_handler() in the appropriate context :
BH disabled and socket lock held, but 'owned' field cleared,
as if they were running from timer handlers.
Fixes: 6f458dfb4092 ("tcp: improve latencies of timer triggered events")
Reported-by: Lars Persson <lars.persson@axis.com>
Tested-by: Lars Persson <lars.persson@axis.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-10 20:50:11 +04:00
sock_release_ownership ( sk ) ;
2006-07-03 11:25:35 +04:00
if ( waitqueue_active ( & sk - > sk_lock . wq ) )
wake_up ( & sk - > sk_lock . wq ) ;
spin_unlock_bh ( & sk - > sk_lock . slock ) ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( release_sock ) ;
net: introduce and use lock_sock_fast_nested()
Syzkaller reported a false positive deadlock involving
the nl socket lock and the subflow socket lock:
MPTCP: kernel_bind error, err=-98
============================================
WARNING: possible recursive locking detected
5.15.0-rc1-syzkaller #0 Not tainted
--------------------------------------------
syz-executor998/6520 is trying to acquire lock:
ffff8880795718a0 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x267/0x7b0 net/mptcp/protocol.c:2738
but task is already holding lock:
ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1612 [inline]
ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x23/0x7b0 net/mptcp/protocol.c:2720
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(k-sk_lock-AF_INET);
lock(k-sk_lock-AF_INET);
*** DEADLOCK ***
May be due to missing lock nesting notation
3 locks held by syz-executor998/6520:
#0: ffffffff8d176c50 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40 net/netlink/genetlink.c:802
#1: ffffffff8d176d08 (genl_mutex){+.+.}-{3:3}, at: genl_lock net/netlink/genetlink.c:33 [inline]
#1: ffffffff8d176d08 (genl_mutex){+.+.}-{3:3}, at: genl_rcv_msg+0x3e0/0x580 net/netlink/genetlink.c:790
#2: ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1612 [inline]
#2: ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x23/0x7b0 net/mptcp/protocol.c:2720
stack backtrace:
CPU: 1 PID: 6520 Comm: syz-executor998 Not tainted 5.15.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_deadlock_bug kernel/locking/lockdep.c:2944 [inline]
check_deadlock kernel/locking/lockdep.c:2987 [inline]
validate_chain kernel/locking/lockdep.c:3776 [inline]
__lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5015
lock_acquire kernel/locking/lockdep.c:5625 [inline]
lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5590
lock_sock_fast+0x36/0x100 net/core/sock.c:3229
mptcp_close+0x267/0x7b0 net/mptcp/protocol.c:2738
inet_release+0x12e/0x280 net/ipv4/af_inet.c:431
__sock_release net/socket.c:649 [inline]
sock_release+0x87/0x1b0 net/socket.c:677
mptcp_pm_nl_create_listen_socket+0x238/0x2c0 net/mptcp/pm_netlink.c:900
mptcp_nl_cmd_add_addr+0x359/0x930 net/mptcp/pm_netlink.c:1170
genl_family_rcv_msg_doit+0x228/0x320 net/netlink/genetlink.c:731
genl_family_rcv_msg net/netlink/genetlink.c:775 [inline]
genl_rcv_msg+0x328/0x580 net/netlink/genetlink.c:792
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504
genl_rcv+0x24/0x40 net/netlink/genetlink.c:803
netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline]
netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340
netlink_sendmsg+0x86d/0xdb0 net/netlink/af_netlink.c:1929
sock_sendmsg_nosec net/socket.c:704 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:724
sock_no_sendpage+0x101/0x150 net/core/sock.c:2980
kernel_sendpage.part.0+0x1a0/0x340 net/socket.c:3504
kernel_sendpage net/socket.c:3501 [inline]
sock_sendpage+0xe5/0x140 net/socket.c:1003
pipe_to_sendpage+0x2ad/0x380 fs/splice.c:364
splice_from_pipe_feed fs/splice.c:418 [inline]
__splice_from_pipe+0x43e/0x8a0 fs/splice.c:562
splice_from_pipe fs/splice.c:597 [inline]
generic_splice_sendpage+0xd4/0x140 fs/splice.c:746
do_splice_from fs/splice.c:767 [inline]
direct_splice_actor+0x110/0x180 fs/splice.c:936
splice_direct_to_actor+0x34b/0x8c0 fs/splice.c:891
do_splice_direct+0x1b3/0x280 fs/splice.c:979
do_sendfile+0xae9/0x1240 fs/read_write.c:1249
__do_sys_sendfile64 fs/read_write.c:1314 [inline]
__se_sys_sendfile64 fs/read_write.c:1300 [inline]
__x64_sys_sendfile64+0x1cc/0x210 fs/read_write.c:1300
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f215cb69969
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 14 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffc96bb3868 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
RAX: ffffffffffffffda RBX: 00007f215cbad072 RCX: 00007f215cb69969
RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005
RBP: 0000000000000000 R08: 00007ffc96bb3a08 R09: 00007ffc96bb3a08
R10: 0000000100000002 R11: 0000000000000246 R12: 00007ffc96bb387c
R13: 431bde82d7b634db R14: 0000000000000000 R15: 0000000000000000
The problem originates from an incorrect lock annotation in the mptcp
code and is only visible since commit 2dcb96bacce3 ("net: core: Correct
the sock::sk_lock.owned lockdep annotations"), but is present since
the port-based endpoint support initial implementation.
This patch addresses the issue introducing a nested variant of
lock_sock_fast() and using it in the relevant code path.
Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port")
Fixes: 2dcb96bacce3 ("net: core: Correct the sock::sk_lock.owned lockdep annotations")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Reported-and-tested-by: syzbot+1dd53f7a89b299d59eaf@syzkaller.appspotmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-29 12:59:17 +03:00
/*
 * __lock_sock_fast - lock a socket, taking a shortcut when nobody owns it.
 *
 * Takes sk->sk_lock.slock with bottom halves disabled, then:
 *
 *  - if sk->sk_lock.owned is clear, returns false immediately, still
 *    holding slock and with bottom halves still disabled;
 *  - otherwise calls __lock_sock(), sets sk->sk_lock.owned to 1, drops
 *    slock (which re-enables bottom halves) and returns true.
 *
 * The return value therefore tells the caller which of the two states it
 * has been left in, and so which kind of unlock has to follow.
 *
 * NOTE(review): __lock_sock() is defined outside this block; presumably it
 * waits until the current owner lets go and returns with slock held again.
 * Confirm against its definition.
 * NOTE(review): __acquire() looks like a static-analysis annotation that
 * pairs with __acquires() on the signature for the path that drops slock,
 * not a real lock operation. Confirm against the sparse documentation.
 */
bool __lock_sock_fast ( struct sock * sk ) __acquires ( & sk - > sk_lock . slock )
2010-05-26 23:20:18 +04:00
{
/* The contended path may block, so flag callers that cannot sleep. */
might_sleep ( ) ;
/* slock guards sk_lock.owned; it stays held across the fast-path return. */
spin_lock_bh ( & sk - > sk_lock . slock ) ;
net: core: Correct the sock::sk_lock.owned lockdep annotations
lock_sock_fast() and lock_sock_nested() contain lockdep annotations for the
sock::sk_lock.owned 'mutex'. sock::sk_lock.owned is not a regular mutex. It
is just lockdep wise equivalent. In fact it's an open coded trivial mutex
implementation with some interesting features.
sock::sk_lock.slock is a regular spinlock protecting the 'mutex'
representation sock::sk_lock.owned which is a plain boolean. If 'owned' is
true, then some other task holds the 'mutex', otherwise it is uncontended.
As this locking construct is obviously endangered by lock ordering issues as
any other locking primitive it got lockdep annotated via a dedicated
dependency map sock::sk_lock.dep_map which has to be updated at the lock
and unlock sites.
lock_sock_nested() is a straight forward 'mutex' lock operation:
might_sleep();
spin_lock_bh(sock::sk_lock.slock)
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
The lockdep annotation for sock::sk_lock.owned is for unknown reasons
_after_ the lock has been acquired, i.e. after the code block above and
after releasing sock::sk_lock.slock, but inside the bottom halves disabled
region:
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
The placement after the unlock is obvious because otherwise the
mutex_acquire() would nest into the spin lock held region.
But that's from the lockdep perspective still the wrong place:
1) The mutex_acquire() is issued _after_ the successful acquisition which
is pointless because in a dead lock scenario this point is never
reached which means that if the deadlock is the first instance of
exposing the wrong lock order lockdep does not have a chance to detect
it.
2) It only works because lockdep is rather lax on the context from which
the mutex_acquire() is issued. Acquiring a mutex inside a bottom halves
and therefore non-preemptible region is obviously invalid, except for a
trylock which is clearly not the case here.
This 'works' stops working on RT enabled kernels where the bottom halves
serialization is done via a local lock, which exposes this misplacement
because the 'mutex' and the local lock nest the wrong way around and
lockdep complains rightfully about a lock inversion.
The placement is wrong since the initial commit a5b5bb9a053a ("[PATCH]
lockdep: annotate sk_locks") which introduced this.
Fix it by moving the mutex_acquire() in front of the actual lock
acquisition, which is what the regular mutex_lock() operation does as well.
lock_sock_fast() is not that straight forward. It looks at the first glance
like a convoluted trylock operation:
spin_lock_bh(sock::sk_lock.slock)
if (!sock::sk_lock.owned)
return false;
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
return true;
But that's not the case: lock_sock_fast() is an interesting optimization
for short critical sections which can run with bottom halves disabled and
sock::sk_lock.slock held. This allows to shortcut the 'mutex' operation in
the non contended case by preventing other lockers to acquire
sock::sk_lock.owned because they are blocked on sock::sk_lock.slock, which
in turn avoids the overhead of doing the heavy processing in release_sock()
including waking up wait queue waiters.
In the contended case, i.e. when sock::sk_lock.owned == true the behavior
is the same as lock_sock_nested().
Semantically this shortcut means, that the task acquired the 'mutex' even
if it does not touch the sock::sk_lock.owned field in the non-contended
case. Not telling lockdep about this shortcut acquisition is hiding
potential lock ordering violations in the fast path.
As a consequence the same reasoning as for the above lock_sock_nested()
case vs. the placement of the lockdep annotation applies.
The current placement of the lockdep annotation was just copied from
the original lock_sock(), now renamed to lock_sock_nested(),
implementation.
Fix this by moving the mutex_acquire() in front of the actual lock
acquisition and adding the corresponding mutex_release() into
unlock_sock_fast(). Also document the fast path return case with a comment.
Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-18 15:42:35 +03:00
if ( ! sk - > sk_lock . owned ) {
2010-05-26 23:20:18 +04:00
/*
net: core: Correct the sock::sk_lock.owned lockdep annotations
lock_sock_fast() and lock_sock_nested() contain lockdep annotations for the
sock::sk_lock.owned 'mutex'. sock::sk_lock.owned is not a regular mutex. It
is just lockdep wise equivalent. In fact it's an open coded trivial mutex
implementation with some interesting features.
sock::sk_lock.slock is a regular spinlock protecting the 'mutex'
representation sock::sk_lock.owned which is a plain boolean. If 'owned' is
true, then some other task holds the 'mutex', otherwise it is uncontended.
As this locking construct is obviously endangered by lock ordering issues as
any other locking primitive it got lockdep annotated via a dedicated
dependency map sock::sk_lock.dep_map which has to be updated at the lock
and unlock sites.
lock_sock_nested() is a straight forward 'mutex' lock operation:
might_sleep();
spin_lock_bh(sock::sk_lock.slock)
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
The lockdep annotation for sock::sk_lock.owned is for unknown reasons
_after_ the lock has been acquired, i.e. after the code block above and
after releasing sock::sk_lock.slock, but inside the bottom halves disabled
region:
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
The placement after the unlock is obvious because otherwise the
mutex_acquire() would nest into the spin lock held region.
But that's from the lockdep perspective still the wrong place:
1) The mutex_acquire() is issued _after_ the successful acquisition which
is pointless because in a dead lock scenario this point is never
reached which means that if the deadlock is the first instance of
exposing the wrong lock order lockdep does not have a chance to detect
it.
2) It only works because lockdep is rather lax on the context from which
the mutex_acquire() is issued. Acquiring a mutex inside a bottom halves
and therefore non-preemptible region is obviously invalid, except for a
trylock which is clearly not the case here.
This 'works' stops working on RT enabled kernels where the bottom halves
serialization is done via a local lock, which exposes this misplacement
because the 'mutex' and the local lock nest the wrong way around and
lockdep complains rightfully about a lock inversion.
The placement is wrong since the initial commit a5b5bb9a053a ("[PATCH]
lockdep: annotate sk_locks") which introduced this.
Fix it by moving the mutex_acquire() in front of the actual lock
acquisition, which is what the regular mutex_lock() operation does as well.
lock_sock_fast() is not that straight forward. It looks at the first glance
like a convoluted trylock operation:
spin_lock_bh(sock::sk_lock.slock)
if (!sock::sk_lock.owned)
return false;
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
return true;
But that's not the case: lock_sock_fast() is an interesting optimization
for short critical sections which can run with bottom halves disabled and
sock::sk_lock.slock held. This allows to shortcut the 'mutex' operation in
the non contended case by preventing other lockers to acquire
sock::sk_lock.owned because they are blocked on sock::sk_lock.slock, which
in turn avoids the overhead of doing the heavy processing in release_sock()
including waking up wait queue waiters.
In the contended case, i.e. when sock::sk_lock.owned == true the behavior
is the same as lock_sock_nested().
Semantically this shortcut means, that the task acquired the 'mutex' even
if it does not touch the sock::sk_lock.owned field in the non-contended
case. Not telling lockdep about this shortcut acquisition is hiding
potential lock ordering violations in the fast path.
As a consequence the same reasoning as for the above lock_sock_nested()
case vs. the placement of the lockdep annotation applies.
The current placement of the lockdep annotation was just copied from
the original lock_sock(), now renamed to lock_sock_nested(),
implementation.
Fix this by moving the mutex_acquire() in front of the actual lock
acquisition and adding the corresponding mutex_release() into
unlock_sock_fast(). Also document the fast path return case with a comment.
Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-18 15:42:35 +03:00
* Fast path return with bottom halves disabled and
* sock : : sk_lock . slock held .
*
* The ' mutex ' is not contended and holding
* sock : : sk_lock . slock prevents all other lockers to
* proceed so the corresponding unlock_sock_fast ( ) can
* avoid the slow path of release_sock ( ) completely and
* just release slock .
*
* From a semantical POV this is equivalent to ' acquiring '
* the ' mutex ' , hence the corresponding lockdep
* mutex_release ( ) has to happen in the fast path of
* unlock_sock_fast ( ) .
2010-05-26 23:20:18 +04:00
*/
return false ;
net: core: Correct the sock::sk_lock.owned lockdep annotations
lock_sock_fast() and lock_sock_nested() contain lockdep annotations for the
sock::sk_lock.owned 'mutex'. sock::sk_lock.owned is not a regular mutex. It
is just lockdep wise equivalent. In fact it's an open coded trivial mutex
implementation with some interesting features.
sock::sk_lock.slock is a regular spinlock protecting the 'mutex'
representation sock::sk_lock.owned which is a plain boolean. If 'owned' is
true, then some other task holds the 'mutex', otherwise it is uncontended.
As this locking construct is obviously endangered by lock ordering issues as
any other locking primitive it got lockdep annotated via a dedicated
dependency map sock::sk_lock.dep_map which has to be updated at the lock
and unlock sites.
lock_sock_nested() is a straight forward 'mutex' lock operation:
might_sleep();
spin_lock_bh(sock::sk_lock.slock)
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
The lockdep annotation for sock::sk_lock.owned is for unknown reasons
_after_ the lock has been acquired, i.e. after the code block above and
after releasing sock::sk_lock.slock, but inside the bottom halves disabled
region:
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
The placement after the unlock is obvious because otherwise the
mutex_acquire() would nest into the spin lock held region.
But that's from the lockdep perspective still the wrong place:
1) The mutex_acquire() is issued _after_ the successful acquisition which
is pointless because in a dead lock scenario this point is never
reached which means that if the deadlock is the first instance of
exposing the wrong lock order lockdep does not have a chance to detect
it.
2) It only works because lockdep is rather lax on the context from which
the mutex_acquire() is issued. Acquiring a mutex inside a bottom halves
and therefore non-preemptible region is obviously invalid, except for a
trylock which is clearly not the case here.
This 'works' stops working on RT enabled kernels where the bottom halves
serialization is done via a local lock, which exposes this misplacement
because the 'mutex' and the local lock nest the wrong way around and
lockdep complains rightfully about a lock inversion.
The placement is wrong since the initial commit a5b5bb9a053a ("[PATCH]
lockdep: annotate sk_locks") which introduced this.
Fix it by moving the mutex_acquire() in front of the actual lock
acquisition, which is what the regular mutex_lock() operation does as well.
lock_sock_fast() is not that straight forward. It looks at the first glance
like a convoluted trylock operation:
spin_lock_bh(sock::sk_lock.slock)
if (!sock::sk_lock.owned)
return false;
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
return true;
But that's not the case: lock_sock_fast() is an interesting optimization
for short critical sections which can run with bottom halves disabled and
sock::sk_lock.slock held. This allows to shortcut the 'mutex' operation in
the non contended case by preventing other lockers to acquire
sock::sk_lock.owned because they are blocked on sock::sk_lock.slock, which
in turn avoids the overhead of doing the heavy processing in release_sock()
including waking up wait queue waiters.
In the contended case, i.e. when sock::sk_lock.owned == true the behavior
is the same as lock_sock_nested().
Semantically this shortcut means, that the task acquired the 'mutex' even
if it does not touch the sock::sk_lock.owned field in the non-contended
case. Not telling lockdep about this shortcut acquisition is hiding
potential lock ordering violations in the fast path.
As a consequence the same reasoning as for the above lock_sock_nested()
case vs. the placement of the lockdep annotation applies.
The current placement of the lockdep annotation was just copied from
the original lock_sock(), now renamed to lock_sock_nested(),
implementation.
Fix this by moving the mutex_acquire() in front of the actual lock
acquisition and adding the corresponding mutex_release() into
unlock_sock_fast(). Also document the fast path return case with a comment.
Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-18 15:42:35 +03:00
}
2010-05-26 23:20:18 +04:00
__lock_sock ( sk ) ;
sk - > sk_lock . owned = 1 ;
2020-11-17 21:43:49 +03:00
__acquire ( & sk - > sk_lock . slock ) ;
net: core: Correct the sock::sk_lock.owned lockdep annotations
lock_sock_fast() and lock_sock_nested() contain lockdep annotations for the
sock::sk_lock.owned 'mutex'. sock::sk_lock.owned is not a regular mutex. It
is just lockdep wise equivalent. In fact it's an open coded trivial mutex
implementation with some interesting features.
sock::sk_lock.slock is a regular spinlock protecting the 'mutex'
representation sock::sk_lock.owned which is a plain boolean. If 'owned' is
true, then some other task holds the 'mutex', otherwise it is uncontended.
As this locking construct is obviously endangered by lock ordering issues as
any other locking primitive it got lockdep annotated via a dedicated
dependency map sock::sk_lock.dep_map which has to be updated at the lock
and unlock sites.
lock_sock_nested() is a straight forward 'mutex' lock operation:
might_sleep();
spin_lock_bh(sock::sk_lock.slock)
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
The lockdep annotation for sock::sk_lock.owned is for unknown reasons
_after_ the lock has been acquired, i.e. after the code block above and
after releasing sock::sk_lock.slock, but inside the bottom halves disabled
region:
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
The placement after the unlock is obvious because otherwise the
mutex_acquire() would nest into the spin lock held region.
But that's from the lockdep perspective still the wrong place:
1) The mutex_acquire() is issued _after_ the successful acquisition which
is pointless because in a dead lock scenario this point is never
reached which means that if the deadlock is the first instance of
exposing the wrong lock order lockdep does not have a chance to detect
it.
2) It only works because lockdep is rather lax on the context from which
the mutex_acquire() is issued. Acquiring a mutex inside a bottom halves
and therefore non-preemptible region is obviously invalid, except for a
trylock which is clearly not the case here.
This 'works' stops working on RT enabled kernels where the bottom halves
serialization is done via a local lock, which exposes this misplacement
because the 'mutex' and the local lock nest the wrong way around and
lockdep complains rightfully about a lock inversion.
The placement is wrong since the initial commit a5b5bb9a053a ("[PATCH]
lockdep: annotate sk_locks") which introduced this.
Fix it by moving the mutex_acquire() in front of the actual lock
acquisition, which is what the regular mutex_lock() operation does as well.
lock_sock_fast() is not that straight forward. It looks at the first glance
like a convoluted trylock operation:
spin_lock_bh(sock::sk_lock.slock)
if (!sock::sk_lock.owned)
return false;
while (!try_lock(sock::sk_lock.owned)) {
spin_unlock_bh(sock::sk_lock.slock);
wait_for_release();
spin_lock_bh(sock::sk_lock.slock);
}
spin_unlock(sock::sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
return true;
But that's not the case: lock_sock_fast() is an interesting optimization
for short critical sections which can run with bottom halves disabled and
sock::sk_lock.slock held. This allows to shortcut the 'mutex' operation in
the non contended case by preventing other lockers to acquire
sock::sk_lock.owned because they are blocked on sock::sk_lock.slock, which
in turn avoids the overhead of doing the heavy processing in release_sock()
including waking up wait queue waiters.
In the contended case, i.e. when sock::sk_lock.owned == true the behavior
is the same as lock_sock_nested().
Semantically this shortcut means, that the task acquired the 'mutex' even
if it does not touch the sock::sk_lock.owned field in the non-contended
case. Not telling lockdep about this shortcut acquisition is hiding
potential lock ordering violations in the fast path.
As a consequence the same reasoning as for the above lock_sock_nested()
case vs. the placement of the lockdep annotation applies.
The current placement of the lockdep annotation was just copied from
the original lock_sock(), now renamed to lock_sock_nested(),
implementation.
Fix this by moving the mutex_acquire() in front of the actual lock
acquisition and adding the corresponding mutex_release() into
unlock_sock_fast(). Also document the fast path return case with a comment.
Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-18 15:42:35 +03:00
spin_unlock_bh ( & sk - > sk_lock . slock ) ;
2010-05-26 23:20:18 +04:00
return true ;
}
net: introduce and use lock_sock_fast_nested()
Syzkaller reported a false positive deadlock involving
the nl socket lock and the subflow socket lock:
MPTCP: kernel_bind error, err=-98
============================================
WARNING: possible recursive locking detected
5.15.0-rc1-syzkaller #0 Not tainted
--------------------------------------------
syz-executor998/6520 is trying to acquire lock:
ffff8880795718a0 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x267/0x7b0 net/mptcp/protocol.c:2738
but task is already holding lock:
ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1612 [inline]
ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x23/0x7b0 net/mptcp/protocol.c:2720
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(k-sk_lock-AF_INET);
lock(k-sk_lock-AF_INET);
*** DEADLOCK ***
May be due to missing lock nesting notation
3 locks held by syz-executor998/6520:
#0: ffffffff8d176c50 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40 net/netlink/genetlink.c:802
#1: ffffffff8d176d08 (genl_mutex){+.+.}-{3:3}, at: genl_lock net/netlink/genetlink.c:33 [inline]
#1: ffffffff8d176d08 (genl_mutex){+.+.}-{3:3}, at: genl_rcv_msg+0x3e0/0x580 net/netlink/genetlink.c:790
#2: ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1612 [inline]
#2: ffff8880787c8c60 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close+0x23/0x7b0 net/mptcp/protocol.c:2720
stack backtrace:
CPU: 1 PID: 6520 Comm: syz-executor998 Not tainted 5.15.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_deadlock_bug kernel/locking/lockdep.c:2944 [inline]
check_deadlock kernel/locking/lockdep.c:2987 [inline]
validate_chain kernel/locking/lockdep.c:3776 [inline]
__lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5015
lock_acquire kernel/locking/lockdep.c:5625 [inline]
lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5590
lock_sock_fast+0x36/0x100 net/core/sock.c:3229
mptcp_close+0x267/0x7b0 net/mptcp/protocol.c:2738
inet_release+0x12e/0x280 net/ipv4/af_inet.c:431
__sock_release net/socket.c:649 [inline]
sock_release+0x87/0x1b0 net/socket.c:677
mptcp_pm_nl_create_listen_socket+0x238/0x2c0 net/mptcp/pm_netlink.c:900
mptcp_nl_cmd_add_addr+0x359/0x930 net/mptcp/pm_netlink.c:1170
genl_family_rcv_msg_doit+0x228/0x320 net/netlink/genetlink.c:731
genl_family_rcv_msg net/netlink/genetlink.c:775 [inline]
genl_rcv_msg+0x328/0x580 net/netlink/genetlink.c:792
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504
genl_rcv+0x24/0x40 net/netlink/genetlink.c:803
netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline]
netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340
netlink_sendmsg+0x86d/0xdb0 net/netlink/af_netlink.c:1929
sock_sendmsg_nosec net/socket.c:704 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:724
sock_no_sendpage+0x101/0x150 net/core/sock.c:2980
kernel_sendpage.part.0+0x1a0/0x340 net/socket.c:3504
kernel_sendpage net/socket.c:3501 [inline]
sock_sendpage+0xe5/0x140 net/socket.c:1003
pipe_to_sendpage+0x2ad/0x380 fs/splice.c:364
splice_from_pipe_feed fs/splice.c:418 [inline]
__splice_from_pipe+0x43e/0x8a0 fs/splice.c:562
splice_from_pipe fs/splice.c:597 [inline]
generic_splice_sendpage+0xd4/0x140 fs/splice.c:746
do_splice_from fs/splice.c:767 [inline]
direct_splice_actor+0x110/0x180 fs/splice.c:936
splice_direct_to_actor+0x34b/0x8c0 fs/splice.c:891
do_splice_direct+0x1b3/0x280 fs/splice.c:979
do_sendfile+0xae9/0x1240 fs/read_write.c:1249
__do_sys_sendfile64 fs/read_write.c:1314 [inline]
__se_sys_sendfile64 fs/read_write.c:1300 [inline]
__x64_sys_sendfile64+0x1cc/0x210 fs/read_write.c:1300
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f215cb69969
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 14 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffc96bb3868 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
RAX: ffffffffffffffda RBX: 00007f215cbad072 RCX: 00007f215cb69969
RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005
RBP: 0000000000000000 R08: 00007ffc96bb3a08 R09: 00007ffc96bb3a08
R10: 0000000100000002 R11: 0000000000000246 R12: 00007ffc96bb387c
R13: 431bde82d7b634db R14: 0000000000000000 R15: 0000000000000000
the problem originates from incorrect lock annotation in the mptcp
code and is only visible since commit 2dcb96bacce3 ("net: core: Correct
the sock::sk_lock.owned lockdep annotations"), but is present since
the port-based endpoint support initial implementation.
This patch addresses the issue by introducing a nested variant of
lock_sock_fast() and using it in the relevant code path.
Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port")
Fixes: 2dcb96bacce3 ("net: core: Correct the sock::sk_lock.owned lockdep annotations")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Reported-and-tested-by: syzbot+1dd53f7a89b299d59eaf@syzkaller.appspotmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-29 12:59:17 +03:00
EXPORT_SYMBOL ( __lock_sock_fast ) ;
2010-05-26 23:20:18 +04:00
2019-04-17 23:51:48 +03:00
/**
 * sock_gettstamp - copy the socket's stored timestamp to user space
 * @sock: socket whose timestamp is queried
 * @userstamp: user buffer that receives the timestamp
 * @timeval: true to report microseconds instead of nanoseconds
 * @time32: true to use the legacy 32-bit layout (CONFIG_COMPAT_32BIT_TIME)
 *
 * Enables SOCK_TIMESTAMP on the socket as a side effect.  If the stored
 * timestamp has tv_sec == 0, the current real time is written back to the
 * socket and reported instead.
 *
 * Return: -ENOENT if the stored timestamp has tv_sec == -1, -EFAULT if the
 * sparc64 copy to user space fails, otherwise the result of the
 * put_old_timespec32()/put_timespec64() helper (0 on the sparc64 path).
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();

		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	/* Callers asking for a timeval get microseconds in the nsec slot. */
	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		/*
		 * tv lives on the kernel stack, so it must not carry the
		 * __user address-space annotation.  Clear it first: a
		 * designated initializer does not guarantee that padding
		 * bytes are zeroed, and the whole object is copied out.
		 */
		struct __kernel_old_timeval tv;

		memset(&tv, 0, sizeof(tv));
		tv.tv_sec = ts.tv_sec;
		tv.tv_usec = ts.tv_nsec;
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);
2007-03-19 03:33:16 +03:00
2019-10-03 23:56:37 +03:00
void sock_enable_timestamp ( struct sock * sk , enum sock_flags flag )
2007-02-09 17:24:36 +03:00
{
2009-02-12 08:03:38 +03:00
if ( ! sock_flag ( sk , flag ) ) {
2011-11-28 16:04:18 +04:00
unsigned long previous_flags = sk - > sk_flags ;
2009-02-12 08:03:38 +03:00
sock_set_flag ( sk , flag ) ;
/*
* we just set one of the two flags which require net
* time stamping , but time stamping might have been on
* already because of the other one
*/
2015-10-26 15:51:37 +03:00
if ( sock_needs_netstamp ( sk ) & &
! ( previous_flags & SK_FLAGS_TIMESTAMP ) )
2009-02-12 08:03:38 +03:00
net_enable_timestamp ( ) ;
2005-04-17 02:20:36 +04:00
}
}
2013-07-19 21:40:09 +04:00
int sock_recv_errqueue ( struct sock * sk , struct msghdr * msg , int len ,
int level , int type )
{
struct sock_exterr_skb * serr ;
2014-09-01 05:30:27 +04:00
struct sk_buff * skb ;
2013-07-19 21:40:09 +04:00
int copied , err ;
err = - EAGAIN ;
2014-09-01 05:30:27 +04:00
skb = sock_dequeue_err_skb ( sk ) ;
2013-07-19 21:40:09 +04:00
if ( skb = = NULL )
goto out ;
copied = skb - > len ;
if ( copied > len ) {
msg - > msg_flags | = MSG_TRUNC ;
copied = len ;
}
2014-11-06 00:46:40 +03:00
err = skb_copy_datagram_msg ( skb , 0 , msg , copied ) ;
2013-07-19 21:40:09 +04:00
if ( err )
goto out_free_skb ;
sock_recv_timestamp ( msg , sk , skb ) ;
serr = SKB_EXT_ERR ( skb ) ;
put_cmsg ( msg , level , type , sizeof ( serr - > ee ) , & serr - > ee ) ;
msg - > msg_flags | = MSG_ERRQUEUE ;
err = copied ;
out_free_skb :
kfree_skb ( skb ) ;
out :
return err ;
}
EXPORT_SYMBOL ( sock_recv_errqueue ) ;
2005-04-17 02:20:36 +04:00
/*
 *	Read a socket option by forwarding to the protocol's getsockopt hook.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what is the point
 *	of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *owner = sock->sk;

	return owner->sk_prot->getsockopt(owner, level, optname,
					  optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
2015-03-02 10:37:48 +03:00
/*
 * Receive a message through the protocol's recvmsg hook.  MSG_DONTWAIT is
 * split out of @flags and passed as its own argument; on success the address
 * length reported by the protocol is stored in msg->msg_namelen.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int nonblock = flags & MSG_DONTWAIT;
	int other_flags = flags & ~MSG_DONTWAIT;
	int namelen = 0;
	int ret;

	ret = sk->sk_prot->recvmsg(sk, msg, size, nonblock, other_flags,
				   &namelen);
	if (ret < 0)
		return ret;

	msg->msg_namelen = namelen;
	return ret;
}
EXPORT_SYMBOL(sock_common_recvmsg);
/*
* Set socket options on an inet socket .
*/
int sock_common_setsockopt ( struct socket * sock , int level , int optname ,
2020-07-23 09:09:07 +03:00
sockptr_t optval , unsigned int optlen )
2005-04-17 02:20:36 +04:00
{
struct sock * sk = sock - > sk ;
return sk - > sk_prot - > setsockopt ( sk , level , optname , optval , optlen ) ;
}
EXPORT_SYMBOL ( sock_common_setsockopt ) ;
void sk_common_release ( struct sock * sk )
{
if ( sk - > sk_prot - > destroy )
sk - > sk_prot - > destroy ( sk ) ;
/*
2020-08-27 14:27:49 +03:00
* Observation : when sk_common_release is called , processes have
2005-04-17 02:20:36 +04:00
* no access to socket . But net still has .
* Step one , detach it from networking :
*
* A . Remove from hash tables .
*/
sk - > sk_prot - > unhash ( sk ) ;
/*
* In this point socket cannot receive new packets , but it is possible
* that some packets are in flight because some CPU runs receiver and
* did hash table lookup before we unhashed socket . They will achieve
* receive queue and will be purged by socket destructor .
*
* Also we still have packets pending on receive queue and probably ,
* our own packets waiting in device queues . sock_destroy will drain
* receive queue , but transmitted packets will delay socket destruction
* until the last reference will be released .
*/
sock_orphan ( sk ) ;
xfrm_sk_free_policy ( sk ) ;
2005-08-10 06:45:38 +04:00
sk_refcnt_debug_release ( sk ) ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
2005-04-17 02:20:36 +04:00
sock_put ( sk ) ;
}
EXPORT_SYMBOL ( sk_common_release ) ;
2017-03-20 22:22:03 +03:00
/*
 * Fill @mem, an array of SK_MEMINFO_VARS u32 slots, with a snapshot of the
 * socket's memory accounting counters.  The array is zeroed first.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	/* Receive side. */
	mem[SK_MEMINFO_RMEM_ALLOC]  = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF]      = READ_ONCE(sk->sk_rcvbuf);

	/* Send side. */
	mem[SK_MEMINFO_WMEM_ALLOC]  = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF]      = READ_ONCE(sk->sk_sndbuf);

	/* Remaining counters. */
	mem[SK_MEMINFO_FWD_ALLOC]   = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM]      = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG]     = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS]       = atomic_read(&sk->sk_drops);
}
2008-03-29 02:38:17 +03:00
# ifdef CONFIG_PROC_FS
/* Bitmap of in-use counter slots; assign_proto_idx() sets a bit, release_proto_idx() clears it. */
static DECLARE_BITMAP ( proto_inuse_idx , PROTO_INUSE_NR ) ;
2008-04-01 06:42:16 +04:00
int sock_prot_inuse_get ( struct net * net , struct proto * prot )
{
int cpu , idx = prot - > inuse_idx ;
int res = 0 ;
for_each_possible_cpu ( cpu )
2017-12-14 16:51:57 +03:00
res + = per_cpu_ptr ( net - > core . prot_inuse , cpu ) - > val [ idx ] ;
2008-04-01 06:42:16 +04:00
return res > = 0 ? res : 0 ;
}
EXPORT_SYMBOL_GPL ( sock_prot_inuse_get ) ;
2017-12-14 16:51:58 +03:00
int sock_inuse_get ( struct net * net )
{
int cpu , res = 0 ;
for_each_possible_cpu ( cpu )
2021-11-15 20:11:49 +03:00
res + = per_cpu_ptr ( net - > core . prot_inuse , cpu ) - > all ;
2017-12-14 16:51:58 +03:00
return res ;
}
EXPORT_SYMBOL_GPL ( sock_inuse_get ) ;
2010-01-17 06:35:32 +03:00
/* Per-netns init: allocate the per-CPU prot_inuse counters; -ENOMEM on failure. */
static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);

	return net->core.prot_inuse ? 0 : -ENOMEM;
}
2010-01-17 06:35:32 +03:00
/* Per-netns exit: free the per-CPU counters allocated by sock_inuse_init_net(). */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}
/* Per-netns hooks that allocate and free the prot_inuse per-CPU counters. */
static struct pernet_operations net_inuse_ops = {
. init = sock_inuse_init_net ,
. exit = sock_inuse_exit_net ,
} ;
/*
 * Register the per-netns in-use counter hooks at core_initcall time.
 * Failure to register is fatal: the kernel panics.
 */
static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
2008-03-29 02:38:17 +03:00
2019-08-23 04:14:11 +03:00
static int assign_proto_idx ( struct proto * prot )
2008-03-29 02:38:17 +03:00
{
prot - > inuse_idx = find_first_zero_bit ( proto_inuse_idx , PROTO_INUSE_NR ) ;
if ( unlikely ( prot - > inuse_idx = = PROTO_INUSE_NR - 1 ) ) {
2012-05-16 23:58:40 +04:00
pr_err ( " PROTO_INUSE_NR exhausted \n " ) ;
2019-08-23 04:14:11 +03:00
return - ENOSPC ;
2008-03-29 02:38:17 +03:00
}
set_bit ( prot - > inuse_idx , proto_inuse_idx ) ;
2019-08-23 04:14:11 +03:00
return 0 ;
2008-03-29 02:38:17 +03:00
}
static void release_proto_idx ( struct proto * prot )
{
if ( prot - > inuse_idx ! = PROTO_INUSE_NR - 1 )
clear_bit ( prot - > inuse_idx , proto_inuse_idx ) ;
}
# else
2019-08-23 04:14:11 +03:00
/* !CONFIG_PROC_FS stub: no in-use slot is assigned; always reports success. */
static inline int assign_proto_idx ( struct proto * prot )
2008-03-29 02:38:17 +03:00
{
2019-08-23 04:14:11 +03:00
return 0 ;
2008-03-29 02:38:17 +03:00
}
/* !CONFIG_PROC_FS stub: nothing was assigned, so nothing to release. */
static inline void release_proto_idx ( struct proto * prot )
{
}
2017-12-14 16:51:58 +03:00
2008-03-29 02:38:17 +03:00
# endif
2020-08-10 15:16:58 +03:00
static void tw_prot_cleanup ( struct timewait_sock_ops * twsk_prot )
{
if ( ! twsk_prot )
return ;
kfree ( twsk_prot - > twsk_slab_name ) ;
twsk_prot - > twsk_slab_name = NULL ;
kmem_cache_destroy ( twsk_prot - > twsk_slab ) ;
twsk_prot - > twsk_slab = NULL ;
}
2021-03-11 05:57:36 +03:00
static int tw_prot_init ( const struct proto * prot )
{
struct timewait_sock_ops * twsk_prot = prot - > twsk_prot ;
if ( ! twsk_prot )
return 0 ;
twsk_prot - > twsk_slab_name = kasprintf ( GFP_KERNEL , " tw_sock_%s " ,
prot - > name ) ;
if ( ! twsk_prot - > twsk_slab_name )
return - ENOMEM ;
twsk_prot - > twsk_slab =
kmem_cache_create ( twsk_prot - > twsk_slab_name ,
twsk_prot - > twsk_obj_size , 0 ,
SLAB_ACCOUNT | prot - > slab_flags ,
NULL ) ;
if ( ! twsk_prot - > twsk_slab ) {
pr_crit ( " %s: Can't create timewait sock SLAB cache! \n " ,
prot - > name ) ;
return - ENOMEM ;
}
return 0 ;
}
2015-03-13 02:44:07 +03:00
static void req_prot_cleanup ( struct request_sock_ops * rsk_prot )
{
if ( ! rsk_prot )
return ;
kfree ( rsk_prot - > slab_name ) ;
rsk_prot - > slab_name = NULL ;
2015-09-13 15:15:18 +03:00
kmem_cache_destroy ( rsk_prot - > slab ) ;
rsk_prot - > slab = NULL ;
2015-03-13 02:44:07 +03:00
}
static int req_prot_init ( const struct proto * prot )
{
struct request_sock_ops * rsk_prot = prot - > rsk_prot ;
if ( ! rsk_prot )
return 0 ;
rsk_prot - > slab_name = kasprintf ( GFP_KERNEL , " request_sock_%s " ,
prot - > name ) ;
if ( ! rsk_prot - > slab_name )
return - ENOMEM ;
rsk_prot - > slab = kmem_cache_create ( rsk_prot - > slab_name ,
rsk_prot - > obj_size , 0 ,
2018-06-28 01:16:42 +03:00
SLAB_ACCOUNT | prot - > slab_flags ,
NULL ) ;
2015-03-13 02:44:07 +03:00
if ( ! rsk_prot - > slab ) {
pr_crit ( " %s: Can't create request sock SLAB cache! \n " ,
prot - > name ) ;
return - ENOMEM ;
}
return 0 ;
}
2007-11-07 13:23:38 +03:00
int proto_register ( struct proto * prot , int alloc_slab )
{
2019-08-23 04:14:11 +03:00
int ret = - ENOBUFS ;
2005-04-17 02:20:36 +04:00
if ( alloc_slab ) {
2017-06-11 05:50:42 +03:00
prot - > slab = kmem_cache_create_usercopy ( prot - > name ,
prot - > obj_size , 0 ,
2018-06-28 01:16:42 +03:00
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
prot - > slab_flags ,
net: Restrict unwhitelisted proto caches to size 0
Now that protocols have been annotated (the copy of icsk_ca_ops->name
is of an ops field from outside the slab cache):
$ git grep 'copy_.*_user.*sk.*->'
caif/caif_socket.c: copy_from_user(&cf_sk->conn_req.param.data, ov, ol)) {
ipv4/raw.c: if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
ipv4/raw.c: copy_to_user(optval, &raw_sk(sk)->filter, len))
ipv4/tcp.c: if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
ipv4/tcp.c: if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
ipv6/raw.c: if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen))
ipv6/raw.c: if (copy_to_user(optval, &raw6_sk(sk)->filter, len))
sctp/socket.c: if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen))
sctp/socket.c: if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len))
sctp/socket.c: if (copy_to_user(optval, &sctp_sk(sk)->initmsg, len))
we can switch the default proto usercopy region to size 0. Any protocols
needing to add whitelisted regions must annotate the fields with the
useroffset and usersize fields of struct proto.
This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
whitelisting code in the last public patch of grsecurity/PaX based on my
understanding of the code. Changes or omissions from the original code are
mine and don't reflect the original grsecurity/PaX code.
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-08-25 02:59:38 +03:00
prot - > useroffset , prot - > usersize ,
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the begining. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we dont want to send same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
NULL ) ;
2005-04-17 02:20:36 +04:00
if ( prot - > slab = = NULL ) {
2012-05-16 23:58:40 +04:00
pr_crit ( " %s: Can't create sock SLAB cache! \n " ,
prot - > name ) ;
2008-03-29 02:39:10 +03:00
goto out ;
2005-04-17 02:20:36 +04:00
}
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
2015-03-13 02:44:07 +03:00
if ( req_prot_init ( prot ) )
goto out_free_request_sock_slab ;
2005-08-10 07:09:30 +04:00
2021-03-11 05:57:36 +03:00
if ( tw_prot_init ( prot ) )
goto out_free_timewait_sock_slab ;
2005-04-17 02:20:36 +04:00
}
2011-12-16 04:51:59 +04:00
mutex_lock ( & proto_list_mutex ) ;
2019-08-23 04:14:11 +03:00
ret = assign_proto_idx ( prot ) ;
if ( ret ) {
mutex_unlock ( & proto_list_mutex ) ;
2020-08-10 15:16:58 +03:00
goto out_free_timewait_sock_slab ;
2019-08-23 04:14:11 +03:00
}
2005-04-17 02:20:36 +04:00
list_add ( & prot - > node , & proto_list ) ;
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2019-08-23 04:14:11 +03:00
return ret ;
2007-11-07 13:23:38 +03:00
2020-08-10 15:16:58 +03:00
out_free_timewait_sock_slab :
2021-04-22 16:41:51 +03:00
if ( alloc_slab )
2020-08-10 15:16:58 +03:00
tw_prot_cleanup ( prot - > twsk_prot ) ;
2005-08-10 07:09:30 +04:00
out_free_request_sock_slab :
2019-08-23 04:14:11 +03:00
if ( alloc_slab ) {
req_prot_cleanup ( prot - > rsk_prot ) ;
2015-03-13 02:44:07 +03:00
2019-08-23 04:14:11 +03:00
kmem_cache_destroy ( prot - > slab ) ;
prot - > slab = NULL ;
}
2007-11-07 13:23:38 +03:00
out :
2019-08-23 04:14:11 +03:00
return ret ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( proto_register ) ;
void proto_unregister ( struct proto * prot )
{
2011-12-16 04:51:59 +04:00
mutex_lock ( & proto_list_mutex ) ;
2008-03-29 02:38:17 +03:00
release_proto_idx ( prot ) ;
2005-09-07 06:47:50 +04:00
list_del ( & prot - > node ) ;
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2005-04-17 02:20:36 +04:00
2015-09-13 15:15:18 +03:00
kmem_cache_destroy ( prot - > slab ) ;
prot - > slab = NULL ;
2005-04-17 02:20:36 +04:00
2015-03-13 02:44:07 +03:00
req_prot_cleanup ( prot - > rsk_prot ) ;
2020-08-10 15:16:58 +03:00
tw_prot_cleanup ( prot - > twsk_prot ) ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( proto_unregister ) ;
sock_diag: request _diag module only when the family or proto has been registered
Now when using 'ss' in iproute, kernel would try to load all _diag
modules, which also causes corresponding family and proto modules
to be loaded as well due to module dependencies.
Like after running 'ss', sctp, dccp, af_packet (if it works as a module)
would be loaded.
For example:
$ lsmod|grep sctp
$ ss
$ lsmod|grep sctp
sctp_diag 16384 0
sctp 323584 5 sctp_diag
inet_diag 24576 4 raw_diag,tcp_diag,sctp_diag,udp_diag
libcrc32c 16384 3 nf_conntrack,nf_nat,sctp
As these family and proto modules are loaded unintentionally, it
could cause some problems, like:
- Some debug tools use 'ss' to collect the socket info, which loads all
those diag and family and protocol modules. It's noisy for identifying
issues.
- Users usually expect to drop sctp init packet silently when they
have no sense of sctp protocol instead of sending abort back.
- It wastes resources (especially with multiple netns), and SCTP module
can't be unloaded once it's loaded.
...
In short, it's really inappropriate to have these family and proto
modules loaded unexpectedly when just doing debugging with inet_diag.
This patch is to introduce sock_load_diag_module() where it loads
the _diag module only when its corresponding family or proto has
been already registered.
Note that we can't just load _diag module without the family or
proto loaded, as some symbols used in _diag module are from the
family or proto module.
v1->v2:
- move inet proto check to inet_diag to avoid a compiling err.
v2->v3:
- define sock_load_diag_module in sock.c and export one symbol
only.
- improve the changelog.
Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Phil Sutter <phil@nwl.cc>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-10 13:57:50 +03:00
/**
 * sock_load_diag_module - request the sock_diag handler module
 * @family: address family of the diag request
 * @protocol: protocol of the diag request, or 0 for the family handler
 *
 * The module is requested only when the family (for @protocol == 0) or, for
 * AF_INET with CONFIG_INET, the protocol is already registered, so that a
 * diag query does not pull in family or protocol modules as a side effect.
 *
 * Return: -ENOENT if the family or protocol is not registered, otherwise
 * the result of request_module().
 */
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	/*
	 * IPPROTO_RAW and protocol numbers beyond the inet_protos table are
	 * exempt from the registration check.
	 */
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *pos, counting from the list head itself.  proto_seq_stop() drops the
 * mutex.
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);

	return seq_list_start_head(&proto_list, *pos);
}
/* seq_file ->next: advance to the entry after @v on proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *following = seq_list_next(v, &proto_list, pos);

	return following;
}
static void proto_seq_stop ( struct seq_file * seq , void * v )
2011-12-16 04:51:59 +04:00
__releases ( proto_list_mutex )
2005-04-17 02:20:36 +04:00
{
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * Map a method pointer to a one-character flag: 'y' when the protocol
 * provides the method, 'n' when the pointer is NULL.
 */
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
2011-12-12 01:47:02 +04:00
static long sock_prot_memory_allocated ( struct proto * proto )
{
2012-04-25 17:47:29 +04:00
return proto - > memory_allocated ! = NULL ? proto_memory_allocated ( proto ) : - 1L ;
2011-12-12 01:47:02 +04:00
}
2019-10-04 00:44:40 +03:00
static const char * sock_prot_memory_pressure ( struct proto * proto )
2011-12-12 01:47:02 +04:00
{
return proto - > memory_pressure ! = NULL ?
proto_memory_pressure ( proto ) ? " yes " : " no " : " NI " ;
}
2005-04-17 02:20:36 +04:00
/*
 * Emit one row for @proto: name, object size, in-use socket count for the
 * seq_file's netns, memory allocated, memory pressure, max header, whether a
 * slab cache exists, owning module, then one flag per protocol method as
 * produced by proto_method_implemented().
 */
static void proto_seq_printf ( struct seq_file * seq , struct proto * proto )
{
2011-12-12 01:47:02 +04:00
2010-11-10 02:24:26 +03:00
seq_printf ( seq , " %-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2005-04-17 02:20:36 +04:00
" %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c \n " ,
proto - > name ,
proto - > obj_size ,
2008-11-20 02:14:01 +03:00
sock_prot_inuse_get ( seq_file_net ( seq ) , proto ) ,
2011-12-12 01:47:02 +04:00
sock_prot_memory_allocated ( proto ) ,
sock_prot_memory_pressure ( proto ) ,
2005-04-17 02:20:36 +04:00
proto - > max_header ,
proto - > slab = = NULL ? " no " : " yes " ,
module_name ( proto - > owner ) ,
/* Method flags, in the order of the two-letter column headers printed by proto_seq_show(). */
proto_method_implemented ( proto - > close ) ,
proto_method_implemented ( proto - > connect ) ,
proto_method_implemented ( proto - > disconnect ) ,
proto_method_implemented ( proto - > accept ) ,
proto_method_implemented ( proto - > ioctl ) ,
proto_method_implemented ( proto - > init ) ,
proto_method_implemented ( proto - > destroy ) ,
proto_method_implemented ( proto - > shutdown ) ,
proto_method_implemented ( proto - > setsockopt ) ,
proto_method_implemented ( proto - > getsockopt ) ,
proto_method_implemented ( proto - > sendmsg ) ,
proto_method_implemented ( proto - > recvmsg ) ,
proto_method_implemented ( proto - > sendpage ) ,
proto_method_implemented ( proto - > bind ) ,
proto_method_implemented ( proto - > backlog_rcv ) ,
proto_method_implemented ( proto - > hash ) ,
proto_method_implemented ( proto - > unhash ) ,
proto_method_implemented ( proto - > get_port ) ,
proto_method_implemented ( proto - > enter_memory_pressure ) ) ;
}
static int proto_seq_show ( struct seq_file * seq , void * v )
{
2007-07-10 00:15:14 +04:00
if ( v = = & proto_list )
2005-04-17 02:20:36 +04:00
seq_printf ( seq , " %-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s " ,
" protocol " ,
" size " ,
" sockets " ,
" memory " ,
" press " ,
" maxhdr " ,
" slab " ,
" module " ,
" cl co di ac io in de sh ss gs se re sp bi br ha uh gp em \n " ) ;
else
2007-07-10 00:15:14 +04:00
proto_seq_printf ( seq , list_entry ( v , struct proto , node ) ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2007-03-13 00:34:29 +03:00
static const struct seq_operations proto_seq_ops = {
2005-04-17 02:20:36 +04:00
. start = proto_seq_start ,
. next = proto_seq_next ,
. stop = proto_seq_stop ,
. show = proto_seq_show ,
} ;
2008-11-20 02:14:01 +03:00
static __net_init int proto_init_net ( struct net * net )
{
2018-04-10 20:42:55 +03:00
if ( ! proc_create_net ( " protocols " , 0444 , net - > proc_net , & proto_seq_ops ,
sizeof ( struct seq_net_private ) ) )
2008-11-20 02:14:01 +03:00
return - ENOMEM ;
return 0 ;
}
static __net_exit void proto_exit_net ( struct net * net )
{
2013-02-18 05:34:56 +04:00
remove_proc_entry ( " protocols " , net - > proc_net ) ;
2008-11-20 02:14:01 +03:00
}
/*
 * Per-network-namespace hooks that create and remove the protocols proc
 * entry; registered from proto_init().
 */
static __net_initdata struct pernet_operations proto_net_ops = {
	.exit	= proto_exit_net,
	.init	= proto_init_net,
};
/*
 * Register proto_net_ops as a pernet subsystem and hand back whatever
 * register_pernet_subsys() returns.  Hooked in via subsys_initcall().
 */
static int __init proto_init(void)
{
	int err = register_pernet_subsys(&proto_net_ops);

	return err;
}

subsys_initcall(proto_init);
# endif /* PROC_FS */
2017-03-24 20:08:24 +03:00
# ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end ( void * p , unsigned long start_time )
{
struct sock * sk = p ;
2019-10-24 08:44:51 +03:00
return ! skb_queue_empty_lockless ( & sk - > sk_receive_queue ) | |
2017-03-24 20:08:24 +03:00
sk_busy_loop_timeout ( sk , start_time ) ;
}
EXPORT_SYMBOL ( sk_busy_loop_end ) ;
# endif /* CONFIG_NET_RX_BUSY_POLL */
2020-05-29 15:09:42 +03:00
int sock_bind_add ( struct sock * sk , struct sockaddr * addr , int addr_len )
{
if ( ! sk - > sk_prot - > bind_add )
return - EOPNOTSUPP ;
return sk - > sk_prot - > bind_add ( sk , addr , addr_len ) ;
}
EXPORT_SYMBOL ( sock_bind_add ) ;