2005-04-17 02:20:36 +04:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Generic socket support routines . Memory allocators , socket lock / release
* handler for protocols to use and generic option handler .
*
*
2005-05-06 03:16:16 +04:00
* Authors : Ross Biro
2005-04-17 02:20:36 +04:00
* Fred N . van Kempen , < waltje @ uWalt . NL . Mugnet . ORG >
* Florian La Roche , < flla @ stud . uni - sb . de >
* Alan Cox , < A . Cox @ swansea . ac . uk >
*
* Fixes :
* Alan Cox : Numerous verify_area ( ) problems
* Alan Cox : Connecting on a connecting socket
* now returns an error for tcp .
* Alan Cox : sock - > protocol is set correctly .
* and is not sometimes left as 0.
* Alan Cox : connect handles icmp errors on a
* connect properly . Unfortunately there
* is a restart syscall nasty there . I
* can ' t match BSD without hacking the C
* library . Ideas urgently sought !
* Alan Cox : Disallow bind ( ) to addresses that are
* not ours - especially broadcast ones ! !
* Alan Cox : Socket 1024 _IS_ ok for users . ( fencepost )
* Alan Cox : sock_wfree / sock_rfree don ' t destroy sockets ,
* instead they leave that for the DESTROY timer .
* Alan Cox : Clean up error flag in accept
* Alan Cox : TCP ack handling is buggy , the DESTROY timer
* was buggy . Put a remove_sock ( ) in the handler
* for memory when we hit 0. Also altered the timer
2007-02-09 17:24:36 +03:00
* code . The ACK stuff can wait and needs major
2005-04-17 02:20:36 +04:00
* TCP layer surgery .
* Alan Cox : Fixed TCP ack bug , removed remove sock
* and fixed timer / inet_bh race .
* Alan Cox : Added zapped flag for TCP
* Alan Cox : Move kfree_skb into skbuff . c and tidied up surplus code
* Alan Cox : for new sk_buff allocations wmalloc / rmalloc now call alloc_skb
* Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
* Alan Cox : Supports socket option broadcast now as does udp . Packet and raw need fixing .
* Alan Cox : Added RCVBUF , SNDBUF size setting . It suddenly occurred to me how easy it was so . . .
* Rick Sladkey : Relaxed UDP rules for matching packets .
* C . E . Hawkins : IFF_PROMISC / SIOCGHWADDR support
* Pauline Middelink : identd support
* Alan Cox : Fixed connect ( ) taking signals I think .
* Alan Cox : SO_LINGER supported
* Alan Cox : Error reporting fixes
* Anonymous : inet_create tidied up ( sk - > reuse setting )
* Alan Cox : inet sockets don ' t set sk - > type !
* Alan Cox : Split socket option code
* Alan Cox : Callbacks
* Alan Cox : Nagle flag for Charles & Johannes stuff
* Alex : Removed restriction on inet fioctl
* Alan Cox : Splitting INET from NET core
* Alan Cox : Fixed bogus SO_TYPE handling in getsockopt ( )
* Adam Caldwell : Missing return in SO_DONTROUTE / SO_DEBUG code
* Alan Cox : Split IP from generic code
* Alan Cox : New kfree_skbmem ( )
* Alan Cox : Make SO_DEBUG superuser only .
* Alan Cox : Allow anyone to clear SO_DEBUG
* ( compatibility fix )
* Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput .
* Alan Cox : Allocator for a socket is settable .
* Alan Cox : SO_ERROR includes soft errors .
* Alan Cox : Allow NULL arguments on some SO_ opts
* Alan Cox : Generic socket allocation to make hooks
* easier ( suggested by Craig Metz ) .
* Michael Pall : SO_ERROR returns positive errno again
* Steve Whitehouse : Added default destructor to free
* protocol private data .
* Steve Whitehouse : Added various other default routines
* common to several socket families .
* Chris Evans : Call suser ( ) check last on F_SETOWN
* Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER .
* Andi Kleen : Add sock_kmalloc ( ) / sock_kfree_s ( )
* Andi Kleen : Fix write_space callback
* Chris Evans : Security fixes - signedness again
* Arnaldo C . Melo : cleanups , use skb_queue_purge
*
* To Fix :
*
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
2012-05-16 23:58:40 +04:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2006-01-11 23:17:47 +03:00
# include <linux/capability.h>
2005-04-17 02:20:36 +04:00
# include <linux/errno.h>
2013-07-19 21:40:09 +04:00
# include <linux/errqueue.h>
2005-04-17 02:20:36 +04:00
# include <linux/types.h>
# include <linux/socket.h>
# include <linux/in.h>
# include <linux/kernel.h>
# include <linux/module.h>
# include <linux/proc_fs.h>
# include <linux/seq_file.h>
# include <linux/sched.h>
# include <linux/timer.h>
# include <linux/string.h>
# include <linux/sockios.h>
# include <linux/net.h>
# include <linux/mm.h>
# include <linux/slab.h>
# include <linux/interrupt.h>
# include <linux/poll.h>
# include <linux/tcp.h>
# include <linux/init.h>
2006-10-20 00:08:53 +04:00
# include <linux/highmem.h>
2010-06-13 07:28:59 +04:00
# include <linux/user_namespace.h>
2012-02-24 11:31:31 +04:00
# include <linux/static_key.h>
2012-01-10 01:44:23 +04:00
# include <linux/memcontrol.h>
2012-05-03 10:25:55 +04:00
# include <linux/prefetch.h>
2005-04-17 02:20:36 +04:00
# include <asm/uaccess.h>
# include <linux/netdevice.h>
# include <net/protocol.h>
# include <linux/skbuff.h>
2007-09-12 14:01:34 +04:00
# include <net/net_namespace.h>
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basically tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
# include <net/request_sock.h>
2005-04-17 02:20:36 +04:00
# include <net/sock.h>
2009-02-12 08:03:38 +03:00
# include <linux/net_tstamp.h>
2005-04-17 02:20:36 +04:00
# include <net/xfrm.h>
# include <linux/ipsec.h>
cls_cgroup: Store classid in struct sock
Up until now cls_cgroup has relied on fetching the classid out of
the current executing thread. This runs into trouble when a packet
processing is delayed in which case it may execute out of another
thread's context.
Furthermore, even when a packet is not delayed we may fail to
classify it if soft IRQs have been disabled, because this scenario
is indistinguishable from one where a packet unrelated to the
current thread is processed by a real soft IRQ.
In fact, the current semantics is inherently broken, as a single
skb may be constructed out of the writes of two different tasks.
A different manifestation of this problem is when the TCP stack
transmits in response of an incoming ACK. This is currently
unclassified.
As we already have a concept of packet ownership for accounting
purposes in the skb->sk pointer, this is a natural place to store
the classid in a persistent manner.
This patch adds the cls_cgroup classid in struct sock, filling up
an existing hole on 64-bit :)
The value is set at socket creation time. So all sockets created
via socket(2) automatically gains the ID of the thread creating it.
Whenever another process touches the socket by either reading or
writing to it, we will change the socket classid to that of the
process if it has a valid (non-zero) classid.
For sockets created on inbound connections through accept(2), we
inherit the classid of the original listening socket through
sk_clone, possibly preceding the actual accept(2) call.
In order to minimise risks, I have not made this the authoritative
classid. For now it is only used as a backup when we execute
with soft IRQs disabled. Once we're completely happy with its
semantics we can use it as the sole classid.
Footnote: I have rearranged the error path on cls_group module
creation. If we didn't do this, then there is a window where
someone could create a tc rule using cls_group before the cgroup
subsystem has been registered.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-24 11:12:34 +04:00
# include <net/cls_cgroup.h>
2011-11-22 09:10:51 +04:00
# include <net/netprio_cgroup.h>
2005-04-17 02:20:36 +04:00
# include <linux/filter.h>
2011-06-17 16:00:03 +04:00
# include <trace/events/sock.h>
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_INET
# include <net/tcp.h>
# endif
2013-07-10 18:13:17 +04:00
# include <net/busy_poll.h>
2013-06-10 12:39:50 +04:00
2011-12-16 04:51:59 +04:00
static DEFINE_MUTEX ( proto_list_mutex ) ;
2011-12-12 01:47:04 +04:00
static LIST_HEAD ( proto_list ) ;
2014-04-24 01:26:56 +04:00
/**
* sk_ns_capable - General socket capability test
* @ sk : Socket to use a capability on or through
* @ user_ns : The user namespace of the capability to use
* @ cap : The capability to use
*
* Test to see if the opener of the socket had when the socket was
* created and the current process has the capability @ cap in the user
* namespace @ user_ns .
*/
bool sk_ns_capable ( const struct sock * sk ,
struct user_namespace * user_ns , int cap )
{
return file_ns_capable ( sk - > sk_socket - > file , user_ns , cap ) & &
ns_capable ( user_ns , cap ) ;
}
EXPORT_SYMBOL ( sk_ns_capable ) ;
/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Same test as sk_ns_capable(), carried out against the initial user
 * namespace (init_user_ns).
 */
bool sk_capable(const struct sock *sk, int cap)
{
	struct user_namespace *ns = &init_user_ns;

	return sk_ns_capable(sk, ns, cap);
}
EXPORT_SYMBOL(sk_capable);
/**
* sk_net_capable - Network namespace socket capability test
* @ sk : Socket to use a capability on or through
* @ cap : The capability to use
*
* Test to see if the opener of the socket had when the socke was created
* and the current process has the capability @ cap over the network namespace
* the socket is a member of .
*/
bool sk_net_capable ( const struct sock * sk , int cap )
{
return sk_ns_capable ( sk , sock_net ( sk ) - > user_ns , cap ) ;
}
EXPORT_SYMBOL ( sk_net_capable ) ;
2012-08-01 03:43:02 +04:00
# ifdef CONFIG_MEMCG_KMEM
2012-04-10 02:36:33 +04:00
int mem_cgroup_sockets_init ( struct mem_cgroup * memcg , struct cgroup_subsys * ss )
2011-12-12 01:47:04 +04:00
{
struct proto * proto ;
int ret = 0 ;
2011-12-16 04:51:59 +04:00
mutex_lock ( & proto_list_mutex ) ;
2011-12-12 01:47:04 +04:00
list_for_each_entry ( proto , & proto_list , node ) {
if ( proto - > init_cgroup ) {
2012-04-10 02:36:33 +04:00
ret = proto - > init_cgroup ( memcg , ss ) ;
2011-12-12 01:47:04 +04:00
if ( ret )
goto out ;
}
}
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2011-12-12 01:47:04 +04:00
return ret ;
out :
list_for_each_entry_continue_reverse ( proto , & proto_list , node )
if ( proto - > destroy_cgroup )
2012-04-10 02:36:33 +04:00
proto - > destroy_cgroup ( memcg ) ;
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2011-12-12 01:47:04 +04:00
return ret ;
}
2012-04-10 02:36:33 +04:00
void mem_cgroup_sockets_destroy ( struct mem_cgroup * memcg )
2011-12-12 01:47:04 +04:00
{
struct proto * proto ;
2011-12-16 04:51:59 +04:00
mutex_lock ( & proto_list_mutex ) ;
2011-12-12 01:47:04 +04:00
list_for_each_entry_reverse ( proto , & proto_list , node )
if ( proto - > destroy_cgroup )
2012-04-10 02:36:33 +04:00
proto - > destroy_cgroup ( memcg ) ;
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2011-12-12 01:47:04 +04:00
}
# endif
2006-07-03 11:25:12 +04:00
/*
* Each address family might have different locking rules , so we have
* one slock key per address family :
*/
2006-07-03 11:25:35 +04:00
static struct lock_class_key af_family_keys [ AF_MAX ] ;
static struct lock_class_key af_family_slock_keys [ AF_MAX ] ;
2013-02-22 11:59:10 +04:00
# if defined(CONFIG_MEMCG_KMEM)
2012-02-24 11:31:31 +04:00
struct static_key memcg_socket_limit_enabled ;
2011-12-12 01:47:03 +04:00
EXPORT_SYMBOL ( memcg_socket_limit_enabled ) ;
2013-02-22 11:59:10 +04:00
# endif
2011-12-12 01:47:03 +04:00
2006-07-03 11:25:35 +04:00
/*
* Make lock validator output more readable . ( we pre - construct these
* strings build - time , so that runtime initialization of socket
* locks is fast ) :
*/
2009-08-05 21:42:58 +04:00
/*
 * Names for the per-address-family sk_lock classes, listed in address
 * family order from AF_UNSPEC up to the AF_MAX sentinel.  Every named
 * entry carries the AF_ prefix, matching the slock and clock name tables.
 */
static const char *const af_family_key_strings[AF_MAX + 1] = {
	" sk_lock-AF_UNSPEC ", " sk_lock-AF_UNIX ", " sk_lock-AF_INET ",
	" sk_lock-AF_AX25 ", " sk_lock-AF_IPX ", " sk_lock-AF_APPLETALK ",
	" sk_lock-AF_NETROM ", " sk_lock-AF_BRIDGE ", " sk_lock-AF_ATMPVC ",
	" sk_lock-AF_X25 ", " sk_lock-AF_INET6 ", " sk_lock-AF_ROSE ",
	" sk_lock-AF_DECnet ", " sk_lock-AF_NETBEUI ", " sk_lock-AF_SECURITY ",
	" sk_lock-AF_KEY ", " sk_lock-AF_NETLINK ", " sk_lock-AF_PACKET ",
	" sk_lock-AF_ASH ", " sk_lock-AF_ECONET ", " sk_lock-AF_ATMSVC ",
	" sk_lock-AF_RDS ", " sk_lock-AF_SNA ", " sk_lock-AF_IRDA ",
	" sk_lock-AF_PPPOX ", " sk_lock-AF_WANPIPE ", " sk_lock-AF_LLC ",
	" sk_lock-27 ", " sk_lock-28 ", " sk_lock-AF_CAN ",
	/* Was "sk_lock-IUCV": the only named entry without the AF_ prefix. */
	" sk_lock-AF_TIPC ", " sk_lock-AF_BLUETOOTH ", " sk_lock-AF_IUCV ",
	" sk_lock-AF_RXRPC ", " sk_lock-AF_ISDN ", " sk_lock-AF_PHONET ",
	" sk_lock-AF_IEEE802154 ", " sk_lock-AF_CAIF ", " sk_lock-AF_ALG ",
	" sk_lock-AF_NFC ", " sk_lock-AF_VSOCK ", " sk_lock-AF_MAX "
};
2009-08-05 21:42:58 +04:00
/*
 * "slock-" names, one per address family, listed in address family order
 * from AF_UNSPEC up to the AF_MAX sentinel (AF_MAX + 1 slots in total).
 * Families 27 and 28 have no symbolic name here and are labelled by number.
 * NOTE(review): presumably indexed by the socket's family number when the
 * per-family lock classes are set up -- the user of this table is not
 * visible in this part of the file.
 */
static const char * const af_family_slock_key_strings [ AF_MAX + 1 ] = {
2006-07-03 11:25:35 +04:00
" slock-AF_UNSPEC " , " slock-AF_UNIX " , " slock-AF_INET " ,
" slock-AF_AX25 " , " slock-AF_IPX " , " slock-AF_APPLETALK " ,
" slock-AF_NETROM " , " slock-AF_BRIDGE " , " slock-AF_ATMPVC " ,
" slock-AF_X25 " , " slock-AF_INET6 " , " slock-AF_ROSE " ,
" slock-AF_DECnet " , " slock-AF_NETBEUI " , " slock-AF_SECURITY " ,
" slock-AF_KEY " , " slock-AF_NETLINK " , " slock-AF_PACKET " ,
" slock-AF_ASH " , " slock-AF_ECONET " , " slock-AF_ATMSVC " ,
2009-02-27 10:43:19 +03:00
" slock-AF_RDS " , " slock-AF_SNA " , " slock-AF_IRDA " ,
2006-07-03 11:25:35 +04:00
" slock-AF_PPPOX " , " slock-AF_WANPIPE " , " slock-AF_LLC " ,
2007-12-17 02:59:24 +03:00
" slock-27 " , " slock-28 " , " slock-AF_CAN " ,
2007-04-27 02:48:28 +04:00
" slock-AF_TIPC " , " slock-AF_BLUETOOTH " , " slock-AF_IUCV " ,
2008-09-23 06:51:15 +04:00
" slock-AF_RXRPC " , " slock-AF_ISDN " , " slock-AF_PHONET " ,
2010-12-08 09:35:34 +03:00
" slock-AF_IEEE802154 " , " slock-AF_CAIF " , " slock-AF_ALG " ,
2013-05-28 09:02:44 +04:00
" slock-AF_NFC " , " slock-AF_VSOCK " , " slock-AF_MAX "
2006-07-03 11:25:35 +04:00
} ;
2009-08-05 21:42:58 +04:00
/*
 * "clock-" names, one per address family, listed in address family order
 * from AF_UNSPEC up to the AF_MAX sentinel (AF_MAX + 1 slots in total).
 * Families 27 and 28 have no symbolic name here and are labelled by number.
 * NOTE(review): presumably the names for the per-family sk_callback_lock
 * classes -- the user of this table is not visible in this part of the file.
 */
static const char * const af_family_clock_key_strings [ AF_MAX + 1 ] = {
2007-07-19 12:49:00 +04:00
" clock-AF_UNSPEC " , " clock-AF_UNIX " , " clock-AF_INET " ,
" clock-AF_AX25 " , " clock-AF_IPX " , " clock-AF_APPLETALK " ,
" clock-AF_NETROM " , " clock-AF_BRIDGE " , " clock-AF_ATMPVC " ,
" clock-AF_X25 " , " clock-AF_INET6 " , " clock-AF_ROSE " ,
" clock-AF_DECnet " , " clock-AF_NETBEUI " , " clock-AF_SECURITY " ,
" clock-AF_KEY " , " clock-AF_NETLINK " , " clock-AF_PACKET " ,
" clock-AF_ASH " , " clock-AF_ECONET " , " clock-AF_ATMSVC " ,
2009-02-27 10:43:19 +03:00
" clock-AF_RDS " , " clock-AF_SNA " , " clock-AF_IRDA " ,
2007-07-19 12:49:00 +04:00
" clock-AF_PPPOX " , " clock-AF_WANPIPE " , " clock-AF_LLC " ,
2008-07-24 01:06:04 +04:00
" clock-27 " , " clock-28 " , " clock-AF_CAN " ,
2007-07-22 06:30:16 +04:00
" clock-AF_TIPC " , " clock-AF_BLUETOOTH " , " clock-AF_IUCV " ,
2008-09-23 06:51:15 +04:00
" clock-AF_RXRPC " , " clock-AF_ISDN " , " clock-AF_PHONET " ,
2010-12-08 09:35:34 +03:00
" clock-AF_IEEE802154 " , " clock-AF_CAIF " , " clock-AF_ALG " ,
2013-05-28 09:02:44 +04:00
" clock-AF_NFC " , " clock-AF_VSOCK " , " clock-AF_MAX "
2007-07-19 12:49:00 +04:00
} ;
2006-07-03 11:25:12 +04:00
/*
* sk_callback_lock locking rules are per - address - family ,
* so split the lock classes by using a per - AF key :
*/
static struct lock_class_key af_callback_keys [ AF_MAX ] ;
2005-04-17 02:20:36 +04:00
/* Take into consideration the size of the struct sk_buff overhead in the
* determination of these values , since that is non - constant across
* platforms . This makes socket queueing behavior and performance
* not depend upon such differences .
*/
# define _SK_MEM_PACKETS 256
2011-10-13 11:28:54 +04:00
# define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
2005-04-17 02:20:36 +04:00
# define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
# define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
/* Run time adjustable parameters. */
2006-09-23 01:15:41 +04:00
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX ;
2012-04-30 10:13:50 +04:00
EXPORT_SYMBOL ( sysctl_wmem_max ) ;
2006-09-23 01:15:41 +04:00
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX ;
2012-04-30 10:13:50 +04:00
EXPORT_SYMBOL ( sysctl_rmem_max ) ;
2006-09-23 01:15:41 +04:00
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX ;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX ;
2005-04-17 02:20:36 +04:00
2011-03-31 05:57:33 +04:00
/* Maximal space eaten by iovec or ancillary data plus some space */
2006-09-23 01:15:41 +04:00
int sysctl_optmem_max __read_mostly = sizeof ( unsigned long ) * ( 2 * UIO_MAXIOV + 512 ) ;
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sysctl_optmem_max ) ;
2005-04-17 02:20:36 +04:00
2012-08-01 03:44:19 +04:00
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE ;
EXPORT_SYMBOL_GPL ( memalloc_socks ) ;
2012-08-01 03:44:16 +04:00
/**
* sk_set_memalloc - sets % SOCK_MEMALLOC
* @ sk : socket to set it on
*
* Set % SOCK_MEMALLOC on a socket for access to emergency reserves .
* It ' s the responsibility of the admin to adjust min_free_kbytes
* to meet the requirements
*/
void sk_set_memalloc ( struct sock * sk )
{
sock_set_flag ( sk , SOCK_MEMALLOC ) ;
sk - > sk_allocation | = __GFP_MEMALLOC ;
2012-08-01 03:44:19 +04:00
static_key_slow_inc ( & memalloc_socks ) ;
2012-08-01 03:44:16 +04:00
}
EXPORT_SYMBOL_GPL ( sk_set_memalloc ) ;
void sk_clear_memalloc ( struct sock * sk )
{
sock_reset_flag ( sk , SOCK_MEMALLOC ) ;
sk - > sk_allocation & = ~ __GFP_MEMALLOC ;
2012-08-01 03:44:19 +04:00
static_key_slow_dec ( & memalloc_socks ) ;
netvm: prevent a stream-specific deadlock
This patch series is based on top of "Swap-over-NBD without deadlocking
v15" as it depends on the same reservation of PF_MEMALLOC reserves logic.
When a user or administrator requires swap for their application, they
create a swap partition and file, format it with mkswap and activate it
with swapon. In diskless systems this is not an option so if swap if
required then swapping over the network is considered. The two likely
scenarios are when blade servers are used as part of a cluster where the
form factor or maintenance costs do not allow the use of disks and thin
clients.
The Linux Terminal Server Project recommends the use of the Network Block
Device (NBD) for swap but this is not always an option. There is no
guarantee that the network attached storage (NAS) device is running Linux
or supports NBD. However, it is likely that it supports NFS so there are
users that want support for swapping over NFS despite any performance
concern. Some distributions currently carry patches that support swapping
over NFS but it would be preferable to support it in the mainline kernel.
Patch 1 avoids a stream-specific deadlock that potentially affects TCP.
Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
reserves.
Patch 3 adds three helpers for filesystems to handle swap cache pages.
For example, page_file_mapping() returns page->mapping for
file-backed pages and the address_space of the underlying
swap file for swap cache pages.
Patch 4 adds two address_space_operations to allow a filesystem
to pin all metadata relevant to a swapfile in memory. Upon
successful activation, the swapfile is marked SWP_FILE and
the address space operation ->direct_IO is used for writing
and ->readpage for reading in swap pages.
Patch 5 notes that patch 3 is bolting
filesystem-specific-swapfile-support onto the side and that
the default handlers have different information to what
is available to the filesystem. This patch refactors the
code so that there are generic handlers for each of the new
address_space operations.
Patch 6 adds an API to allow a vector of kernel addresses to be
translated to struct pages and pinned for IO.
Patch 7 adds support for using highmem pages for swap by kmapping
the pages before calling the direct_IO handler.
Patch 8 updates NFS to use the helpers from patch 3 where necessary.
Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.
Patch 10 implements the new swapfile-related address_space operations
for NFS and teaches the direct IO handler how to manage
kernel addresses.
Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
where appropriate.
Patch 12 fixes a NULL pointer dereference that occurs when using
swap-over-NFS.
With the patches applied, it is possible to mount a swapfile that is on an
NFS filesystem. Swap performance is not great with a swap stress test
taking roughly twice as long to complete than if the swap device was
backed by NBD.
This patch: netvm: prevent a stream-specific deadlock
It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit. This will prevent SOCK_MEMALLOC
buffers from receiving data, which will prevent userspace from running,
which is needed to reduce the buffered data.
Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Once
this change it applied, it is important that sockets that set
SOCK_MEMALLOC do not clear the flag until the socket is being torn down.
If this happens, a warning is generated and the tokens reclaimed to avoid
accounting errors until the bug is fixed.
[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 03:44:41 +04:00
/*
* SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
* progress of swapping . However , if SOCK_MEMALLOC is cleared while
* it has rmem allocations there is a risk that the user of the
* socket cannot make forward progress due to exceeding the rmem
* limits . By rights , sk_clear_memalloc ( ) should only be called
* on sockets being torn down but warn and reset the accounting if
* that assumption breaks .
*/
if ( WARN_ON ( sk - > sk_forward_alloc ) )
sk_mem_reclaim ( sk ) ;
2012-08-01 03:44:16 +04:00
}
EXPORT_SYMBOL_GPL ( sk_clear_memalloc ) ;
2012-08-01 03:44:26 +04:00
int __sk_backlog_rcv ( struct sock * sk , struct sk_buff * skb )
{
int ret ;
unsigned long pflags = current - > flags ;
/* these should have been dropped before queueing */
BUG_ON ( ! sock_flag ( sk , SOCK_MEMALLOC ) ) ;
current - > flags | = PF_MEMALLOC ;
ret = sk - > sk_backlog_rcv ( sk , skb ) ;
tsk_restore_flags ( current , pflags , PF_MEMALLOC ) ;
return ret ;
}
EXPORT_SYMBOL ( __sk_backlog_rcv ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_set_timeout - turn a user-supplied struct timeval into a timeout
 * @timeo_p: where the resulting timeout is stored
 * @optval: user pointer to a struct timeval
 * @optlen: number of bytes available at @optval
 *
 * Returns 0 on success, -EINVAL when @optlen is smaller than a struct
 * timeval, -EFAULT when the copy from user space fails and -EDOM when
 * tv_usec lies outside [0, USEC_PER_SEC).
 *
 * Stored value:
 *  - 0 for a negative tv_sec (a rate-limited notice is also logged until
 *    a static counter reaches ten);
 *  - MAX_SCHEDULE_TIMEOUT for an all-zero timeval, or when tv_sec is not
 *    below MAX_SCHEDULE_TIMEOUT / HZ - 1;
 *  - otherwise tv_sec * HZ plus tv_usec converted to ticks, rounded up.
 */
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (!(tv.tv_usec >= 0 && tv.tv_usec < USEC_PER_SEC))
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info(" %s: `%s' (pid %d) tries to set negative timeout \n ",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}

	/* Default to "wait forever"; only a finite, representable value overrides it. */
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if ((tv.tv_sec != 0 || tv.tv_usec != 0) &&
	    tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
		const long usecs_per_tick = 1000000 / HZ;

		/* Microseconds are rounded up to the next whole tick. */
		*timeo_p = tv.tv_sec * HZ +
			   (tv.tv_usec + (usecs_per_tick - 1)) / usecs_per_tick;
	}

	return 0;
}
static void sock_warn_obsolete_bsdism ( const char * name )
{
static int warned ;
static char warncomm [ TASK_COMM_LEN ] ;
2007-02-09 17:24:36 +03:00
if ( strcmp ( warncomm , current - > comm ) & & warned < 5 ) {
strcpy ( warncomm , current - > comm ) ;
2012-05-16 23:58:40 +04:00
pr_warn ( " process `%s' is using obsolete %s SO_BSDCOMPAT \n " ,
warncomm , name ) ;
2005-04-17 02:20:36 +04:00
warned + + ;
}
}
2011-11-28 16:04:18 +04:00
# define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
static void sock_disable_timestamp ( struct sock * sk , unsigned long flags )
2007-02-09 17:24:36 +03:00
{
2011-11-28 16:04:18 +04:00
if ( sk - > sk_flags & flags ) {
sk - > sk_flags & = ~ flags ;
if ( ! ( sk - > sk_flags & SK_FLAGS_TIMESTAMP ) )
2009-02-12 08:03:38 +03:00
net_disable_timestamp ( ) ;
2005-04-17 02:20:36 +04:00
}
}
2006-03-28 13:08:21 +04:00
int sock_queue_rcv_skb ( struct sock * sk , struct sk_buff * skb )
{
2009-10-15 07:40:11 +04:00
int err ;
2006-03-28 13:08:21 +04:00
int skb_len ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch. It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
unsigned long flags ;
struct sk_buff_head * list = & sk - > sk_receive_queue ;
2006-03-28 13:08:21 +04:00
2011-12-21 11:11:44 +04:00
if ( atomic_read ( & sk - > sk_rmem_alloc ) > = sk - > sk_rcvbuf ) {
2009-10-15 07:40:11 +04:00
atomic_inc ( & sk - > sk_drops ) ;
2011-06-17 16:00:03 +04:00
trace_sock_rcvqueue_full ( sk , skb ) ;
2009-10-15 07:40:11 +04:00
return - ENOMEM ;
2006-03-28 13:08:21 +04:00
}
2006-09-01 02:28:39 +04:00
err = sk_filter ( sk , skb ) ;
2006-03-28 13:08:21 +04:00
if ( err )
2009-10-15 07:40:11 +04:00
return err ;
2006-03-28 13:08:21 +04:00
netvm: prevent a stream-specific deadlock
This patch series is based on top of "Swap-over-NBD without deadlocking
v15" as it depends on the same reservation of PF_MEMALLOC reserves logic.
When a user or administrator requires swap for their application, they
create a swap partition and file, format it with mkswap and activate it
with swapon. In diskless systems this is not an option so if swap is
required then swapping over the network is considered. The two likely
scenarios are when blade servers are used as part of a cluster where the
form factor or maintenance costs do not allow the use of disks and thin
clients.
The Linux Terminal Server Project recommends the use of the Network Block
Device (NBD) for swap but this is not always an option. There is no
guarantee that the network attached storage (NAS) device is running Linux
or supports NBD. However, it is likely that it supports NFS so there are
users that want support for swapping over NFS despite any performance
concern. Some distributions currently carry patches that support swapping
over NFS but it would be preferable to support it in the mainline kernel.
Patch 1 avoids a stream-specific deadlock that potentially affects TCP.
Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
reserves.
Patch 3 adds three helpers for filesystems to handle swap cache pages.
For example, page_file_mapping() returns page->mapping for
file-backed pages and the address_space of the underlying
swap file for swap cache pages.
Patch 4 adds two address_space_operations to allow a filesystem
to pin all metadata relevant to a swapfile in memory. Upon
successful activation, the swapfile is marked SWP_FILE and
the address space operation ->direct_IO is used for writing
and ->readpage for reading in swap pages.
Patch 5 notes that patch 3 is bolting
filesystem-specific-swapfile-support onto the side and that
the default handlers have different information to what
is available to the filesystem. This patch refactors the
code so that there are generic handlers for each of the new
address_space operations.
Patch 6 adds an API to allow a vector of kernel addresses to be
translated to struct pages and pinned for IO.
Patch 7 adds support for using highmem pages for swap by kmapping
the pages before calling the direct_IO handler.
Patch 8 updates NFS to use the helpers from patch 3 where necessary.
Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.
Patch 10 implements the new swapfile-related address_space operations
for NFS and teaches the direct IO handler how to manage
kernel addresses.
Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
where appropriate.
Patch 12 fixes a NULL pointer dereference that occurs when using
swap-over-NFS.
With the patches applied, it is possible to mount a swapfile that is on an
NFS filesystem. Swap performance is not great with a swap stress test
taking roughly twice as long to complete than if the swap device was
backed by NBD.
This patch: netvm: prevent a stream-specific deadlock
It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit. This will prevent SOCK_MEMALLOC
buffers from receiving data, which will prevent userspace from running,
which is needed to reduce the buffered data.
Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Once
this change is applied, it is important that sockets that set
SOCK_MEMALLOC do not clear the flag until the socket is being torn down.
If this happens, a warning is generated and the tokens reclaimed to avoid
accounting errors until the bug is fixed.
[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 03:44:41 +04:00
if ( ! sk_rmem_schedule ( sk , skb , skb - > truesize ) ) {
2009-10-15 07:40:11 +04:00
atomic_inc ( & sk - > sk_drops ) ;
return - ENOBUFS ;
2007-12-31 11:11:19 +03:00
}
2006-03-28 13:08:21 +04:00
skb - > dev = NULL ;
skb_set_owner_r ( skb , sk ) ;
2008-12-18 09:11:38 +03:00
2006-03-28 13:08:21 +04:00
/* Cache the SKB length before we tack it onto the receive
* queue . Once it is added it no longer belongs to us and
* may be freed by other threads of control pulling packets
* from the queue .
*/
skb_len = skb - > len ;
2010-05-12 03:19:48 +04:00
/* we escape from rcu protected region, make sure we dont leak
* a norefcounted dst
*/
skb_dst_force ( skb ) ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch. It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
spin_lock_irqsave ( & list - > lock , flags ) ;
skb - > dropcount = atomic_read ( & sk - > sk_drops ) ;
__skb_queue_tail ( list , skb ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
2006-03-28 13:08:21 +04:00
if ( ! sock_flag ( sk , SOCK_DEAD ) )
2014-04-12 00:15:36 +04:00
sk - > sk_data_ready ( sk ) ;
2009-10-15 07:40:11 +04:00
return 0 ;
2006-03-28 13:08:21 +04:00
}
EXPORT_SYMBOL ( sock_queue_rcv_skb ) ;
2006-11-16 19:06:06 +03:00
/*
 * sk_receive_skb - filter an skb and deliver it to a socket .
 *
 * The skb is first run through sk_filter ( ) and checked against the
 * socket's receive limit ( sk_rcvqueues_full ( ) with sk_rcvbuf ) . It is
 * then either processed immediately through sk_backlog_rcv ( ) , when no
 * user context owns the socket , or appended to the backlog with
 * sk_add_backlog ( ) .
 *
 * @nested selects bh_lock_sock_nested ( ) instead of bh_lock_sock ( ) .
 *
 * One reference on @sk is always dropped with sock_put ( ) before
 * returning . When the filter rejects the skb or either limit check
 * fails , the skb is freed with kfree_skb ( ) ; the two limit failures
 * also increment sk_drops .
 *
 * Returns the value of sk_backlog_rcv ( ) when that handler ran ,
 * NET_RX_SUCCESS otherwise ( including on every discard path ) .
 */
int sk_receive_skb ( struct sock * sk , struct sk_buff * skb , const int nested )
2006-03-28 13:08:21 +04:00
{
int rc = NET_RX_SUCCESS ;
2006-09-01 02:28:39 +04:00
/* A non-zero sk_filter ( ) result means the skb must be dropped . */
if ( sk_filter ( sk , skb ) )
2006-03-28 13:08:21 +04:00
goto discard_and_relse ;
skb - > dev = NULL ;
2014-07-22 22:16:51 +04:00
/* Checked before taking the socket lock ; counted as a drop . */
if ( sk_rcvqueues_full ( sk , sk - > sk_rcvbuf ) ) {
2010-04-28 02:13:20 +04:00
atomic_inc ( & sk - > sk_drops ) ;
goto discard_and_relse ;
}
2006-11-16 19:06:06 +03:00
if ( nested )
bh_lock_sock_nested ( sk ) ;
else
bh_lock_sock ( sk ) ;
2006-07-03 11:25:35 +04:00
if ( ! sock_owned_by_user ( sk ) ) {
/*
* trylock + unlock semantics :
*/
/* The acquire / release pair brackets the handler call on sk_lock's dep_map . */
mutex_acquire ( & sk - > sk_lock . dep_map , 0 , 1 , _RET_IP_ ) ;
2008-10-08 01:18:42 +04:00
rc = sk_backlog_rcv ( sk , skb ) ;
2006-07-03 11:25:35 +04:00
mutex_release ( & sk - > sk_lock . dep_map , 1 , _RET_IP_ ) ;
2012-04-23 03:34:26 +04:00
} else if ( sk_add_backlog ( sk , skb , sk - > sk_rcvbuf ) ) {
2010-03-04 21:01:40 +03:00
/* Backlog refused the skb : unlock first , then account and free . */
bh_unlock_sock ( sk ) ;
atomic_inc ( & sk - > sk_drops ) ;
goto discard_and_relse ;
}
2006-03-28 13:08:21 +04:00
bh_unlock_sock ( sk ) ;
out :
/* Every exit path funnels through here to drop the socket reference . */
sock_put ( sk ) ;
return rc ;
discard_and_relse :
kfree_skb ( skb ) ;
goto out ;
}
EXPORT_SYMBOL ( sk_receive_skb ) ;
/*
 * __sk_dst_check - return the socket's cached dst entry if still usable.
 *
 * Fetches the cached entry with __sk_dst_get().  An entry that is flagged
 * obsolete and for which dst->ops->check() returns NULL is detached from
 * the socket: the tx queue mapping is cleared, sk_dst_cache is reset to
 * NULL and the entry is released.  In that case NULL is returned;
 * otherwise the cached pointer (possibly NULL) is returned unchanged.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *cached = __sk_dst_get(sk);

	/* Nothing cached, or not marked obsolete: hand it back as is. */
	if (!cached || !cached->obsolete)
		return cached;

	/* Obsolete, but the owner's check() still accepts it. */
	if (cached->ops->check(cached, cookie) != NULL)
		return cached;

	sk_tx_queue_clear(sk);
	RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
	dst_release(cached);
	return NULL;
}
EXPORT_SYMBOL ( __sk_dst_check ) ;
/*
 * sk_dst_check - variant of __sk_dst_check() built on sk_dst_get() and
 * sk_dst_reset().
 *
 * Returns the entry obtained from sk_dst_get() unless it is flagged
 * obsolete and dst->ops->check() returns NULL for it; in that case the
 * socket's cached dst is reset, the entry is released and NULL is
 * returned.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *entry = sk_dst_get(sk);

	/* No entry, or not marked obsolete: return it untouched. */
	if (!entry || !entry->obsolete)
		return entry;

	/* Obsolete, but check() still vouches for it. */
	if (entry->ops->check(entry, cookie) != NULL)
		return entry;

	sk_dst_reset(sk);
	dst_release(entry);
	return NULL;
}
EXPORT_SYMBOL ( sk_dst_check ) ;
2012-11-26 09:21:08 +04:00
/*
 * sock_setbindtodevice - bind a socket to a network device given by name.
 *
 * @optval points to a user buffer holding the interface name (e.g. "eth0");
 * at most IFNAMSIZ - 1 bytes of it are used.  An empty name or a zero
 * @optlen removes the binding (sk_bound_dev_if becomes 0).
 *
 * Returns 0 on success, or:
 *   -ENOPROTOOPT  built without CONFIG_NETDEVICES
 *   -EPERM        caller lacks CAP_NET_RAW in the netns' user namespace
 *   -EINVAL       negative @optlen
 *   -EFAULT       the name could not be copied from user space
 *   -ENODEV       no device with that name in the socket's netns
 */
static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index = 0;

	/* Sorry... */
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;

	if (optlen < 0)
		return -EINVAL;

	/* Truncate over-long names; the zeroed buffer keeps the copy
	 * NUL-terminated.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	if (copy_from_user(devname, optval, optlen))
		return -EFAULT;

	if (devname[0] != '\0') {
		struct net_device *dev;

		/* Only the ifindex is read under RCU; no reference is kept. */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();

		if (!dev)
			return -ENODEV;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	return 0;
#else
	return -ENOPROTOOPT;
#endif
}
2012-11-26 09:21:08 +04:00
/*
 * sock_getbindtodevice - report the name of the device a socket is bound to.
 *
 * When the socket is unbound (sk_bound_dev_if == 0) nothing is written to
 * @optval and a length of 0 is stored through @optlen.  Otherwise the
 * caller's buffer length @len must be at least IFNAMSIZ; the name,
 * including its terminating NUL, is copied to @optval and its size is
 * stored through @optlen.
 *
 * Returns 0 on success, or:
 *   -ENOPROTOOPT  built without CONFIG_NETDEVICES
 *   -EINVAL       @len is smaller than IFNAMSIZ
 *   -EFAULT       a copy to user space failed
 *   any error returned by netdev_get_name()
 */
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int err;

	if (sk->sk_bound_dev_if != 0) {
		if (len < IFNAMSIZ)
			return -EINVAL;

		err = netdev_get_name(net, devname, sk->sk_bound_dev_if);
		if (err)
			return err;

		/* Report the name plus its terminating NUL. */
		len = strlen(devname) + 1;
		if (copy_to_user(optval, devname, len))
			return -EFAULT;
	} else {
		len = 0;
	}

	if (put_user(len, optlen))
		return -EFAULT;

	return 0;
#else
	return -ENOPROTOOPT;
#endif
}
2007-11-15 14:03:19 +03:00
/*
 * sock_valbool_flag - set socket flag @bit when @valbool is non-zero,
 * clear it otherwise.
 */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}
	sock_set_flag(sk, bit);
}
2005-04-17 02:20:36 +04:00
/*
* This is meant for all protocols to use and covers goings on
* at the socket level . Everything here is generic .
*/
int sock_setsockopt ( struct socket * sock , int level , int optname ,
2009-10-01 03:12:20 +04:00
char __user * optval , unsigned int optlen )
2005-04-17 02:20:36 +04:00
{
2009-05-27 15:30:05 +04:00
struct sock * sk = sock - > sk ;
2005-04-17 02:20:36 +04:00
int val ;
int valbool ;
struct linger ling ;
int ret = 0 ;
2007-02-09 17:24:36 +03:00
2005-04-17 02:20:36 +04:00
/*
* Options without arguments
*/
2007-09-15 03:41:03 +04:00
if ( optname = = SO_BINDTODEVICE )
2012-11-26 09:21:08 +04:00
return sock_setbindtodevice ( sk , optval , optlen ) ;
2007-09-15 03:41:03 +04:00
2007-04-11 07:10:33 +04:00
if ( optlen < sizeof ( int ) )
return - EINVAL ;
2007-02-09 17:24:36 +03:00
2005-04-17 02:20:36 +04:00
if ( get_user ( val , ( int __user * ) optval ) )
return - EFAULT ;
2007-02-09 17:24:36 +03:00
2009-05-27 15:30:05 +04:00
valbool = val ? 1 : 0 ;
2005-04-17 02:20:36 +04:00
lock_sock ( sk ) ;
2009-05-27 15:30:05 +04:00
switch ( optname ) {
2007-04-11 07:10:33 +04:00
case SO_DEBUG :
2009-05-27 15:30:05 +04:00
if ( val & & ! capable ( CAP_NET_ADMIN ) )
2007-04-11 07:10:33 +04:00
ret = - EACCES ;
2009-05-27 15:30:05 +04:00
else
2007-11-15 14:03:19 +03:00
sock_valbool_flag ( sk , SOCK_DBG , valbool ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_REUSEADDR :
2012-04-19 07:39:36 +04:00
sk - > sk_reuse = ( valbool ? SK_CAN_REUSE : SK_NO_REUSE ) ;
2007-04-11 07:10:33 +04:00
break ;
2013-01-22 13:49:50 +04:00
case SO_REUSEPORT :
sk - > sk_reuseport = valbool ;
break ;
2007-04-11 07:10:33 +04:00
case SO_TYPE :
2009-08-04 11:28:28 +04:00
case SO_PROTOCOL :
2009-08-04 11:28:29 +04:00
case SO_DOMAIN :
2007-04-11 07:10:33 +04:00
case SO_ERROR :
ret = - ENOPROTOOPT ;
break ;
case SO_DONTROUTE :
2007-11-15 14:03:19 +03:00
sock_valbool_flag ( sk , SOCK_LOCALROUTE , valbool ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_BROADCAST :
sock_valbool_flag ( sk , SOCK_BROADCAST , valbool ) ;
break ;
case SO_SNDBUF :
/* Don't error on this BSD doesn't and if you think
2012-04-27 00:07:59 +04:00
* about it this is right . Otherwise apps have to
* play ' guess the biggest size ' games . RCVBUF / SNDBUF
* are treated in BSD as hints
*/
val = min_t ( u32 , val , sysctl_wmem_max ) ;
2005-08-10 06:30:51 +04:00
set_sndbuf :
2007-04-11 07:10:33 +04:00
sk - > sk_userlocks | = SOCK_SNDBUF_LOCK ;
2012-04-27 00:07:59 +04:00
sk - > sk_sndbuf = max_t ( u32 , val * 2 , SOCK_MIN_SNDBUF ) ;
/* Wake up sending tasks if we upped the value. */
2007-04-11 07:10:33 +04:00
sk - > sk_write_space ( sk ) ;
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_SNDBUFFORCE :
if ( ! capable ( CAP_NET_ADMIN ) ) {
ret = - EPERM ;
break ;
}
goto set_sndbuf ;
2005-08-10 06:30:51 +04:00
2007-04-11 07:10:33 +04:00
case SO_RCVBUF :
/* Don't error on this BSD doesn't and if you think
2012-04-27 00:07:59 +04:00
* about it this is right . Otherwise apps have to
* play ' guess the biggest size ' games . RCVBUF / SNDBUF
* are treated in BSD as hints
*/
val = min_t ( u32 , val , sysctl_rmem_max ) ;
2005-08-10 06:30:51 +04:00
set_rcvbuf :
2007-04-11 07:10:33 +04:00
sk - > sk_userlocks | = SOCK_RCVBUF_LOCK ;
/*
* We double it on the way in to account for
* " struct sk_buff " etc . overhead . Applications
* assume that the SO_RCVBUF setting they make will
* allow that much actual data to be received on that
* socket .
*
* Applications are unaware that " struct sk_buff " and
* other overheads allocate from the receive buffer
* during socket buffer allocation .
*
* And after considering the possible alternatives ,
* returning the value we actually used in getsockopt
* is the most desirable behavior .
*/
2012-04-27 00:07:59 +04:00
sk - > sk_rcvbuf = max_t ( u32 , val * 2 , SOCK_MIN_RCVBUF ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_RCVBUFFORCE :
if ( ! capable ( CAP_NET_ADMIN ) ) {
ret = - EPERM ;
2005-04-17 02:20:36 +04:00
break ;
2007-04-11 07:10:33 +04:00
}
goto set_rcvbuf ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_KEEPALIVE :
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_INET
2012-09-24 11:00:11 +04:00
if ( sk - > sk_protocol = = IPPROTO_TCP & &
sk - > sk_type = = SOCK_STREAM )
2007-04-11 07:10:33 +04:00
tcp_set_keepalive ( sk , valbool ) ;
2005-04-17 02:20:36 +04:00
# endif
2007-04-11 07:10:33 +04:00
sock_valbool_flag ( sk , SOCK_KEEPOPEN , valbool ) ;
break ;
case SO_OOBINLINE :
sock_valbool_flag ( sk , SOCK_URGINLINE , valbool ) ;
break ;
case SO_NO_CHECK :
2014-05-23 19:47:19 +04:00
sk - > sk_no_check_tx = valbool ;
2007-04-11 07:10:33 +04:00
break ;
case SO_PRIORITY :
2012-11-16 07:03:04 +04:00
if ( ( val > = 0 & & val < = 6 ) | |
ns_capable ( sock_net ( sk ) - > user_ns , CAP_NET_ADMIN ) )
2007-04-11 07:10:33 +04:00
sk - > sk_priority = val ;
else
ret = - EPERM ;
break ;
case SO_LINGER :
if ( optlen < sizeof ( ling ) ) {
ret = - EINVAL ; /* 1003.1g */
2005-04-17 02:20:36 +04:00
break ;
2007-04-11 07:10:33 +04:00
}
2009-05-27 15:30:05 +04:00
if ( copy_from_user ( & ling , optval , sizeof ( ling ) ) ) {
2007-04-11 07:10:33 +04:00
ret = - EFAULT ;
2005-04-17 02:20:36 +04:00
break ;
2007-04-11 07:10:33 +04:00
}
if ( ! ling . l_onoff )
sock_reset_flag ( sk , SOCK_LINGER ) ;
else {
2005-04-17 02:20:36 +04:00
# if (BITS_PER_LONG == 32)
2007-04-11 07:10:33 +04:00
if ( ( unsigned int ) ling . l_linger > = MAX_SCHEDULE_TIMEOUT / HZ )
sk - > sk_lingertime = MAX_SCHEDULE_TIMEOUT ;
2005-04-17 02:20:36 +04:00
else
2007-04-11 07:10:33 +04:00
# endif
sk - > sk_lingertime = ( unsigned int ) ling . l_linger * HZ ;
sock_set_flag ( sk , SOCK_LINGER ) ;
}
break ;
case SO_BSDCOMPAT :
sock_warn_obsolete_bsdism ( " setsockopt " ) ;
break ;
case SO_PASSCRED :
if ( valbool )
set_bit ( SOCK_PASSCRED , & sock - > flags ) ;
else
clear_bit ( SOCK_PASSCRED , & sock - > flags ) ;
break ;
case SO_TIMESTAMP :
2007-03-26 09:14:49 +04:00
case SO_TIMESTAMPNS :
2007-04-11 07:10:33 +04:00
if ( valbool ) {
2007-03-26 09:14:49 +04:00
if ( optname = = SO_TIMESTAMP )
sock_reset_flag ( sk , SOCK_RCVTSTAMPNS ) ;
else
sock_set_flag ( sk , SOCK_RCVTSTAMPNS ) ;
2007-04-11 07:10:33 +04:00
sock_set_flag ( sk , SOCK_RCVTSTAMP ) ;
2009-02-12 08:03:38 +03:00
sock_enable_timestamp ( sk , SOCK_TIMESTAMP ) ;
2007-03-26 09:14:49 +04:00
} else {
2007-04-11 07:10:33 +04:00
sock_reset_flag ( sk , SOCK_RCVTSTAMP ) ;
2007-03-26 09:14:49 +04:00
sock_reset_flag ( sk , SOCK_RCVTSTAMPNS ) ;
}
2007-04-11 07:10:33 +04:00
break ;
2009-02-12 08:03:38 +03:00
case SO_TIMESTAMPING :
if ( val & ~ SOF_TIMESTAMPING_MASK ) {
2009-07-20 04:47:04 +04:00
ret = - EINVAL ;
2009-02-12 08:03:38 +03:00
break ;
}
2014-08-05 06:11:47 +04:00
if ( val & SOF_TIMESTAMPING_OPT_ID & &
! ( sk - > sk_tsflags & SOF_TIMESTAMPING_OPT_ID ) )
sk - > sk_tskey = 0 ;
2014-08-05 06:11:46 +04:00
sk - > sk_tsflags = val ;
2009-02-12 08:03:38 +03:00
if ( val & SOF_TIMESTAMPING_RX_SOFTWARE )
sock_enable_timestamp ( sk ,
SOCK_TIMESTAMPING_RX_SOFTWARE ) ;
else
sock_disable_timestamp ( sk ,
2011-11-28 16:04:18 +04:00
( 1UL < < SOCK_TIMESTAMPING_RX_SOFTWARE ) ) ;
2009-02-12 08:03:38 +03:00
break ;
2007-04-11 07:10:33 +04:00
case SO_RCVLOWAT :
if ( val < 0 )
val = INT_MAX ;
sk - > sk_rcvlowat = val ? : 1 ;
break ;
case SO_RCVTIMEO :
ret = sock_set_timeout ( & sk - > sk_rcvtimeo , optval , optlen ) ;
break ;
case SO_SNDTIMEO :
ret = sock_set_timeout ( & sk - > sk_sndtimeo , optval , optlen ) ;
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_ATTACH_FILTER :
ret = - EINVAL ;
if ( optlen = = sizeof ( struct sock_fprog ) ) {
struct sock_fprog fprog ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
ret = - EFAULT ;
if ( copy_from_user ( & fprog , optval , sizeof ( fprog ) ) )
2005-04-17 02:20:36 +04:00
break ;
2007-04-11 07:10:33 +04:00
ret = sk_attach_filter ( & fprog , sk ) ;
}
break ;
case SO_DETACH_FILTER :
2007-10-18 08:21:26 +04:00
ret = sk_detach_filter ( sk ) ;
2007-04-11 07:10:33 +04:00
break ;
2005-04-17 02:20:36 +04:00
2013-01-17 01:55:49 +04:00
case SO_LOCK_FILTER :
if ( sock_flag ( sk , SOCK_FILTER_LOCKED ) & & ! valbool )
ret = - EPERM ;
else
sock_valbool_flag ( sk , SOCK_FILTER_LOCKED , valbool ) ;
break ;
2007-04-11 07:10:33 +04:00
case SO_PASSSEC :
if ( valbool )
set_bit ( SOCK_PASSSEC , & sock - > flags ) ;
else
clear_bit ( SOCK_PASSSEC , & sock - > flags ) ;
break ;
2008-01-31 06:08:16 +03:00
case SO_MARK :
2012-11-16 07:03:04 +04:00
if ( ! ns_capable ( sock_net ( sk ) - > user_ns , CAP_NET_ADMIN ) )
2008-01-31 06:08:16 +03:00
ret = - EPERM ;
2009-05-27 15:30:05 +04:00
else
2008-01-31 06:08:16 +03:00
sk - > sk_mark = val ;
break ;
[AF_UNIX]: Datagram getpeersec
This patch implements an API whereby an application can determine the
label of its peer's Unix datagram sockets via the auxiliary data mechanism of
recvmsg.
Patch purpose:
This patch enables a security-aware application to retrieve the
security context of the peer of a Unix datagram socket. The application
can then use this security context to determine the security context for
processing on behalf of the peer who sent the packet.
Patch design and implementation:
The design and implementation is very similar to the UDP case for INET
sockets. Basically we build upon the existing Unix domain socket API for
retrieving user credentials. Linux offers the API for obtaining user
credentials via ancillary messages (i.e., out of band/control messages
that are bundled together with a normal message). To retrieve the security
context, the application first indicates to the kernel such desire by
setting the SO_PASSSEC option via getsockopt. Then the application
retrieves the security context using the auxiliary data mechanism.
An example server application for Unix datagram socket should look like this:
toggle = 1;
toggle_len = sizeof(toggle);
setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len);
recvmsg(sockfd, &msg_hdr, 0);
if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) {
cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr);
if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) &&
cmsg_hdr->cmsg_level == SOL_SOCKET &&
cmsg_hdr->cmsg_type == SCM_SECURITY) {
memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext));
}
}
sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow
a server socket to receive security context of the peer.
Testing:
We have tested the patch by setting up Unix datagram client and server
applications. We verified that the server can retrieve the security context
using the auxiliary data mechanism of recvmsg.
Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com>
Acked-by: Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-29 23:27:47 +04:00
2005-04-17 02:20:36 +04:00
/* We implement the SO_SNDLOWAT etc to
not be settable ( 1003.1 g 5.3 ) */
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
case SO_RXQ_OVFL :
2011-10-07 07:30:20 +04:00
sock_valbool_flag ( sk , SOCK_RXQ_OVFL , valbool ) ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
break ;
2011-11-09 13:15:42 +04:00
case SO_WIFI_STATUS :
sock_valbool_flag ( sk , SOCK_WIFI_STATUS , valbool ) ;
break ;
2012-02-21 11:31:34 +04:00
case SO_PEEK_OFF :
if ( sock - > ops - > set_peek_off )
2013-12-08 02:26:27 +04:00
ret = sock - > ops - > set_peek_off ( sk , val ) ;
2012-02-21 11:31:34 +04:00
else
ret = - EOPNOTSUPP ;
break ;
2012-02-11 19:39:30 +04:00
case SO_NOFCS :
sock_valbool_flag ( sk , SOCK_NOFCS , valbool ) ;
break ;
2013-03-28 15:19:25 +04:00
case SO_SELECT_ERR_QUEUE :
sock_valbool_flag ( sk , SOCK_SELECT_ERR_QUEUE , valbool ) ;
break ;
2013-08-01 07:10:25 +04:00
# ifdef CONFIG_NET_RX_BUSY_POLL
2013-07-10 18:13:36 +04:00
case SO_BUSY_POLL :
2013-06-14 17:33:57 +04:00
/* allow unprivileged users to decrease the value */
if ( ( val > sk - > sk_ll_usec ) & & ! capable ( CAP_NET_ADMIN ) )
ret = - EPERM ;
else {
if ( val < 0 )
ret = - EINVAL ;
else
sk - > sk_ll_usec = val ;
}
break ;
# endif
2013-09-24 19:20:52 +04:00
case SO_MAX_PACING_RATE :
sk - > sk_max_pacing_rate = val ;
sk - > sk_pacing_rate = min ( sk - > sk_pacing_rate ,
sk - > sk_max_pacing_rate ) ;
break ;
2007-04-11 07:10:33 +04:00
default :
ret = - ENOPROTOOPT ;
break ;
2007-02-09 17:24:36 +03:00
}
2005-04-17 02:20:36 +04:00
release_sock ( sk ) ;
return ret ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_setsockopt ) ;
2005-04-17 02:20:36 +04:00
2014-01-03 21:17:14 +04:00
static void cred_to_ucred ( struct pid * pid , const struct cred * cred ,
struct ucred * ucred )
2010-06-13 07:28:59 +04:00
{
ucred - > pid = pid_vnr ( pid ) ;
ucred - > uid = ucred - > gid = - 1 ;
if ( cred ) {
struct user_namespace * current_ns = current_user_ns ( ) ;
2012-05-24 02:39:45 +04:00
ucred - > uid = from_kuid_munged ( current_ns , cred - > euid ) ;
ucred - > gid = from_kgid_munged ( current_ns , cred - > egid ) ;
2010-06-13 07:28:59 +04:00
}
}
2005-04-17 02:20:36 +04:00
int sock_getsockopt ( struct socket * sock , int level , int optname ,
char __user * optval , int __user * optlen )
{
struct sock * sk = sock - > sk ;
2007-02-09 17:24:36 +03:00
2007-04-11 07:10:33 +04:00
union {
2007-02-09 17:24:36 +03:00
int val ;
struct linger ling ;
2005-04-17 02:20:36 +04:00
struct timeval tm ;
} v ;
2007-02-09 17:24:36 +03:00
2010-01-15 12:08:58 +03:00
int lv = sizeof ( int ) ;
2005-04-17 02:20:36 +04:00
int len ;
2007-02-09 17:24:36 +03:00
2007-04-11 07:10:33 +04:00
if ( get_user ( len , optlen ) )
2007-02-09 17:24:36 +03:00
return - EFAULT ;
2007-04-11 07:10:33 +04:00
if ( len < 0 )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2007-02-09 17:24:36 +03:00
2009-02-24 02:38:41 +03:00
memset ( & v , 0 , sizeof ( v ) ) ;
net: 4 bytes kernel memory disclosure in SO_BSDCOMPAT gsopt try #2
In function sock_getsockopt() located in net/core/sock.c, optval v.val
is not correctly initialized and directly returned in userland in case
we have SO_BSDCOMPAT option set.
This dummy code should trigger the bug:
int main(void)
{
unsigned char buf[4] = { 0, 0, 0, 0 };
int len;
int sock;
sock = socket(33, 2, 2);
getsockopt(sock, 1, SO_BSDCOMPAT, &buf, &len);
printf("%x%x%x%x\n", buf[0], buf[1], buf[2], buf[3]);
close(sock);
}
Here is a patch that fixes this bug by initializing v.val just after its
declaration.
Signed-off-by: Clément Lecigne <clement.lecigne@netasq.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-02-13 03:59:09 +03:00
2009-05-27 15:30:05 +04:00
switch ( optname ) {
2007-04-11 07:10:33 +04:00
case SO_DEBUG :
v . val = sock_flag ( sk , SOCK_DBG ) ;
break ;
case SO_DONTROUTE :
v . val = sock_flag ( sk , SOCK_LOCALROUTE ) ;
break ;
case SO_BROADCAST :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_BROADCAST ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_SNDBUF :
v . val = sk - > sk_sndbuf ;
break ;
case SO_RCVBUF :
v . val = sk - > sk_rcvbuf ;
break ;
case SO_REUSEADDR :
v . val = sk - > sk_reuse ;
break ;
2013-01-22 13:49:50 +04:00
case SO_REUSEPORT :
v . val = sk - > sk_reuseport ;
break ;
2007-04-11 07:10:33 +04:00
case SO_KEEPALIVE :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_KEEPOPEN ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_TYPE :
v . val = sk - > sk_type ;
break ;
2009-08-04 11:28:28 +04:00
case SO_PROTOCOL :
v . val = sk - > sk_protocol ;
break ;
2009-08-04 11:28:29 +04:00
case SO_DOMAIN :
v . val = sk - > sk_family ;
break ;
2007-04-11 07:10:33 +04:00
case SO_ERROR :
v . val = - sock_error ( sk ) ;
2009-05-27 15:30:05 +04:00
if ( v . val = = 0 )
2007-04-11 07:10:33 +04:00
v . val = xchg ( & sk - > sk_err_soft , 0 ) ;
break ;
case SO_OOBINLINE :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_URGINLINE ) ;
2007-04-11 07:10:33 +04:00
break ;
case SO_NO_CHECK :
2014-05-23 19:47:19 +04:00
v . val = sk - > sk_no_check_tx ;
2007-04-11 07:10:33 +04:00
break ;
case SO_PRIORITY :
v . val = sk - > sk_priority ;
break ;
case SO_LINGER :
lv = sizeof ( v . ling ) ;
2012-05-16 09:57:07 +04:00
v . ling . l_onoff = sock_flag ( sk , SOCK_LINGER ) ;
2007-04-11 07:10:33 +04:00
v . ling . l_linger = sk - > sk_lingertime / HZ ;
break ;
case SO_BSDCOMPAT :
sock_warn_obsolete_bsdism ( " getsockopt " ) ;
break ;
case SO_TIMESTAMP :
2007-03-26 09:14:49 +04:00
v . val = sock_flag ( sk , SOCK_RCVTSTAMP ) & &
! sock_flag ( sk , SOCK_RCVTSTAMPNS ) ;
break ;
case SO_TIMESTAMPNS :
v . val = sock_flag ( sk , SOCK_RCVTSTAMPNS ) ;
2007-04-11 07:10:33 +04:00
break ;
2009-02-12 08:03:38 +03:00
case SO_TIMESTAMPING :
2014-08-05 06:11:46 +04:00
v . val = sk - > sk_tsflags ;
2009-02-12 08:03:38 +03:00
break ;
2007-04-11 07:10:33 +04:00
case SO_RCVTIMEO :
2009-05-27 15:30:05 +04:00
lv = sizeof ( struct timeval ) ;
2007-04-11 07:10:33 +04:00
if ( sk - > sk_rcvtimeo = = MAX_SCHEDULE_TIMEOUT ) {
v . tm . tv_sec = 0 ;
v . tm . tv_usec = 0 ;
} else {
v . tm . tv_sec = sk - > sk_rcvtimeo / HZ ;
v . tm . tv_usec = ( ( sk - > sk_rcvtimeo % HZ ) * 1000000 ) / HZ ;
}
break ;
case SO_SNDTIMEO :
2009-05-27 15:30:05 +04:00
lv = sizeof ( struct timeval ) ;
2007-04-11 07:10:33 +04:00
if ( sk - > sk_sndtimeo = = MAX_SCHEDULE_TIMEOUT ) {
v . tm . tv_sec = 0 ;
v . tm . tv_usec = 0 ;
} else {
v . tm . tv_sec = sk - > sk_sndtimeo / HZ ;
v . tm . tv_usec = ( ( sk - > sk_sndtimeo % HZ ) * 1000000 ) / HZ ;
}
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_RCVLOWAT :
v . val = sk - > sk_rcvlowat ;
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_SNDLOWAT :
2009-05-27 15:30:05 +04:00
v . val = 1 ;
2007-04-11 07:10:33 +04:00
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_PASSCRED :
2012-04-27 00:07:59 +04:00
v . val = ! ! test_bit ( SOCK_PASSCRED , & sock - > flags ) ;
2007-04-11 07:10:33 +04:00
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_PEERCRED :
2010-06-13 07:30:14 +04:00
{
struct ucred peercred ;
if ( len > sizeof ( peercred ) )
len = sizeof ( peercred ) ;
cred_to_ucred ( sk - > sk_peer_pid , sk - > sk_peer_cred , & peercred ) ;
if ( copy_to_user ( optval , & peercred , len ) )
2007-04-11 07:10:33 +04:00
return - EFAULT ;
goto lenout ;
2010-06-13 07:30:14 +04:00
}
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_PEERNAME :
{
char address [ 128 ] ;
if ( sock - > ops - > getname ( sock , ( struct sockaddr * ) address , & lv , 2 ) )
return - ENOTCONN ;
if ( lv < len )
return - EINVAL ;
if ( copy_to_user ( optval , address , len ) )
return - EFAULT ;
goto lenout ;
}
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
/* Dubious BSD thing... Probably nobody even uses it, but
* the UNIX standard wants it for whatever reason . . . - DaveM
*/
case SO_ACCEPTCONN :
v . val = sk - > sk_state = = TCP_LISTEN ;
break ;
2005-04-17 02:20:36 +04:00
2007-04-11 07:10:33 +04:00
case SO_PASSSEC :
2012-04-27 00:07:59 +04:00
v . val = ! ! test_bit ( SOCK_PASSSEC , & sock - > flags ) ;
2007-04-11 07:10:33 +04:00
break ;
[AF_UNIX]: Datagram getpeersec
This patch implements an API whereby an application can determine the
label of its peer's Unix datagram sockets via the auxiliary data mechanism of
recvmsg.
Patch purpose:
This patch enables a security-aware application to retrieve the
security context of the peer of a Unix datagram socket. The application
can then use this security context to determine the security context for
processing on behalf of the peer who sent the packet.
Patch design and implementation:
The design and implementation is very similar to the UDP case for INET
sockets. Basically we build upon the existing Unix domain socket API for
retrieving user credentials. Linux offers the API for obtaining user
credentials via ancillary messages (i.e., out of band/control messages
that are bundled together with a normal message). To retrieve the security
context, the application first indicates to the kernel such desire by
setting the SO_PASSSEC option via setsockopt. Then the application
retrieves the security context using the auxiliary data mechanism.
An example server application for Unix datagram socket should look like this:
toggle = 1;
toggle_len = sizeof(toggle);
setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len);
recvmsg(sockfd, &msg_hdr, 0);
if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) {
cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr);
if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) &&
cmsg_hdr->cmsg_level == SOL_SOCKET &&
cmsg_hdr->cmsg_type == SCM_SECURITY) {
memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext));
}
}
sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow
a server socket to receive security context of the peer.
Testing:
We have tested the patch by setting up Unix datagram client and server
applications. We verified that the server can retrieve the security context
using the auxiliary data mechanism of recvmsg.
Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-29 23:27:47 +04:00
2007-04-11 07:10:33 +04:00
case SO_PEERSEC :
return security_socket_getpeersec_stream ( sock , optval , optlen , len ) ;
2005-04-17 02:20:36 +04:00
2008-01-31 06:08:16 +03:00
case SO_MARK :
v . val = sk - > sk_mark ;
break ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch. It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
case SO_RXQ_OVFL :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_RXQ_OVFL ) ;
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch. It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
break ;
2011-11-09 13:15:42 +04:00
case SO_WIFI_STATUS :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_WIFI_STATUS ) ;
2011-11-09 13:15:42 +04:00
break ;
2012-02-21 11:31:34 +04:00
case SO_PEEK_OFF :
if ( ! sock - > ops - > set_peek_off )
return - EOPNOTSUPP ;
v . val = sk - > sk_peek_off ;
break ;
2012-02-24 23:48:34 +04:00
case SO_NOFCS :
2012-05-16 09:57:07 +04:00
v . val = sock_flag ( sk , SOCK_NOFCS ) ;
2012-02-24 23:48:34 +04:00
break ;
2012-11-26 09:21:08 +04:00
2012-10-19 03:55:56 +04:00
case SO_BINDTODEVICE :
2012-11-26 09:21:08 +04:00
return sock_getbindtodevice ( sk , optval , optlen , len ) ;
sk-filter: Add ability to get socket filter program (v2)
The SO_ATTACH_FILTER option is set only. I propose to add the get
ability by using SO_ATTACH_FILTER in getsockopt. To be less
irritating to eyes the SO_GET_FILTER alias to it is declared. This
ability is required by checkpoint-restore project to be able to
save full state of a socket.
There are two issues with getting filter back.
First, kernel modifies the sock_filter->code on filter load, thus in
order to return the filter element back to user we have to decode it
into user-visible constants. Fortunately the modification in question
is interconvertible.
Second, the BPF_S_ALU_DIV_K code modifies the command argument k to
speed up the run-time division by doing kernel_k = reciprocal(user_k).
Bad news is that different user_k may result in same kernel_k, so we
can't get the original user_k back. Good news is that we don't have
to do it. What we need to is calculate a user2_k so, that
reciprocal(user2_k) == reciprocal(user_k) == kernel_k
i.e. if it's re-loaded back the compiled again value will be exactly
the same as it was. That said, the user2_k can be calculated like this
user2_k = reciprocal(kernel_k)
with an exception, that if kernel_k == 0, then user2_k == 1.
The optlen argument is treated like this -- when zero, kernel returns
the amount of sock_fprog elements in filter, otherwise it should be
large enough for the sock_fprog array.
changes since v1:
* Declared SO_GET_FILTER in all arch headers
* Added decode of vlan-tag codes
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-01 06:01:48 +04:00
case SO_GET_FILTER :
len = sk_get_filter ( sk , ( struct sock_filter __user * ) optval , len ) ;
if ( len < 0 )
return len ;
goto lenout ;
2012-11-26 09:21:08 +04:00
2013-01-17 01:55:49 +04:00
case SO_LOCK_FILTER :
v . val = sock_flag ( sk , SOCK_FILTER_LOCKED ) ;
break ;
2014-01-17 20:09:45 +04:00
case SO_BPF_EXTENSIONS :
v . val = bpf_tell_extensions ( ) ;
break ;
2013-03-28 15:19:25 +04:00
case SO_SELECT_ERR_QUEUE :
v . val = sock_flag ( sk , SOCK_SELECT_ERR_QUEUE ) ;
break ;
2013-08-01 07:10:25 +04:00
# ifdef CONFIG_NET_RX_BUSY_POLL
2013-07-10 18:13:36 +04:00
case SO_BUSY_POLL :
2013-06-14 17:33:57 +04:00
v . val = sk - > sk_ll_usec ;
break ;
# endif
2013-09-24 19:20:52 +04:00
case SO_MAX_PACING_RATE :
v . val = sk - > sk_max_pacing_rate ;
break ;
2007-04-11 07:10:33 +04:00
default :
return - ENOPROTOOPT ;
2005-04-17 02:20:36 +04:00
}
2007-04-11 07:10:33 +04:00
2005-04-17 02:20:36 +04:00
if ( len > lv )
len = lv ;
if ( copy_to_user ( optval , & v , len ) )
return - EFAULT ;
lenout :
2007-02-09 17:24:36 +03:00
if ( put_user ( len , optlen ) )
return - EFAULT ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
2006-07-03 11:25:35 +04:00
/*
 * Initialize the sk_lock of @sk.
 *
 * The lock is set up through sock_lock_init_class_and_name(), which is
 * handed the name strings and key entries selected by sk->sk_family from
 * the af_family_slock_* and af_family_* tables.
 *
 * (We also register the sk_lock with the lock validator.)
 */
2007-03-22 22:27:49 +03:00
static inline void sock_lock_init ( struct sock * sk )
2006-07-03 11:25:35 +04:00
{
2006-12-07 07:35:24 +03:00
sock_lock_init_class_and_name ( sk ,
af_family_slock_key_strings [ sk - > sk_family ] ,
af_family_slock_keys + sk - > sk_family ,
af_family_key_strings [ sk - > sk_family ] ,
af_family_keys + sk - > sk_family ) ;
2006-07-03 11:25:35 +04:00
}
2009-07-16 03:13:10 +04:00
/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
2009-07-16 03:13:10 +04:00
 *
 * The copy is done as two memcpy() calls: one for the bytes in front of
 * sk_dontcopy_begin and one from sk_dontcopy_end up to the protocol's
 * obj_size. With CONFIG_SECURITY_NETWORK, nsk->sk_security is saved before
 * the copy and put back afterwards, then security_sk_clone() is called.
 */
2007-11-01 10:29:45 +03:00
static void sock_copy ( struct sock * nsk , const struct sock * osk )
{
# ifdef CONFIG_SECURITY_NETWORK
/* Keep nsk's own sk_security; it is restored after the memcpy()s below. */
void * sptr = nsk - > sk_security ;
# endif
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
/* Part 1: everything located before sk_dontcopy_begin. */
memcpy ( nsk , osk , offsetof ( struct sock , sk_dontcopy_begin ) ) ;
/* Part 2: from sk_dontcopy_end to the end of the protocol-sized object. */
memcpy ( & nsk - > sk_dontcopy_end , & osk - > sk_dontcopy_end ,
osk - > sk_prot - > obj_size - offsetof ( struct sock , sk_dontcopy_end ) ) ;
2007-11-01 10:29:45 +03:00
# ifdef CONFIG_SECURITY_NETWORK
/* Put back the pointer saved above before calling security_sk_clone(). */
nsk - > sk_security = sptr ;
security_sk_clone ( osk , nsk ) ;
# endif
}
2010-12-17 01:26:56 +03:00
/*
 * Zero the first @size bytes of @sk, except for the two pointer-sized
 * slots holding skc_node.next and skc_portaddr_node.next, which are
 * left untouched.
 */
void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	char *base = (char *)sk;
	unsigned long first = offsetof(struct sock, __sk_common.skc_node.next);
	unsigned long second = offsetof(struct sock,
					__sk_common.skc_portaddr_node.next);

	/* Sort the two preserved slots by their offset inside the sock. */
	if (first > second)
		swap(first, second);

	/* Region in front of the lower preserved pointer (may be empty). */
	if (first != 0)
		memset(base, 0, first);

	/* Region between the two preserved pointers. */
	memset(base + first + sizeof(void *), 0,
	       second - first - sizeof(void *));

	/* Region behind the higher preserved pointer, up to @size. */
	memset(base + second + sizeof(void *), 0,
	       size - second - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
2007-11-01 10:36:26 +03:00
/*
 * Allocate the memory for a new sock of protocol @prot.
 *
 * @prot:     protocol whose slab / obj_size / owner are used
 * @priority: allocation flags; __GFP_ZERO requests a cleared object
 * @family:   passed on to security_sk_alloc()
 *
 * The object comes from prot->slab when the protocol has one and from
 * kmalloc(prot->obj_size) otherwise. On success security_sk_alloc() has
 * succeeded, a reference on prot->owner is held and sk_tx_queue_clear()
 * has been called. Returns NULL if the allocation, the security
 * allocation or try_module_get() fails; whatever was acquired up to that
 * point is released again.
 */
static struct sock * sk_prot_alloc ( struct proto * prot , gfp_t priority ,
int family )
2007-11-01 10:33:50 +03:00
{
struct sock * sk ;
struct kmem_cache * slab ;
slab = prot - > slab ;
2009-07-08 23:36:05 +04:00
if ( slab ! = NULL ) {
/*
 * The slab allocation itself is done without __GFP_ZERO; if the
 * caller asked for zeroing it is carried out below by
 * prot->clear_sk() or, by default, sk_prot_clear_nulls().
 */
sk = kmem_cache_alloc ( slab , priority & ~ __GFP_ZERO ) ;
if ( ! sk )
return sk ;
if ( priority & __GFP_ZERO ) {
2010-12-17 01:26:56 +03:00
if ( prot - > clear_sk )
prot - > clear_sk ( sk , prot - > obj_size ) ;
else
sk_prot_clear_nulls ( sk , prot - > obj_size ) ;
2009-07-08 23:36:05 +04:00
}
2010-12-17 01:26:56 +03:00
} else
2007-11-01 10:33:50 +03:00
sk = kmalloc ( prot - > obj_size , priority ) ;
2007-11-01 10:36:26 +03:00
if ( sk ! = NULL ) {
2009-02-26 16:46:57 +03:00
kmemcheck_annotate_bitfield ( sk , flags ) ;
2007-11-01 10:36:26 +03:00
if ( security_sk_alloc ( sk , family , priority ) )
goto out_free ;
if ( ! try_module_get ( prot - > owner ) )
goto out_free_sec ;
2009-10-20 03:46:20 +04:00
sk_tx_queue_clear ( sk ) ;
2007-11-01 10:36:26 +03:00
}
2007-11-01 10:33:50 +03:00
return sk ;
2007-11-01 10:36:26 +03:00
/* Error unwind: undo security_sk_alloc(), then give the memory back. */
out_free_sec :
security_sk_free ( sk ) ;
out_free :
if ( slab ! = NULL )
kmem_cache_free ( slab , sk ) ;
else
kfree ( sk ) ;
return NULL ;
2007-11-01 10:33:50 +03:00
}
/*
 * Release a sock's memory; counterpart of sk_prot_alloc().
 *
 * Calls security_sk_free(), returns the object to prot->slab (or
 * kfree()s it when the protocol has no slab) and finally drops the
 * module reference on prot->owner.
 */
static void sk_prot_free ( struct proto * prot , struct sock * sk )
{
struct kmem_cache * slab ;
2007-11-01 10:36:26 +03:00
struct module * owner ;
2007-11-01 10:33:50 +03:00
2007-11-01 10:36:26 +03:00
/* owner and slab are read from @prot up front, before sk is freed. */
owner = prot - > owner ;
2007-11-01 10:33:50 +03:00
slab = prot - > slab ;
2007-11-01 10:36:26 +03:00
security_sk_free ( sk ) ;
2007-11-01 10:33:50 +03:00
if ( slab ! = NULL )
kmem_cache_free ( slab , sk ) ;
else
kfree ( sk ) ;
2007-11-01 10:36:26 +03:00
module_put ( owner ) ;
2007-11-01 10:33:50 +03:00
}
2013-12-29 20:27:11 +04:00
# if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2013-04-09 00:03:47 +04:00
/*
 * Set sk->sk_cgrp_prioidx to task_netprioidx(current).
 *
 * Returns without touching @sk when in_interrupt() is true.
 * NOTE(review): presumably because 'current' is not related to the
 * socket in interrupt context - confirm.
 */
void sock_update_netprioidx ( struct sock * sk )
2011-11-22 09:10:51 +04:00
{
if ( in_interrupt ( ) )
return ;
2012-02-10 09:43:38 +04:00
2013-04-09 00:03:47 +04:00
sk - > sk_cgrp_prioidx = task_netprioidx ( current ) ;
2011-11-22 09:10:51 +04:00
}
EXPORT_SYMBOL_GPL ( sock_update_netprioidx ) ;
cls_cgroup: Store classid in struct sock
Up until now cls_cgroup has relied on fetching the classid out of
the current executing thread. This runs into trouble when a packet
processing is delayed in which case it may execute out of another
thread's context.
Furthermore, even when a packet is not delayed we may fail to
classify it if soft IRQs have been disabled, because this scenario
is indistinguishable from one where a packet unrelated to the
current thread is processed by a real soft IRQ.
In fact, the current semantics is inherently broken, as a single
skb may be constructed out of the writes of two different tasks.
A different manifestation of this problem is when the TCP stack
transmits in response of an incoming ACK. This is currently
unclassified.
As we already have a concept of packet ownership for accounting
purposes in the skb->sk pointer, this is a natural place to store
the classid in a persistent manner.
This patch adds the cls_cgroup classid in struct sock, filling up
an existing hole on 64-bit :)
The value is set at socket creation time. So all sockets created
via socket(2) automatically gains the ID of the thread creating it.
Whenever another process touches the socket by either reading or
writing to it, we will change the socket classid to that of the
process if it has a valid (non-zero) classid.
For sockets created on inbound connections through accept(2), we
inherit the classid of the original listening socket through
sk_clone, possibly preceding the actual accept(2) call.
In order to minimise risks, I have not made this the authoritative
classid. For now it is only used as a backup when we execute
with soft IRQs disabled. Once we're completely happy with its
semantics we can use it as the sole classid.
Footnote: I have rearranged the error path on cls_group module
creation. If we didn't do this, then there is a window where
someone could create a tc rule using cls_group before the cgroup
subsystem has been registered.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-24 11:12:34 +04:00
# endif
2005-04-17 02:20:36 +04:00
/**
 * sk_alloc - All socket objects are allocated here
2007-10-13 08:17:49 +04:00
 * @net: the applicable net namespace
2005-05-01 19:59:25 +04:00
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 *
 * The sock is obtained from sk_prot_alloc() with __GFP_ZERO added to
 * @priority. On success the family, sk_prot / sk_prot_creator, the
 * sk_lock and the net namespace (a reference is taken with get_net())
 * are set up, sk_wmem_alloc starts at 1, and sock_update_classid() and
 * sock_update_netprioidx() are called.
 *
 * Returns the new sock, or NULL if sk_prot_alloc() failed.
2005-04-17 02:20:36 +04:00
 */
2007-10-09 10:24:22 +04:00
struct sock * sk_alloc ( struct net * net , int family , gfp_t priority ,
2007-11-01 10:39:31 +03:00
struct proto * prot )
2005-04-17 02:20:36 +04:00
{
2007-11-01 10:33:50 +03:00
struct sock * sk ;
2005-04-17 02:20:36 +04:00
2007-11-01 10:38:43 +03:00
sk = sk_prot_alloc ( prot , priority | __GFP_ZERO , family ) ;
2005-04-17 02:20:36 +04:00
if ( sk ) {
2007-11-01 10:38:43 +03:00
sk - > sk_family = family ;
/*
 * See comment in struct sock definition to understand
 * why we need sk_prot_creator -acme
 */
sk - > sk_prot = sk - > sk_prot_creator = prot ;
sock_lock_init ( sk ) ;
2008-03-25 20:26:21 +03:00
sock_net_set ( sk , get_net ( net ) ) ;
2009-08-31 03:15:36 +04:00
/* Initial unit of sk_wmem_alloc; sk_free() drops it again. */
atomic_set ( & sk - > sk_wmem_alloc , 1 ) ;
cls_cgroup: Store classid in struct sock
Up until now cls_cgroup has relied on fetching the classid out of
the current executing thread. This runs into trouble when a packet
processing is delayed in which case it may execute out of another
thread's context.
Furthermore, even when a packet is not delayed we may fail to
classify it if soft IRQs have been disabled, because this scenario
is indistinguishable from one where a packet unrelated to the
current thread is processed by a real soft IRQ.
In fact, the current semantics is inherently broken, as a single
skb may be constructed out of the writes of two different tasks.
A different manifestation of this problem is when the TCP stack
transmits in response of an incoming ACK. This is currently
unclassified.
As we already have a concept of packet ownership for accounting
purposes in the skb->sk pointer, this is a natural place to store
the classid in a persistent manner.
This patch adds the cls_cgroup classid in struct sock, filling up
an existing hole on 64-bit :)
The value is set at socket creation time. So all sockets created
via socket(2) automatically gains the ID of the thread creating it.
Whenever another process touches the socket by either reading or
writing to it, we will change the socket classid to that of the
process if it has a valid (non-zero) classid.
For sockets created on inbound connections through accept(2), we
inherit the classid of the original listening socket through
sk_clone, possibly preceding the actual accept(2) call.
In order to minimise risks, I have not made this the authoritative
classid. For now it is only used as a backup when we execute
with soft IRQs disabled. Once we're completely happy with its
semantics we can use it as the sole classid.
Footnote: I have rearranged the error path on cls_group module
creation. If we didn't do this, then there is a window where
someone could create a tc rule using cls_group before the cgroup
subsystem has been registered.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-24 11:12:34 +04:00
2013-04-09 00:03:35 +04:00
sock_update_classid ( sk ) ;
2013-04-09 00:03:47 +04:00
sock_update_netprioidx ( sk ) ;
2005-04-17 02:20:36 +04:00
}
2005-09-28 02:23:38 +04:00
2007-11-01 10:36:26 +03:00
return sk ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sk_alloc ) ;
2005-04-17 02:20:36 +04:00
net: No more expensive sock_hold()/sock_put() on each tx
One of the problems with sock memory accounting is that it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomically check if any packets
are in flight.
skb_set_owner_w() doesn't call sock_hold() anymore
sock_wfree() doesn't call sock_put() anymore, but checks if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
/*
 * Final teardown of a sock.
 *
 * Called by sk_free() when its atomic_dec_and_test() on sk_wmem_alloc
 * succeeds. In order: runs sk->sk_destruct if set, uncharges and clears
 * an attached sk_filter, disables timestamping, reports any remaining
 * sk_omem_alloc, drops the peer cred / peer pid / net namespace
 * references and finally frees the object with sk_prot_free() using
 * sk->sk_prot_creator.
 */
static void __sk_free ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
struct sk_filter * filter ;
/* Destructor callback, if one is set. */
if ( sk - > sk_destruct )
sk - > sk_destruct ( sk ) ;
2010-02-23 04:04:49 +03:00
/* The dereference is checked against sk_wmem_alloc being 0. */
filter = rcu_dereference_check ( sk - > sk_filter ,
atomic_read ( & sk - > sk_wmem_alloc ) = = 0 ) ;
2005-04-17 02:20:36 +04:00
if ( filter ) {
2007-10-18 08:21:51 +04:00
sk_filter_uncharge ( sk , filter ) ;
2011-08-01 20:19:00 +04:00
RCU_INIT_POINTER ( sk - > sk_filter , NULL ) ;
2005-04-17 02:20:36 +04:00
}
2011-11-28 16:04:18 +04:00
sock_disable_timestamp ( sk , SK_FLAGS_TIMESTAMP ) ;
2005-04-17 02:20:36 +04:00
/* A non-zero sk_omem_alloc at this point is only reported, not fixed. */
if ( atomic_read ( & sk - > sk_omem_alloc ) )
2012-05-16 23:58:40 +04:00
pr_debug ( " %s: optmem leakage (%d bytes) detected \n " ,
__func__ , atomic_read ( & sk - > sk_omem_alloc ) ) ;
2005-04-17 02:20:36 +04:00
2010-06-13 07:30:14 +04:00
/* Drop the references held through the sock, then free its memory. */
if ( sk - > sk_peer_cred )
put_cred ( sk - > sk_peer_cred ) ;
put_pid ( sk - > sk_peer_pid ) ;
2008-03-25 20:26:21 +03:00
put_net ( sock_net ( sk ) ) ;
2007-11-01 10:33:50 +03:00
sk_prot_free ( sk - > sk_prot_creator , sk ) ;
2005-04-17 02:20:36 +04:00
}
net: No more expensive sock_hold()/sock_put() on each tx
One of the problems with sock memory accounting is that it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomically check if any packets
are in flight.
skb_set_owner_w() doesn't call sock_hold() anymore
sock_wfree() doesn't call sock_put() anymore, but checks if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
/*
 * Drop one unit of sk->sk_wmem_alloc and, if that brings the counter to
 * zero, tear the sock down with __sk_free().
 */
void sk_free ( struct sock * sk )
{
/*
2011-03-31 05:57:33 +04:00
* We subtract one from sk_wmem_alloc and can know if
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
* some packets are still in some tx queue .
* If not null , sock_wfree ( ) will call __sk_free ( sk ) later
*/
if ( atomic_dec_and_test ( & sk - > sk_wmem_alloc ) )
__sk_free ( sk ) ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sk_free ) ;
2005-04-17 02:20:36 +04:00
2008-02-29 22:18:32 +03:00
/*
2011-03-31 05:57:33 +04:00
 * Last sock_put should drop reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking reference to stopping namespace
2008-02-29 22:18:32 +03:00
 * is not an option.
2011-03-31 05:57:33 +04:00
 * Take reference to a socket to remove it from hash _alive_ and after that
2008-02-29 22:18:32 +03:00
 * destroy it in the context of init_net.
 *
 * Does nothing when @sk or sk->sk_socket is NULL.
 */
void sk_release_kernel ( struct sock * sk )
{
if ( sk = = NULL | | sk - > sk_socket = = NULL )
return ;
/* Hold sk across sock_release(); the matching sock_put() is at the end. */
sock_hold ( sk ) ;
sock_release ( sk - > sk_socket ) ;
2008-04-16 12:59:46 +04:00
/* Point the sock at init_net (taking a reference) before the final put. */
release_net ( sock_net ( sk ) ) ;
2008-03-25 20:26:21 +03:00
sock_net_set ( sk , get_net ( & init_net ) ) ;
2008-02-29 22:18:32 +03:00
sock_put ( sk ) ;
}
2008-02-29 22:33:19 +03:00
EXPORT_SYMBOL ( sk_release_kernel ) ;
2008-02-29 22:18:32 +03:00
2012-01-09 09:33:16 +04:00
static void sk_update_clone ( const struct sock * sk , struct sock * newsk )
{
if ( mem_cgroup_sockets_enabled & & sk - > sk_cgrp )
sock_update_memcg ( newsk ) ;
}
2011-11-09 02:07:07 +04:00
/**
* sk_clone_lock - clone a socket , and lock its clone
* @ sk : the socket to clone
* @ priority : for allocation ( % GFP_KERNEL , % GFP_ATOMIC , etc )
*
* Caller must unlock socket even in error path ( bh_unlock_sock ( newsk ) )
*/
struct sock * sk_clone_lock ( const struct sock * sk , const gfp_t priority )
2005-08-10 07:10:12 +04:00
{
2007-11-01 10:37:32 +03:00
struct sock * newsk ;
2014-07-31 07:34:12 +04:00
bool is_charged = true ;
2005-08-10 07:10:12 +04:00
2007-11-01 10:37:32 +03:00
newsk = sk_prot_alloc ( sk - > sk_prot , priority , sk - > sk_family ) ;
2005-08-10 07:10:12 +04:00
if ( newsk ! = NULL ) {
struct sk_filter * filter ;
2006-08-05 10:08:56 +04:00
sock_copy ( newsk , sk ) ;
2005-08-10 07:10:12 +04:00
/* SANITY */
2008-03-25 20:26:21 +03:00
get_net ( sock_net ( newsk ) ) ;
2005-08-10 07:10:12 +04:00
sk_node_init ( & newsk - > sk_node ) ;
sock_lock_init ( newsk ) ;
bh_lock_sock ( newsk ) ;
2007-03-05 03:05:44 +03:00
newsk - > sk_backlog . head = newsk - > sk_backlog . tail = NULL ;
2010-03-04 21:01:40 +03:00
newsk - > sk_backlog . len = 0 ;
2005-08-10 07:10:12 +04:00
atomic_set ( & newsk - > sk_rmem_alloc , 0 ) ;
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
/*
* sk_wmem_alloc set to one ( see sk_free ( ) and sock_wfree ( ) )
*/
atomic_set ( & newsk - > sk_wmem_alloc , 1 ) ;
2005-08-10 07:10:12 +04:00
atomic_set ( & newsk - > sk_omem_alloc , 0 ) ;
skb_queue_head_init ( & newsk - > sk_receive_queue ) ;
skb_queue_head_init ( & newsk - > sk_write_queue ) ;
2006-05-24 04:55:33 +04:00
# ifdef CONFIG_NET_DMA
skb_queue_head_init ( & newsk - > sk_async_wait_queue ) ;
# endif
2005-08-10 07:10:12 +04:00
2010-04-09 03:03:29 +04:00
spin_lock_init ( & newsk - > sk_dst_lock ) ;
2005-08-10 07:10:12 +04:00
rwlock_init ( & newsk - > sk_callback_lock ) ;
2007-07-19 12:49:00 +04:00
lockdep_set_class_and_name ( & newsk - > sk_callback_lock ,
af_callback_keys + newsk - > sk_family ,
af_family_clock_key_strings [ newsk - > sk_family ] ) ;
2005-08-10 07:10:12 +04:00
newsk - > sk_dst_cache = NULL ;
newsk - > sk_wmem_queued = 0 ;
newsk - > sk_forward_alloc = 0 ;
newsk - > sk_send_head = NULL ;
newsk - > sk_userlocks = sk - > sk_userlocks & ~ SOCK_BINDPORT_LOCK ;
sock_reset_flag ( newsk , SOCK_DONE ) ;
skb_queue_head_init ( & newsk - > sk_error_queue ) ;
2010-10-25 07:47:05 +04:00
filter = rcu_dereference_protected ( newsk - > sk_filter , 1 ) ;
2005-08-10 07:10:12 +04:00
if ( filter ! = NULL )
2014-07-31 07:34:12 +04:00
/* though it's an empty new sock, the charging may fail
* if sysctl_optmem_max was changed between creation of
* original socket and cloning
*/
is_charged = sk_filter_charge ( newsk , filter ) ;
2005-08-10 07:10:12 +04:00
2014-07-31 07:34:12 +04:00
if ( unlikely ( ! is_charged | | xfrm_sk_clone_policy ( newsk ) ) ) {
2005-08-10 07:10:12 +04:00
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free ( ) */
newsk - > sk_destruct = NULL ;
2011-10-25 06:30:50 +04:00
bh_unlock_sock ( newsk ) ;
2005-08-10 07:10:12 +04:00
sk_free ( newsk ) ;
newsk = NULL ;
goto out ;
}
newsk - > sk_err = 0 ;
newsk - > sk_priority = 0 ;
2009-07-16 03:13:10 +04:00
/*
* Before updating sk_refcnt , we must commit prior changes to memory
* ( Documentation / RCU / rculist_nulls . txt for details )
*/
smp_wmb ( ) ;
2005-08-10 07:10:12 +04:00
atomic_set ( & newsk - > sk_refcnt , 2 ) ;
/*
* Increment the counter in the same struct proto as the master
* sock ( sk_refcnt_debug_inc uses newsk - > sk_prot - > socks , that
* is the same as sk - > sk_prot - > socks , as this field was copied
* with memcpy ) .
*
* This _changes_ the previous behaviour , where
* tcp_create_openreq_child always was incrementing the
* equivalent to tcp_prot - > socks ( inet_sock_nr ) , so this have
* to be taken into account in all callers . - acme
*/
sk_refcnt_debug_inc ( newsk ) ;
2008-06-18 09:41:38 +04:00
sk_set_socket ( newsk , NULL ) ;
2010-04-29 15:01:49 +04:00
newsk - > sk_wq = NULL ;
2005-08-10 07:10:12 +04:00
2012-01-06 00:16:39 +04:00
sk_update_clone ( sk , newsk ) ;
2005-08-10 07:10:12 +04:00
if ( newsk - > sk_prot - > sockets_allocated )
2011-12-12 01:47:02 +04:00
sk_sockets_allocated_inc ( newsk ) ;
2010-01-08 11:00:09 +03:00
2011-11-28 16:04:18 +04:00
if ( newsk - > sk_flags & SK_FLAGS_TIMESTAMP )
2010-01-08 11:00:09 +03:00
net_enable_timestamp ( ) ;
2005-08-10 07:10:12 +04:00
}
out :
return newsk ;
}
2011-11-09 02:07:07 +04:00
EXPORT_SYMBOL_GPL ( sk_clone_lock ) ;
2005-08-10 07:10:12 +04:00
2007-04-21 04:12:43 +04:00
void sk_setup_caps ( struct sock * sk , struct dst_entry * dst )
{
__sk_dst_set ( sk , dst ) ;
sk - > sk_route_caps = dst - > dev - > features ;
if ( sk - > sk_route_caps & NETIF_F_GSO )
2007-06-01 09:15:50 +04:00
sk - > sk_route_caps | = NETIF_F_GSO_SOFTWARE ;
2010-05-16 11:36:33 +04:00
sk - > sk_route_caps & = ~ sk - > sk_route_nocaps ;
2007-04-21 04:12:43 +04:00
if ( sk_can_gso ( sk ) ) {
[NET]: Add per-connection option to set max TSO frame size
Update: My mailer ate one of Jarek's feedback mails... Fixed the
parameter in netif_set_gso_max_size() to be u32, not u16. Fixed the
whitespace issue due to a patch import botch. Changed the types from
u32 to unsigned int to be more consistent with other variables in the
area. Also brought the patch up to the latest net-2.6.26 tree.
Update: Made gso_max_size container 32 bits, not 16. Moved the
location of gso_max_size within netdev to be less hotpath. Made more
consistent names between the sock and netdev layers, and added a
define for the max GSO size.
Update: Respun for net-2.6.26 tree.
Update: changed max_gso_frame_size and sk_gso_max_size from signed to
unsigned - thanks Stephen!
This patch adds the ability for device drivers to control the size of
the TSO frames being sent to them, per TCP connection. By setting the
netdevice's gso_max_size value, the socket layer will set the GSO
frame size based on that value. This will propogate into the TCP
layer, and send TSO's of that size to the hardware.
This can be desirable to help tune the bursty nature of TSO on a
per-adapter basis, where one may have 1 GbE and 10 GbE devices
coexisting in a system, one running multiqueue and the other not, etc.
This can also be desirable for devices that cannot support full 64 KB
TSO's, but still want to benefit from some level of segmentation
offloading.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-03-21 13:43:19 +03:00
if ( dst - > header_len ) {
2007-04-21 04:12:43 +04:00
sk - > sk_route_caps & = ~ NETIF_F_GSO_MASK ;
[NET]: Add per-connection option to set max TSO frame size
Update: My mailer ate one of Jarek's feedback mails... Fixed the
parameter in netif_set_gso_max_size() to be u32, not u16. Fixed the
whitespace issue due to a patch import botch. Changed the types from
u32 to unsigned int to be more consistent with other variables in the
area. Also brought the patch up to the latest net-2.6.26 tree.
Update: Made gso_max_size container 32 bits, not 16. Moved the
location of gso_max_size within netdev to be less hotpath. Made more
consistent names between the sock and netdev layers, and added a
define for the max GSO size.
Update: Respun for net-2.6.26 tree.
Update: changed max_gso_frame_size and sk_gso_max_size from signed to
unsigned - thanks Stephen!
This patch adds the ability for device drivers to control the size of
the TSO frames being sent to them, per TCP connection. By setting the
netdevice's gso_max_size value, the socket layer will set the GSO
frame size based on that value. This will propogate into the TCP
layer, and send TSO's of that size to the hardware.
This can be desirable to help tune the bursty nature of TSO on a
per-adapter basis, where one may have 1 GbE and 10 GbE devices
coexisting in a system, one running multiqueue and the other not, etc.
This can also be desirable for devices that cannot support full 64 KB
TSO's, but still want to benefit from some level of segmentation
offloading.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-03-21 13:43:19 +03:00
} else {
2007-04-21 04:12:43 +04:00
sk - > sk_route_caps | = NETIF_F_SG | NETIF_F_HW_CSUM ;
[NET]: Add per-connection option to set max TSO frame size
Update: My mailer ate one of Jarek's feedback mails... Fixed the
parameter in netif_set_gso_max_size() to be u32, not u16. Fixed the
whitespace issue due to a patch import botch. Changed the types from
u32 to unsigned int to be more consistent with other variables in the
area. Also brought the patch up to the latest net-2.6.26 tree.
Update: Made gso_max_size container 32 bits, not 16. Moved the
location of gso_max_size within netdev to be less hotpath. Made more
consistent names between the sock and netdev layers, and added a
define for the max GSO size.
Update: Respun for net-2.6.26 tree.
Update: changed max_gso_frame_size and sk_gso_max_size from signed to
unsigned - thanks Stephen!
This patch adds the ability for device drivers to control the size of
the TSO frames being sent to them, per TCP connection. By setting the
netdevice's gso_max_size value, the socket layer will set the GSO
frame size based on that value. This will propogate into the TCP
layer, and send TSO's of that size to the hardware.
This can be desirable to help tune the bursty nature of TSO on a
per-adapter basis, where one may have 1 GbE and 10 GbE devices
coexisting in a system, one running multiqueue and the other not, etc.
This can also be desirable for devices that cannot support full 64 KB
TSO's, but still want to benefit from some level of segmentation
offloading.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-03-21 13:43:19 +03:00
sk - > sk_gso_max_size = dst - > dev - > gso_max_size ;
2012-07-30 20:11:42 +04:00
sk - > sk_gso_max_segs = dst - > dev - > gso_max_segs ;
[NET]: Add per-connection option to set max TSO frame size
Update: My mailer ate one of Jarek's feedback mails... Fixed the
parameter in netif_set_gso_max_size() to be u32, not u16. Fixed the
whitespace issue due to a patch import botch. Changed the types from
u32 to unsigned int to be more consistent with other variables in the
area. Also brought the patch up to the latest net-2.6.26 tree.
Update: Made gso_max_size container 32 bits, not 16. Moved the
location of gso_max_size within netdev to be less hotpath. Made more
consistent names between the sock and netdev layers, and added a
define for the max GSO size.
Update: Respun for net-2.6.26 tree.
Update: changed max_gso_frame_size and sk_gso_max_size from signed to
unsigned - thanks Stephen!
This patch adds the ability for device drivers to control the size of
the TSO frames being sent to them, per TCP connection. By setting the
netdevice's gso_max_size value, the socket layer will set the GSO
frame size based on that value. This will propogate into the TCP
layer, and send TSO's of that size to the hardware.
This can be desirable to help tune the bursty nature of TSO on a
per-adapter basis, where one may have 1 GbE and 10 GbE devices
coexisting in a system, one running multiqueue and the other not, etc.
This can also be desirable for devices that cannot support full 64 KB
TSO's, but still want to benefit from some level of segmentation
offloading.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-03-21 13:43:19 +03:00
}
2007-04-21 04:12:43 +04:00
}
}
EXPORT_SYMBOL_GPL ( sk_setup_caps ) ;
2005-04-17 02:20:36 +04:00
/*
* Simple resource managers for sockets .
*/
2007-02-09 17:24:36 +03:00
/*
* Write buffer destructor automatically called from kfree_skb .
2005-04-17 02:20:36 +04:00
*/
void sock_wfree ( struct sk_buff * skb )
{
struct sock * sk = skb - > sk ;
2009-09-24 14:49:24 +04:00
unsigned int len = skb - > truesize ;
2005-04-17 02:20:36 +04:00
2009-09-24 14:49:24 +04:00
if ( ! sock_flag ( sk , SOCK_USE_WRITE_QUEUE ) ) {
/*
* Keep a reference on sk_wmem_alloc , this will be released
* after sk_write_space ( ) call
*/
atomic_sub ( len - 1 , & sk - > sk_wmem_alloc ) ;
2005-04-17 02:20:36 +04:00
sk - > sk_write_space ( sk ) ;
2009-09-24 14:49:24 +04:00
len = 1 ;
}
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
/*
2009-09-24 14:49:24 +04:00
* if sk_wmem_alloc reaches 0 , we must finish what sk_free ( )
* could not do because of in - flight packets
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
*/
2009-09-24 14:49:24 +04:00
if ( atomic_sub_and_test ( len , & sk - > sk_wmem_alloc ) )
net: No more expensive sock_hold()/sock_put() on each tx
One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.
We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.
As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.
skb_set_owner_w() doesnt call sock_hold() anymore
sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.
Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.
Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-06-11 13:55:43 +04:00
__sk_free ( sk ) ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_wfree ) ;
2005-04-17 02:20:36 +04:00
2013-07-31 04:55:08 +04:00
void skb_orphan_partial ( struct sk_buff * skb )
{
/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
* so we do not completely orphan skb , but transfert all
* accounted bytes but one , to avoid unexpected reorders .
*/
if ( skb - > destructor = = sock_wfree
# ifdef CONFIG_INET
| | skb - > destructor = = tcp_wfree
# endif
) {
atomic_sub ( skb - > truesize - 1 , & skb - > sk - > sk_wmem_alloc ) ;
skb - > truesize = 1 ;
} else {
skb_orphan ( skb ) ;
}
}
EXPORT_SYMBOL ( skb_orphan_partial ) ;
2007-02-09 17:24:36 +03:00
/*
* Read buffer destructor automatically called from kfree_skb .
2005-04-17 02:20:36 +04:00
*/
void sock_rfree ( struct sk_buff * skb )
{
struct sock * sk = skb - > sk ;
2010-07-11 02:45:17 +04:00
unsigned int len = skb - > truesize ;
2005-04-17 02:20:36 +04:00
2010-07-11 02:45:17 +04:00
atomic_sub ( len , & sk - > sk_rmem_alloc ) ;
sk_mem_uncharge ( sk , len ) ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_rfree ) ;
2005-04-17 02:20:36 +04:00
2012-06-20 08:22:05 +04:00
void sock_edemux ( struct sk_buff * skb )
{
2012-09-03 03:57:18 +04:00
struct sock * sk = skb - > sk ;
2012-09-10 20:13:07 +04:00
# ifdef CONFIG_INET
2012-09-03 03:57:18 +04:00
if ( sk - > sk_state = = TCP_TIME_WAIT )
inet_twsk_put ( inet_twsk ( sk ) ) ;
else
2012-09-10 20:13:07 +04:00
# endif
2012-09-03 03:57:18 +04:00
sock_put ( sk ) ;
2012-06-20 08:22:05 +04:00
}
EXPORT_SYMBOL ( sock_edemux ) ;
2005-04-17 02:20:36 +04:00
2012-05-24 03:16:53 +04:00
kuid_t sock_i_uid ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
2012-05-24 03:16:53 +04:00
kuid_t uid ;
2005-04-17 02:20:36 +04:00
2010-09-22 16:43:39 +04:00
read_lock_bh ( & sk - > sk_callback_lock ) ;
2012-05-24 03:16:53 +04:00
uid = sk - > sk_socket ? SOCK_INODE ( sk - > sk_socket ) - > i_uid : GLOBAL_ROOT_UID ;
2010-09-22 16:43:39 +04:00
read_unlock_bh ( & sk - > sk_callback_lock ) ;
2005-04-17 02:20:36 +04:00
return uid ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_i_uid ) ;
2005-04-17 02:20:36 +04:00
unsigned long sock_i_ino ( struct sock * sk )
{
unsigned long ino ;
2010-09-22 16:43:39 +04:00
read_lock_bh ( & sk - > sk_callback_lock ) ;
2005-04-17 02:20:36 +04:00
ino = sk - > sk_socket ? SOCK_INODE ( sk - > sk_socket ) - > i_ino : 0 ;
2010-09-22 16:43:39 +04:00
read_unlock_bh ( & sk - > sk_callback_lock ) ;
2005-04-17 02:20:36 +04:00
return ino ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_i_ino ) ;
2005-04-17 02:20:36 +04:00
/*
* Allocate a skb from the socket ' s send buffer .
*/
2005-07-09 01:57:47 +04:00
struct sk_buff * sock_wmalloc ( struct sock * sk , unsigned long size , int force ,
2005-10-07 10:46:04 +04:00
gfp_t priority )
2005-04-17 02:20:36 +04:00
{
if ( force | | atomic_read ( & sk - > sk_wmem_alloc ) < sk - > sk_sndbuf ) {
2009-05-27 15:30:05 +04:00
struct sk_buff * skb = alloc_skb ( size , priority ) ;
2005-04-17 02:20:36 +04:00
if ( skb ) {
skb_set_owner_w ( skb , sk ) ;
return skb ;
}
}
return NULL ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_wmalloc ) ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:36 +03:00
/*
2005-04-17 02:20:36 +04:00
* Allocate a memory block from the socket ' s option memory buffer .
2007-02-09 17:24:36 +03:00
*/
2005-10-07 10:46:04 +04:00
void * sock_kmalloc ( struct sock * sk , int size , gfp_t priority )
2005-04-17 02:20:36 +04:00
{
2012-04-15 09:58:06 +04:00
if ( ( unsigned int ) size < = sysctl_optmem_max & &
2005-04-17 02:20:36 +04:00
atomic_read ( & sk - > sk_omem_alloc ) + size < sysctl_optmem_max ) {
void * mem ;
/* First do the add, to avoid the race if kmalloc
2007-02-09 17:24:36 +03:00
* might sleep .
2005-04-17 02:20:36 +04:00
*/
atomic_add ( size , & sk - > sk_omem_alloc ) ;
mem = kmalloc ( size , priority ) ;
if ( mem )
return mem ;
atomic_sub ( size , & sk - > sk_omem_alloc ) ;
}
return NULL ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_kmalloc ) ;
2005-04-17 02:20:36 +04:00
/*
* Free an option memory block .
*/
void sock_kfree_s ( struct sock * sk , void * mem , int size )
{
kfree ( mem ) ;
atomic_sub ( size , & sk - > sk_omem_alloc ) ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_kfree_s ) ;
2005-04-17 02:20:36 +04:00
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
I think , these locks should be removed for datagram sockets .
*/
2009-05-27 15:30:05 +04:00
static long sock_wait_for_wmem ( struct sock * sk , long timeo )
2005-04-17 02:20:36 +04:00
{
DEFINE_WAIT ( wait ) ;
clear_bit ( SOCK_ASYNC_NOSPACE , & sk - > sk_socket - > flags ) ;
for ( ; ; ) {
if ( ! timeo )
break ;
if ( signal_pending ( current ) )
break ;
set_bit ( SOCK_NOSPACE , & sk - > sk_socket - > flags ) ;
2010-04-20 17:03:51 +04:00
prepare_to_wait ( sk_sleep ( sk ) , & wait , TASK_INTERRUPTIBLE ) ;
2005-04-17 02:20:36 +04:00
if ( atomic_read ( & sk - > sk_wmem_alloc ) < sk - > sk_sndbuf )
break ;
if ( sk - > sk_shutdown & SEND_SHUTDOWN )
break ;
if ( sk - > sk_err )
break ;
timeo = schedule_timeout ( timeo ) ;
}
2010-04-20 17:03:51 +04:00
finish_wait ( sk_sleep ( sk ) , & wait ) ;
2005-04-17 02:20:36 +04:00
return timeo ;
}
/*
* Generic send / receive buffer handlers
*/
2009-02-05 03:55:54 +03:00
struct sk_buff * sock_alloc_send_pskb ( struct sock * sk , unsigned long header_len ,
unsigned long data_len , int noblock ,
2013-08-09 01:38:47 +04:00
int * errcode , int max_page_order )
2005-04-17 02:20:36 +04:00
{
2013-08-09 01:38:47 +04:00
struct sk_buff * skb = NULL ;
unsigned long chunk ;
2005-10-21 11:20:43 +04:00
gfp_t gfp_mask ;
2005-04-17 02:20:36 +04:00
long timeo ;
int err ;
2012-05-31 01:18:10 +04:00
int npages = ( data_len + ( PAGE_SIZE - 1 ) ) > > PAGE_SHIFT ;
2013-08-09 01:38:47 +04:00
struct page * page ;
int i ;
2012-05-31 01:18:10 +04:00
err = - EMSGSIZE ;
if ( npages > MAX_SKB_FRAGS )
goto failure ;
2005-04-17 02:20:36 +04:00
timeo = sock_sndtimeo ( sk , noblock ) ;
2013-08-09 01:38:47 +04:00
while ( ! skb ) {
2005-04-17 02:20:36 +04:00
err = sock_error ( sk ) ;
if ( err ! = 0 )
goto failure ;
err = - EPIPE ;
if ( sk - > sk_shutdown & SEND_SHUTDOWN )
goto failure ;
2013-08-09 01:38:47 +04:00
if ( atomic_read ( & sk - > sk_wmem_alloc ) > = sk - > sk_sndbuf ) {
set_bit ( SOCK_ASYNC_NOSPACE , & sk - > sk_socket - > flags ) ;
set_bit ( SOCK_NOSPACE , & sk - > sk_socket - > flags ) ;
err = - EAGAIN ;
if ( ! timeo )
goto failure ;
if ( signal_pending ( current ) )
goto interrupted ;
timeo = sock_wait_for_wmem ( sk , timeo ) ;
continue ;
}
2005-04-17 02:20:36 +04:00
2013-08-09 01:38:47 +04:00
err = - ENOBUFS ;
gfp_mask = sk - > sk_allocation ;
if ( gfp_mask & __GFP_WAIT )
gfp_mask | = __GFP_REPEAT ;
skb = alloc_skb ( header_len , gfp_mask ) ;
if ( ! skb )
2005-04-17 02:20:36 +04:00
goto failure ;
2013-08-09 01:38:47 +04:00
skb - > truesize + = data_len ;
for ( i = 0 ; npages > 0 ; i + + ) {
int order = max_page_order ;
while ( order ) {
if ( npages > = 1 < < order ) {
page = alloc_pages ( sk - > sk_allocation |
net: use __GFP_NORETRY for high order allocations
sock_alloc_send_pskb() & sk_page_frag_refill()
have a loop trying high order allocations to prepare
skb with low number of fragments as this increases performance.
Problem is that under memory pressure/fragmentation, this can
trigger OOM while the intent was only to try the high order
allocations, then fallback to order-0 allocations.
We had various reports from unexpected regressions.
According to David, setting __GFP_NORETRY should be fine,
as the asynchronous compaction is still enabled, and this
will prevent OOM from kicking as in :
CFSClientEventm invoked oom-killer: gfp_mask=0x42d0, order=3, oom_adj=0,
oom_score_adj=0, oom_score_badness=2 (enabled),memcg_scoring=disabled
CFSClientEventm
Call Trace:
[<ffffffff8043766c>] dump_header+0xe1/0x23e
[<ffffffff80437a02>] oom_kill_process+0x6a/0x323
[<ffffffff80438443>] out_of_memory+0x4b3/0x50d
[<ffffffff8043a4a6>] __alloc_pages_may_oom+0xa2/0xc7
[<ffffffff80236f42>] __alloc_pages_nodemask+0x1002/0x17f0
[<ffffffff8024bd23>] alloc_pages_current+0x103/0x2b0
[<ffffffff8028567f>] sk_page_frag_refill+0x8f/0x160
[<ffffffff80295fa0>] tcp_sendmsg+0x560/0xee0
[<ffffffff802a5037>] inet_sendmsg+0x67/0x100
[<ffffffff80283c9c>] __sock_sendmsg_nosec+0x6c/0x90
[<ffffffff80283e85>] sock_sendmsg+0xc5/0xf0
[<ffffffff802847b6>] __sys_sendmsg+0x136/0x430
[<ffffffff80284ec8>] sys_sendmsg+0x88/0x110
[<ffffffff80711472>] system_call_fastpath+0x16/0x1b
Out of Memory: Kill process 2856 (bash) score 9999 or sacrifice child
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-06 22:42:42 +04:00
__GFP_COMP |
__GFP_NOWARN |
__GFP_NORETRY ,
2013-08-09 01:38:47 +04:00
order ) ;
if ( page )
goto fill_page ;
}
order - - ;
}
page = alloc_page ( sk - > sk_allocation ) ;
if ( ! page )
goto failure ;
fill_page :
chunk = min_t ( unsigned long , data_len ,
PAGE_SIZE < < order ) ;
skb_fill_page_desc ( skb , i , page , 0 , chunk ) ;
data_len - = chunk ;
npages - = 1 < < order ;
2005-04-17 02:20:36 +04:00
}
}
skb_set_owner_w ( skb , sk ) ;
return skb ;
interrupted :
err = sock_intr_errno ( timeo ) ;
failure :
2013-08-09 01:38:47 +04:00
kfree_skb ( skb ) ;
2005-04-17 02:20:36 +04:00
* errcode = err ;
return NULL ;
}
2009-02-05 03:55:54 +03:00
EXPORT_SYMBOL ( sock_alloc_send_pskb ) ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:36 +03:00
/*
 * Allocate a purely linear send skb of @size bytes for @sk: this is
 * sock_alloc_send_pskb() with no paged data and a maximum page order of 0.
 * Error reporting through *@errcode is that of sock_alloc_send_pskb().
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
	return skb;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_alloc_send_skb ) ;
2005-04-17 02:20:36 +04:00
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
/* On 32bit arches, an skb frag is limited to 2^15 */
/* get_order(32768) is the page order covering a 32768-byte (2^15) fragment;
 * skb_page_frag_refill() loads this value into its 'order' variable.
 */
# define SKB_FRAG_PAGE_ORDER get_order(32768)
2013-10-18 03:27:07 +04:00
/**
* skb_page_frag_refill - check that a page_frag contains enough room
* @ sz : minimum size of the fragment we want to get
* @ pfrag : pointer to page_frag
* @ prio : priority for memory allocation
*
* Note : While this allocator tries to use high order pages , there is
* no guarantee that allocations succeed . Therefore , @ sz MUST be
* less or equal than PAGE_SIZE .
*/
bool skb_page_frag_refill ( unsigned int sz , struct page_frag * pfrag , gfp_t prio )
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
{
int order ;
/* A page is already cached in pfrag: try to keep using it. */
if ( pfrag - > page ) {
/*
 * The page refcount is exactly 1: rewind to offset 0 and reuse the
 * cached page instead of allocating a new one.
 */
if ( atomic_read ( & pfrag - > page - > _count ) = = 1 ) {
pfrag - > offset = 0 ;
return true ;
}
2013-10-18 03:27:07 +04:00
/* At least sz bytes are still free past the current offset. */
if ( pfrag - > offset + sz < = pfrag - > size )
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
return true ;
/* Not enough room left: drop our reference and allocate a new page below. */
put_page ( pfrag - > page ) ;
}
2014-01-17 10:23:25 +04:00
/* Start at the highest order and step down one order per failed attempt. */
order = SKB_FRAG_PAGE_ORDER ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
do {
2013-10-18 03:27:07 +04:00
gfp_t gfp = prio ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
/*
 * Attempts with order > 0 add __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY
 * to the caller's flags; the final order-0 attempt uses prio unchanged.
 */
if ( order )
net: use __GFP_NORETRY for high order allocations
sock_alloc_send_pskb() & sk_page_frag_refill()
have a loop trying high order allocations to prepare
skb with low number of fragments as this increases performance.
Problem is that under memory pressure/fragmentation, this can
trigger OOM while the intent was only to try the high order
allocations, then fallback to order-0 allocations.
We had various reports from unexpected regressions.
According to David, setting __GFP_NORETRY should be fine,
as the asynchronous compaction is still enabled, and this
will prevent OOM from kicking as in :
CFSClientEventm invoked oom-killer: gfp_mask=0x42d0, order=3, oom_adj=0,
oom_score_adj=0, oom_score_badness=2 (enabled),memcg_scoring=disabled
CFSClientEventm
Call Trace:
[<ffffffff8043766c>] dump_header+0xe1/0x23e
[<ffffffff80437a02>] oom_kill_process+0x6a/0x323
[<ffffffff80438443>] out_of_memory+0x4b3/0x50d
[<ffffffff8043a4a6>] __alloc_pages_may_oom+0xa2/0xc7
[<ffffffff80236f42>] __alloc_pages_nodemask+0x1002/0x17f0
[<ffffffff8024bd23>] alloc_pages_current+0x103/0x2b0
[<ffffffff8028567f>] sk_page_frag_refill+0x8f/0x160
[<ffffffff80295fa0>] tcp_sendmsg+0x560/0xee0
[<ffffffff802a5037>] inet_sendmsg+0x67/0x100
[<ffffffff80283c9c>] __sock_sendmsg_nosec+0x6c/0x90
[<ffffffff80283e85>] sock_sendmsg+0xc5/0xf0
[<ffffffff802847b6>] __sys_sendmsg+0x136/0x430
[<ffffffff80284ec8>] sys_sendmsg+0x88/0x110
[<ffffffff80711472>] system_call_fastpath+0x16/0x1b
Out of Memory: Kill process 2856 (bash) score 9999 or sacrifice child
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-06 22:42:42 +04:00
gfp | = __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
pfrag - > page = alloc_pages ( gfp , order ) ;
if ( likely ( pfrag - > page ) ) {
/* Fresh page: record its byte size for this order and start at offset 0. */
pfrag - > offset = 0 ;
pfrag - > size = PAGE_SIZE < < order ;
return true ;
}
} while ( - - order > = 0 ) ;
2013-10-18 03:27:07 +04:00
/* Every order down to 0 failed; pfrag->page is NULL at this point. */
return false ;
}
EXPORT_SYMBOL ( skb_page_frag_refill ) ;
/*
 * sk_page_frag_refill - socket-level wrapper around skb_page_frag_refill().
 *
 * Asks for at least 32 bytes of room in pfrag, allocating with
 * sk->sk_allocation. Returns true on success. On failure it calls
 * sk_enter_memory_pressure() and sk_stream_moderate_sndbuf() on the
 * socket and returns false.
 */
bool sk_page_frag_refill ( struct sock * sk , struct page_frag * pfrag )
{
if ( likely ( skb_page_frag_refill ( 32U , pfrag , sk - > sk_allocation ) ) )
return true ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
/* Allocation failed: flag memory pressure and moderate the send buffer. */
sk_enter_memory_pressure ( sk ) ;
sk_stream_moderate_sndbuf ( sk ) ;
return false ;
}
EXPORT_SYMBOL ( sk_page_frag_refill ) ;
2005-04-17 02:20:36 +04:00
/*
 * __lock_sock - sleep until the socket is no longer owned by a user context.
 *
 * Per the __releases/__acquires annotations, sk->sk_lock.slock is held on
 * entry and on return, and is dropped only around schedule().
 */
static void __lock_sock ( struct sock * sk )
2010-09-08 07:48:48 +04:00
__releases ( & sk - > sk_lock . slock )
__acquires ( & sk - > sk_lock . slock )
2005-04-17 02:20:36 +04:00
{
DEFINE_WAIT ( wait ) ;
2007-04-11 07:10:33 +04:00
for ( ; ; ) {
2005-04-17 02:20:36 +04:00
/*
 * Register as an exclusive, uninterruptible waiter before dropping
 * the spinlock, then sleep.
 */
prepare_to_wait_exclusive ( & sk - > sk_lock . wq , & wait ,
TASK_UNINTERRUPTIBLE ) ;
spin_unlock_bh ( & sk - > sk_lock . slock ) ;
schedule ( ) ;
spin_lock_bh ( & sk - > sk_lock . slock ) ;
2007-04-11 07:10:33 +04:00
/* Re-check ownership with the spinlock held again. */
if ( ! sock_owned_by_user ( sk ) )
2005-04-17 02:20:36 +04:00
break ;
}
finish_wait ( & sk - > sk_lock . wq , & wait ) ;
}
/*
 * __release_sock - run the receive handler on every skb in the socket backlog.
 *
 * Each outer pass detaches the whole backlog list, drops the socket bh-lock,
 * feeds every detached skb to sk_backlog_rcv(), then re-takes the lock and
 * checks whether new skbs were queued in the meantime.
 *
 * The inner loop dereferences skb->next unconditionally, so
 * sk->sk_backlog.head must be non-NULL on entry.
 */
static void __release_sock ( struct sock * sk )
2010-09-08 07:48:48 +04:00
__releases ( & sk - > sk_lock . slock )
__acquires ( & sk - > sk_lock . slock )
2005-04-17 02:20:36 +04:00
{
struct sk_buff * skb = sk - > sk_backlog . head ;
do {
/* Take the current list private; later arrivals start a fresh list. */
sk - > sk_backlog . head = sk - > sk_backlog . tail = NULL ;
bh_unlock_sock ( sk ) ;
do {
/* Save the successor first: skb->next is cleared before the skb is handed off. */
struct sk_buff * next = skb - > next ;
2012-04-30 20:07:09 +04:00
prefetch ( next ) ;
2010-05-12 03:19:48 +04:00
WARN_ON_ONCE ( skb_dst_is_noref ( skb ) ) ;
2005-04-17 02:20:36 +04:00
skb - > next = NULL ;
2008-10-08 01:18:42 +04:00
sk_backlog_rcv ( sk , skb ) ;
2005-04-17 02:20:36 +04:00
/*
* We are in process context here with softirqs
* disabled , use cond_resched_softirq ( ) to preempt .
* This is safe to do because we ' ve taken the backlog
* queue private :
*/
cond_resched_softirq ( ) ;
skb = next ;
} while ( skb ! = NULL ) ;
bh_lock_sock ( sk ) ;
2007-04-11 07:10:33 +04:00
} while ( ( skb = sk - > sk_backlog . head ) ! = NULL ) ;
2010-03-04 21:01:40 +03:00
/*
* Doing the zeroing here guarantees we cannot loop forever
* while a wild producer attempts to flood us .
*/
sk - > sk_backlog . len = 0 ;
2005-04-17 02:20:36 +04:00
}
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
2005-05-01 19:59:25 +04:00
* @ sk : sock to wait on
* @ timeo : for how long
2005-04-17 02:20:36 +04:00
*
* Now socket state including sk - > sk_err is changed only under lock ,
* hence we may omit checks after joining wait queue .
* We check receive queue before schedule ( ) only as optimization ;
* it is very likely that release_sock ( ) added new data .
*/
int sk_wait_data ( struct sock * sk , long * timeo )
{
/* rc carries the result of sk_wait_event() back to the caller. */
int rc ;
DEFINE_WAIT ( wait ) ;
2010-04-20 17:03:51 +04:00
/* Join the socket's sleep queue as an interruptible waiter. */
prepare_to_wait ( sk_sleep ( sk ) , & wait , TASK_INTERRUPTIBLE ) ;
2005-04-17 02:20:36 +04:00
/* SOCK_ASYNC_WAITDATA is set only for the duration of the wait. */
set_bit ( SOCK_ASYNC_WAITDATA , & sk - > sk_socket - > flags ) ;
/* Wait condition: the receive queue is no longer empty. */
rc = sk_wait_event ( sk , timeo , ! skb_queue_empty ( & sk - > sk_receive_queue ) ) ;
clear_bit ( SOCK_ASYNC_WAITDATA , & sk - > sk_socket - > flags ) ;
2010-04-20 17:03:51 +04:00
finish_wait ( sk_sleep ( sk ) , & wait ) ;
2005-04-17 02:20:36 +04:00
return rc ;
}
EXPORT_SYMBOL ( sk_wait_data ) ;
2007-12-31 11:11:19 +03:00
/**
* __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
* @ sk : socket
* @ size : memory size to allocate
* @ kind : allocation type
*
* If kind is SK_MEM_SEND , it means wmem allocation . Otherwise it means
* rmem allocation . This function assumes that protocols which have
* memory_pressure use sk_wmem_queued as write buffer accounting .
*/
/*
 * Returns 1 when the charge is accepted and 0 when it is refused.
 * On refusal the additions to sk_forward_alloc and to the protocol's
 * allocated-memory count are undone before returning.
 */
int __sk_mem_schedule ( struct sock * sk , int size , int kind )
{
struct proto * prot = sk - > sk_prot ;
/* amt is the request expressed in sk_mem_pages() units. */
int amt = sk_mem_pages ( size ) ;
2010-11-10 02:24:26 +03:00
long allocated ;
2011-12-12 01:47:03 +04:00
int parent_status = UNDER_LIMIT ;
2007-12-31 11:11:19 +03:00
/* Charge optimistically; the failure path at the end undoes this. */
sk - > sk_forward_alloc + = amt * SK_MEM_QUANTUM ;
2011-12-12 01:47:02 +04:00
2011-12-12 01:47:03 +04:00
allocated = sk_memory_allocated_add ( sk , amt , & parent_status ) ;
2007-12-31 11:11:19 +03:00
/* Under limit. */
2011-12-12 01:47:03 +04:00
if ( parent_status = = UNDER_LIMIT & &
allocated < = sk_prot_mem_limits ( sk , 0 ) ) {
2011-12-12 01:47:02 +04:00
sk_leave_memory_pressure ( sk ) ;
2007-12-31 11:11:19 +03:00
return 1 ;
}
2011-12-12 01:47:03 +04:00
/* Under pressure. (we or our parents) */
if ( ( parent_status > SOFT_LIMIT ) | |
allocated > sk_prot_mem_limits ( sk , 1 ) )
2011-12-12 01:47:02 +04:00
sk_enter_memory_pressure ( sk ) ;
2007-12-31 11:11:19 +03:00
2011-12-12 01:47:03 +04:00
/* Over hard limit (we or our parents) */
if ( ( parent_status = = OVER_LIMIT ) | |
( allocated > sk_prot_mem_limits ( sk , 2 ) ) )
2007-12-31 11:11:19 +03:00
goto suppress_allocation ;
/* guarantee minimum buffer size under pressure */
if ( kind = = SK_MEM_RECV ) {
if ( atomic_read ( & sk - > sk_rmem_alloc ) < prot - > sysctl_rmem [ 0 ] )
return 1 ;
2011-12-12 01:47:02 +04:00
2007-12-31 11:11:19 +03:00
} else { /* SK_MEM_SEND */
/* Stream sockets are measured by sk_wmem_queued, all others by sk_wmem_alloc. */
if ( sk - > sk_type = = SOCK_STREAM ) {
if ( sk - > sk_wmem_queued < prot - > sysctl_wmem [ 0 ] )
return 1 ;
} else if ( atomic_read ( & sk - > sk_wmem_alloc ) <
prot - > sysctl_wmem [ 0 ] )
return 1 ;
}
2011-12-12 01:47:02 +04:00
if ( sk_has_memory_pressure ( sk ) ) {
2008-11-26 08:16:35 +03:00
int alloc ;
2011-12-12 01:47:02 +04:00
if ( ! sk_under_memory_pressure ( sk ) )
2008-11-26 08:16:35 +03:00
return 1 ;
2011-12-12 01:47:02 +04:00
/*
 * Accept if the hard limit exceeds (socket count) x (pages this socket
 * accounts for: queued write + receive + forward alloc).
 */
alloc = sk_sockets_allocated_read_positive ( sk ) ;
if ( sk_prot_mem_limits ( sk , 2 ) > alloc *
2007-12-31 11:11:19 +03:00
sk_mem_pages ( sk - > sk_wmem_queued +
atomic_read ( & sk - > sk_rmem_alloc ) +
sk - > sk_forward_alloc ) )
return 1 ;
}
suppress_allocation :
if ( kind = = SK_MEM_SEND & & sk - > sk_type = = SOCK_STREAM ) {
sk_stream_moderate_sndbuf ( sk ) ;
/* Fail only if socket is _under_ its sndbuf.
* In this case we cannot block , so that we have to fail .
*/
/* The charge is kept (return 1) once queued + size reaches sk_sndbuf. */
if ( sk - > sk_wmem_queued + size > = sk - > sk_sndbuf )
return 1 ;
}
2011-06-17 16:00:03 +04:00
trace_sock_exceed_buf_limit ( sk , prot , allocated ) ;
2007-12-31 11:11:19 +03:00
/* Alas. Undo changes. */
sk - > sk_forward_alloc - = amt * SK_MEM_QUANTUM ;
2011-12-12 01:47:02 +04:00
net: introduce res_counter_charge_nofail() for socket allocations
There is a case in __sk_mem_schedule(), where an allocation
is beyond the maximum, but yet we are allowed to proceed.
It happens under the following condition:
sk->sk_wmem_queued + size >= sk->sk_sndbuf
The network code won't revert the allocation in this case,
meaning that at some point later it'll try to do it. Since
this is never communicated to the underlying res_counter
code, there is an inbalance in res_counter uncharge operation.
I see two ways of fixing this:
1) storing the information about those allocations somewhere
in memcg, and then deducting from that first, before
we start draining the res_counter,
2) providing a slightly different allocation function for
the res_counter, that matches the original behavior of
the network code more closely.
I decided to go for #2 here, believing it to be more elegant,
since #1 would require us to do basically that, but in a more
obscure way.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizf@cn.fujitsu.com>
CC: Laurent Chavey <chavey@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-01-20 08:57:16 +04:00
sk_memory_allocated_sub ( sk , amt ) ;
2011-12-12 01:47:02 +04:00
2007-12-31 11:11:19 +03:00
return 0 ;
}
EXPORT_SYMBOL ( __sk_mem_schedule ) ;
/**
* __sk_mem_reclaim - reclaim memory_allocated
* @ sk : socket
*/
void __sk_mem_reclaim ( struct sock * sk )
{
/*
 * Give back sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT units to the
 * protocol's allocated-memory count, then keep only the remainder in
 * sk_forward_alloc.
 * NOTE(review): the mask below assumes SK_MEM_QUANTUM is the power of
 * two 1 << SK_MEM_QUANTUM_SHIFT; confirm against the header.
 */
2011-12-12 01:47:02 +04:00
sk_memory_allocated_sub ( sk ,
net: introduce res_counter_charge_nofail() for socket allocations
There is a case in __sk_mem_schedule(), where an allocation
is beyond the maximum, but yet we are allowed to proceed.
It happens under the following condition:
sk->sk_wmem_queued + size >= sk->sk_sndbuf
The network code won't revert the allocation in this case,
meaning that at some point later it'll try to do it. Since
this is never communicated to the underlying res_counter
code, there is an inbalance in res_counter uncharge operation.
I see two ways of fixing this:
1) storing the information about those allocations somewhere
in memcg, and then deducting from that first, before
we start draining the res_counter,
2) providing a slightly different allocation function for
the res_counter, that matches the original behavior of
the network code more closely.
I decided to go for #2 here, believing it to be more elegant,
since #1 would require us to do basically that, but in a more
obscure way.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizf@cn.fujitsu.com>
CC: Laurent Chavey <chavey@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-01-20 08:57:16 +04:00
sk - > sk_forward_alloc > > SK_MEM_QUANTUM_SHIFT ) ;
2007-12-31 11:11:19 +03:00
sk - > sk_forward_alloc & = SK_MEM_QUANTUM - 1 ;
2011-12-12 01:47:02 +04:00
/* Leave memory pressure once the total is below limit index 0. */
if ( sk_under_memory_pressure ( sk ) & &
( sk_memory_allocated ( sk ) < sk_prot_mem_limits ( sk , 0 ) ) )
sk_leave_memory_pressure ( sk ) ;
2007-12-31 11:11:19 +03:00
}
EXPORT_SYMBOL ( __sk_mem_reclaim ) ;
2005-04-17 02:20:36 +04:00
/*
* Set of default routines for initialising struct proto_ops when
* the protocol does not support a particular function . In certain
* cases where it makes no sense for a protocol to have a " do nothing "
* function , some default processing is provided .
*/
/*
 * sock_no_bind - bind() stub for protocols that do not support binding.
 *
 * No argument is examined; the result is always -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_bind ) ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:36 +03:00
/*
 * sock_no_connect - connect() stub for protocols without connect support.
 * Ignores all arguments and always returns -EOPNOTSUPP.
 */
int sock_no_connect ( struct socket * sock , struct sockaddr * saddr ,
2005-04-17 02:20:36 +04:00
int len , int flags )
{
return - EOPNOTSUPP ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_connect ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_socketpair - socketpair() stub for protocols that cannot pair
 * sockets.
 *
 * Neither socket is touched; the result is always -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_socketpair ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_accept - accept() stub for protocols that do not accept
 * connections.
 *
 * No argument is examined; the result is always -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_accept ) ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:36 +03:00
/*
 * sock_no_getname - getname stub for protocols without address lookup.
 * Ignores all arguments (nothing is written through saddr or len) and
 * always returns -EOPNOTSUPP.
 */
int sock_no_getname ( struct socket * sock , struct sockaddr * saddr ,
2005-04-17 02:20:36 +04:00
int * len , int peer )
{
return - EOPNOTSUPP ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_getname ) ;
2005-04-17 02:20:36 +04:00
2009-05-27 15:30:05 +04:00
/*
 * sock_no_poll - poll stub: ignores all arguments and always returns 0,
 * i.e. an empty event mask.
 */
unsigned int sock_no_poll ( struct file * file , struct socket * sock , poll_table * pt )
2005-04-17 02:20:36 +04:00
{
return 0 ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_poll ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_ioctl - ioctl() stub for protocols with no ioctl handling.
 *
 * The command and argument are ignored; the result is always -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_ioctl ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_listen - listen() stub for protocols that cannot listen.
 *
 * The backlog value is ignored; the result is always -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_listen ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_shutdown - shutdown() stub for protocols without shutdown
 * support.
 *
 * The direction argument is ignored; the result is always -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_shutdown ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_setsockopt - setsockopt() stub for protocols with no settable
 * options. Ignores all arguments (optval is never read) and always
 * returns -EOPNOTSUPP.
 */
int sock_no_setsockopt ( struct socket * sock , int level , int optname ,
2009-10-01 03:12:20 +04:00
char __user * optval , unsigned int optlen )
2005-04-17 02:20:36 +04:00
{
return - EOPNOTSUPP ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_setsockopt ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_getsockopt - getsockopt() stub for protocols with no readable
 * options.
 *
 * Nothing is written through optval or optlen; the result is always
 * -EOPNOTSUPP.
 */
int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_getsockopt ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_sendmsg - sendmsg() stub for protocols that cannot send.
 *
 * The message is not inspected; the result is always -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_sendmsg ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_recvmsg - recvmsg() stub for protocols that cannot receive.
 *
 * The message buffer is left untouched; the result is always -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_recvmsg ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_no_mmap - mmap() stub for protocols that cannot be memory-mapped.
 *
 * Always returns -ENODEV, the same error code that is reported when an
 * mmap method is missing altogether.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_mmap ) ;
2005-04-17 02:20:36 +04:00
ssize_t sock_no_sendpage ( struct socket * sock , struct page * page , int offset , size_t size , int flags )
{
ssize_t res ;
struct msghdr msg = { . msg_flags = flags } ;
struct kvec iov ;
char * kaddr = kmap ( page ) ;
iov . iov_base = kaddr + offset ;
iov . iov_len = size ;
res = kernel_sendmsg ( sock , & msg , & iov , 1 , size ) ;
kunmap ( page ) ;
return res ;
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_no_sendpage ) ;
2005-04-17 02:20:36 +04:00
/*
* Default Socket Callbacks
*/
/*
 * sock_def_wakeup - wake every interruptible sleeper on the socket's wait
 * queue.
 *
 * sk->sk_wq is read under rcu_read_lock(), and the wake-up is skipped
 * when wq_has_sleeper() reports no waiter.
 */
static void sock_def_wakeup ( struct sock * sk )
{
2010-04-29 15:01:49 +04:00
struct socket_wq * wq ;
rcu_read_lock ( ) ;
wq = rcu_dereference ( sk - > sk_wq ) ;
if ( wq_has_sleeper ( wq ) )
wake_up_interruptible_all ( & wq - > wait ) ;
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * sock_def_error_report - notify waiters that the socket has an error.
 *
 * Under rcu_read_lock(): wakes pollers with POLLERR if anyone is sleeping
 * on sk->sk_wq, then always calls sk_wake_async() with SOCK_WAKE_IO and
 * POLL_ERR.
 */
static void sock_def_error_report ( struct sock * sk )
{
2010-04-29 15:01:49 +04:00
struct socket_wq * wq ;
rcu_read_lock ( ) ;
wq = rcu_dereference ( sk - > sk_wq ) ;
if ( wq_has_sleeper ( wq ) )
wake_up_interruptible_poll ( & wq - > wait , POLLERR ) ;
2007-11-26 15:10:50 +03:00
/* Done whether or not a sleeper was found. */
sk_wake_async ( sk , SOCK_WAKE_IO , POLL_ERR ) ;
2010-04-29 15:01:49 +04:00
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
2014-04-12 00:15:36 +04:00
/*
 * sock_def_readable - notify waiters that the socket is readable.
 *
 * Under rcu_read_lock(): wakes pollers with
 * POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND if anyone is sleeping on
 * sk->sk_wq, then always calls sk_wake_async() with SOCK_WAKE_WAITD and
 * POLL_IN.
 */
static void sock_def_readable ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
2010-04-29 15:01:49 +04:00
struct socket_wq * wq ;
rcu_read_lock ( ) ;
wq = rcu_dereference ( sk - > sk_wq ) ;
if ( wq_has_sleeper ( wq ) )
2011-01-06 21:54:29 +03:00
wake_up_interruptible_sync_poll ( & wq - > wait , POLLIN | POLLPRI |
2009-04-01 02:24:21 +04:00
POLLRDNORM | POLLRDBAND ) ;
2007-11-26 15:10:50 +03:00
/* Done whether or not a sleeper was found. */
sk_wake_async ( sk , SOCK_WAKE_WAITD , POLL_IN ) ;
2010-04-29 15:01:49 +04:00
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * sock_def_write_space - notify waiters that send-buffer space is available.
 *
 * Nothing is woken unless twice sk_wmem_alloc is at most sk_sndbuf.
 */
static void sock_def_write_space ( struct sock * sk )
{
2010-04-29 15:01:49 +04:00
struct socket_wq * wq ;
rcu_read_lock ( ) ;
2005-04-17 02:20:36 +04:00
/* Do not wake up a writer until he can make "significant"
* progress . - - DaveM
*/
2007-04-11 07:10:33 +04:00
/* True when sk_wmem_alloc is at most half of sk_sndbuf. */
if ( ( atomic_read ( & sk - > sk_wmem_alloc ) < < 1 ) < = sk - > sk_sndbuf ) {
2010-04-29 15:01:49 +04:00
wq = rcu_dereference ( sk - > sk_wq ) ;
if ( wq_has_sleeper ( wq ) )
wake_up_interruptible_sync_poll ( & wq - > wait , POLLOUT |
2009-04-01 02:24:21 +04:00
POLLWRNORM | POLLWRBAND ) ;
2005-04-17 02:20:36 +04:00
/* Should agree with poll, otherwise some programs break */
if ( sock_writeable ( sk ) )
2007-11-26 15:10:50 +03:00
sk_wake_async ( sk , SOCK_WAKE_SPACE , POLL_OUT ) ;
2005-04-17 02:20:36 +04:00
}
2010-04-29 15:01:49 +04:00
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
/* sock_def_destruct - frees sk->sk_protinfo with kfree() and does nothing else. */
static void sock_def_destruct ( struct sock * sk )
{
2005-11-08 20:41:34 +03:00
kfree ( sk - > sk_protinfo ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * sk_send_sigurg - send SIGURG to the owner of the socket's file.
 *
 * Does nothing unless the sock has both a struct socket and a file.
 * When send_sigurg() returns non-zero, it also calls sk_wake_async()
 * with SOCK_WAKE_URG and POLL_PRI.
 */
void sk_send_sigurg ( struct sock * sk )
{
if ( sk - > sk_socket & & sk - > sk_socket - > file )
if ( send_sigurg ( & sk - > sk_socket - > file - > f_owner ) )
2007-11-26 15:10:50 +03:00
sk_wake_async ( sk , SOCK_WAKE_URG , POLL_PRI ) ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sk_send_sigurg ) ;
2005-04-17 02:20:36 +04:00
/*
 * sk_reset_timer - (re)arm a socket timer to fire at expires.
 *
 * Takes a reference on sk with sock_hold() only when mod_timer()
 * returns zero.
 * NOTE(review): per the kernel timer API, a zero return from mod_timer()
 * means the timer was not pending beforehand, so each pending timer holds
 * exactly one socket reference; confirm against the timer documentation.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (was_pending)
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL ( sk_reset_timer ) ;
/*
 * sk_stop_timer - cancel a socket timer.
 *
 * When del_timer() returns non-zero, calls __sock_put() on sk.
 * NOTE(review): presumably this drops the reference that was taken when
 * the timer was armed; confirm against the arming path.
 */
void sk_stop_timer ( struct sock * sk , struct timer_list * timer )
{
2013-02-04 00:32:57 +04:00
if ( del_timer ( timer ) )
2005-04-17 02:20:36 +04:00
__sock_put ( sk ) ;
}
EXPORT_SYMBOL ( sk_stop_timer ) ;
/*
 * sock_init_data - set the generic fields of a struct sock to their defaults.
 *
 * sock may be NULL. If it is not, sk is linked to it: sk_type and sk_wq
 * are copied from sock, and sock->sk is pointed at sk.
 * On return sk_refcnt is 1 and sk_drops is 0.
 */
void sock_init_data ( struct socket * sock , struct sock * sk )
{
/* Start with empty receive, write and error queues. */
skb_queue_head_init ( & sk - > sk_receive_queue ) ;
skb_queue_head_init ( & sk - > sk_write_queue ) ;
skb_queue_head_init ( & sk - > sk_error_queue ) ;
2006-05-24 04:55:33 +04:00
# ifdef CONFIG_NET_DMA
skb_queue_head_init ( & sk - > sk_async_wait_queue ) ;
# endif
2005-04-17 02:20:36 +04:00
sk - > sk_send_head = NULL ;
init_timer ( & sk - > sk_timer ) ;
2007-02-09 17:24:36 +03:00
2005-04-17 02:20:36 +04:00
/* Defaults: GFP_KERNEL allocations, sysctl buffer sizes, TCP_CLOSE state. */
sk - > sk_allocation = GFP_KERNEL ;
sk - > sk_rcvbuf = sysctl_rmem_default ;
sk - > sk_sndbuf = sysctl_wmem_default ;
sk - > sk_state = TCP_CLOSE ;
2008-06-18 09:41:38 +04:00
sk_set_socket ( sk , sock ) ;
2005-04-17 02:20:36 +04:00
sock_set_flag ( sk , SOCK_ZAPPED ) ;
2007-04-11 07:10:33 +04:00
/* Link sk and sock in both directions when a socket was supplied. */
if ( sock ) {
2005-04-17 02:20:36 +04:00
sk - > sk_type = sock - > type ;
2010-04-29 15:01:49 +04:00
sk - > sk_wq = sock - > wq ;
2005-04-17 02:20:36 +04:00
sock - > sk = sk ;
} else
2010-04-29 15:01:49 +04:00
sk - > sk_wq = NULL ;
2005-04-17 02:20:36 +04:00
2010-04-09 03:03:29 +04:00
spin_lock_init ( & sk - > sk_dst_lock ) ;
2005-04-17 02:20:36 +04:00
rwlock_init ( & sk - > sk_callback_lock ) ;
2007-07-19 12:49:00 +04:00
/* The lockdep class and name for the callback lock are selected by sk_family. */
lockdep_set_class_and_name ( & sk - > sk_callback_lock ,
af_callback_keys + sk - > sk_family ,
af_family_clock_key_strings [ sk - > sk_family ] ) ;
2005-04-17 02:20:36 +04:00
/* Install the default callbacks. */
sk - > sk_state_change = sock_def_wakeup ;
sk - > sk_data_ready = sock_def_readable ;
sk - > sk_write_space = sock_def_write_space ;
sk - > sk_error_report = sock_def_error_report ;
sk - > sk_destruct = sock_def_destruct ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
/* No page cached in the per-socket frag yet; sk_frag.size is not set here. */
sk - > sk_frag . page = NULL ;
sk - > sk_frag . offset = 0 ;
2012-02-21 11:31:34 +04:00
sk - > sk_peek_off = - 1 ;
2005-04-17 02:20:36 +04:00
2010-06-13 07:30:14 +04:00
sk - > sk_peer_pid = NULL ;
sk - > sk_peer_cred = NULL ;
2005-04-17 02:20:36 +04:00
sk - > sk_write_pending = 0 ;
sk - > sk_rcvlowat = 1 ;
sk - > sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT ;
sk - > sk_sndtimeo = MAX_SCHEDULE_TIMEOUT ;
2008-04-14 08:39:26 +04:00
sk - > sk_stamp = ktime_set ( - 1L , 0 ) ;
2005-04-17 02:20:36 +04:00
2013-08-01 07:10:25 +04:00
# ifdef CONFIG_NET_RX_BUSY_POLL
2013-06-10 12:39:50 +04:00
sk - > sk_napi_id = 0 ;
2013-07-10 18:13:36 +04:00
sk - > sk_ll_usec = sysctl_net_busy_read ;
2013-06-10 12:39:50 +04:00
# endif
2013-09-24 19:20:52 +04:00
/* Both pacing rates start at ~0U (all bits set). */
sk - > sk_max_pacing_rate = ~ 0U ;
2013-10-09 02:16:00 +04:00
sk - > sk_pacing_rate = ~ 0U ;
2009-07-16 03:13:10 +04:00
/*
* Before updating sk_refcnt , we must commit prior changes to memory
* ( Documentation / RCU / rculist_nulls . txt for details )
*/
smp_wmb ( ) ;
2005-04-17 02:20:36 +04:00
atomic_set ( & sk - > sk_refcnt , 1 ) ;
2007-11-14 07:30:01 +03:00
atomic_set ( & sk - > sk_drops , 0 ) ;
2005-04-17 02:20:36 +04:00
}
2009-05-27 15:30:05 +04:00
EXPORT_SYMBOL ( sock_init_data ) ;
2005-04-17 02:20:36 +04:00
2008-02-14 02:03:16 +03:00
/*
 * lock_sock_nested - take ownership of the socket lock; may sleep.
 * @sk: socket to lock
 * @subclass: passed through to mutex_acquire() for sk_lock.dep_map
 *
 * With sk_lock.slock held and BHs disabled, calls __lock_sock() if the
 * lock is already owned, then marks the socket as owned. The spinlock is
 * dropped with plain spin_unlock(), so BHs stay disabled until the final
 * local_bh_enable().
 */
void lock_sock_nested ( struct sock * sk , int subclass )
2005-04-17 02:20:36 +04:00
{
might_sleep ( ) ;
2006-07-03 11:25:35 +04:00
spin_lock_bh ( & sk - > sk_lock . slock ) ;
2007-09-12 12:44:19 +04:00
/* NOTE(review): __lock_sock() presumably waits for the current owner to release - confirm */
if ( sk - > sk_lock . owned )
2005-04-17 02:20:36 +04:00
__lock_sock ( sk ) ;
2007-09-12 12:44:19 +04:00
sk - > sk_lock . owned = 1 ;
2006-07-03 11:25:35 +04:00
/* Drop the spinlock only; BHs are re-enabled at the end of the function. */
spin_unlock ( & sk - > sk_lock . slock ) ;
/*
* The sk_lock has mutex_lock ( ) semantics here :
*/
2006-11-09 09:44:35 +03:00
mutex_acquire ( & sk - > sk_lock . dep_map , subclass , 0 , _RET_IP_ ) ;
2006-07-03 11:25:35 +04:00
local_bh_enable ( ) ;
2005-04-17 02:20:36 +04:00
}
2006-11-09 09:44:35 +03:00
EXPORT_SYMBOL ( lock_sock_nested ) ;
2005-04-17 02:20:36 +04:00
2008-02-14 02:03:16 +03:00
/*
 * release_sock - give up ownership of the socket lock.
 * @sk: socket to release
 *
 * Sequence, all under sk_lock.slock with BHs disabled:
 *   1. run __release_sock() if sk_backlog.tail is non-NULL;
 *   2. invoke the protocol's optional release_cb() hook;
 *   3. call sock_release_ownership();
 *   4. wake any waiter queued on sk_lock.wq.
 * The mutex_release() on sk_lock.dep_map is done first, before the
 * spinlock is taken.
 */
void release_sock ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
2006-07-03 11:25:35 +04:00
/*
* The sk_lock has mutex_unlock ( ) semantics :
*/
mutex_release ( & sk - > sk_lock . dep_map , 1 , _RET_IP_ ) ;
spin_lock_bh ( & sk - > sk_lock . slock ) ;
2005-04-17 02:20:36 +04:00
if ( sk - > sk_backlog . tail )
__release_sock ( sk ) ;
tcp: TCP Small Queues
This introduce TSQ (TCP Small Queues)
TSQ goal is to reduce number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.
sk->sk_wmem_alloc not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a
given time.
TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.
As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2 : It can help to reduce
latencies of high prio packets, having smaller TSO packets.
This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.
Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.
Without reduction of nominal bandwidth, we have reduction of buffering
per bulk sender :
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)
I no longer have 4 MBytes backlogged in qdisc by a single netperf
session, and both side socket autotuning no longer use 4 Mbytes.
As skb destructor cannot restart xmit itself ( as qdisc lock might be
taken at this point ), we delegate the work to a tasklet. We use one
tasklest per cpu for performance reasons.
If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(),
to eventually send new segments.
[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
but some drivers call it in their start_xmit() handler.
These drivers should at least use BQL, or else a single TCP
session can still fill the whole NIC TX ring, since TSQ will
have no effect.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-11 09:50:31 +04:00
tcp: tcp_release_cb() should release socket ownership
Lars Persson reported following deadlock :
-000 |M:0x0:0x802B6AF8(asm) <-- arch_spin_lock
-001 |tcp_v4_rcv(skb = 0x8BD527A0) <-- sk = 0x8BE6B2A0
-002 |ip_local_deliver_finish(skb = 0x8BD527A0)
-003 |__netif_receive_skb_core(skb = 0x8BD527A0, ?)
-004 |netif_receive_skb(skb = 0x8BD527A0)
-005 |elk_poll(napi = 0x8C770500, budget = 64)
-006 |net_rx_action(?)
-007 |__do_softirq()
-008 |do_softirq()
-009 |local_bh_enable()
-010 |tcp_rcv_established(sk = 0x8BE6B2A0, skb = 0x87D3A9E0, th = 0x814EBE14, ?)
-011 |tcp_v4_do_rcv(sk = 0x8BE6B2A0, skb = 0x87D3A9E0)
-012 |tcp_delack_timer_handler(sk = 0x8BE6B2A0)
-013 |tcp_release_cb(sk = 0x8BE6B2A0)
-014 |release_sock(sk = 0x8BE6B2A0)
-015 |tcp_sendmsg(?, sk = 0x8BE6B2A0, ?, ?)
-016 |sock_sendmsg(sock = 0x8518C4C0, msg = 0x87D8DAA8, size = 4096)
-017 |kernel_sendmsg(?, ?, ?, ?, size = 4096)
-018 |smb_send_kvec()
-019 |smb_send_rqst(server = 0x87C4D400, rqst = 0x87D8DBA0)
-020 |cifs_call_async()
-021 |cifs_async_writev(wdata = 0x87FD6580)
-022 |cifs_writepages(mapping = 0x852096E4, wbc = 0x87D8DC88)
-023 |__writeback_single_inode(inode = 0x852095D0, wbc = 0x87D8DC88)
-024 |writeback_sb_inodes(sb = 0x87D6D800, wb = 0x87E4A9C0, work = 0x87D8DD88)
-025 |__writeback_inodes_wb(wb = 0x87E4A9C0, work = 0x87D8DD88)
-026 |wb_writeback(wb = 0x87E4A9C0, work = 0x87D8DD88)
-027 |wb_do_writeback(wb = 0x87E4A9C0, force_wait = 0)
-028 |bdi_writeback_workfn(work = 0x87E4A9CC)
-029 |process_one_work(worker = 0x8B045880, work = 0x87E4A9CC)
-030 |worker_thread(__worker = 0x8B045880)
-031 |kthread(_create = 0x87CADD90)
-032 |ret_from_kernel_thread(asm)
Bug occurs because __tcp_checksum_complete_user() enables BH, assuming
it is running from softirq context.
Lars trace involved a NIC without RX checksum support but other points
are problematic as well, like the prequeue stuff.
Problem is triggered by a timer, that found socket being owned by user.
tcp_release_cb() should call tcp_write_timer_handler() or
tcp_delack_timer_handler() in the appropriate context :
BH disabled and socket lock held, but 'owned' field cleared,
as if they were running from timer handlers.
Fixes: 6f458dfb4092 ("tcp: improve latencies of timer triggered events")
Reported-by: Lars Persson <lars.persson@axis.com>
Tested-by: Lars Persson <lars.persson@axis.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-10 20:50:11 +04:00
/* Warning : release_cb() might need to release sk ownership,
* ie call sock_release_ownership ( sk ) before us .
*/
tcp: TCP Small Queues
This introduce TSQ (TCP Small Queues)
TSQ goal is to reduce number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.
sk->sk_wmem_alloc not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a
given time.
TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.
As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2 : It can help to reduce
latencies of high prio packets, having smaller TSO packets.
This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.
Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.
Without reduction of nominal bandwidth, we have reduction of buffering
per bulk sender :
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)
I no longer have 4 MBytes backlogged in qdisc by a single netperf
session, and both side socket autotuning no longer use 4 Mbytes.
As skb destructor cannot restart xmit itself ( as qdisc lock might be
taken at this point ), we delegate the work to a tasklet. We use one
tasklest per cpu for performance reasons.
If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(),
to eventually send new segments.
[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
but some drivers call it in their start_xmit() handler.
These drivers should at least use BQL, or else a single TCP
session can still fill the whole NIC TX ring, since TSQ will
have no effect.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-11 09:50:31 +04:00
if ( sk - > sk_prot - > release_cb )
sk - > sk_prot - > release_cb ( sk ) ;
tcp: tcp_release_cb() should release socket ownership
Lars Persson reported following deadlock :
-000 |M:0x0:0x802B6AF8(asm) <-- arch_spin_lock
-001 |tcp_v4_rcv(skb = 0x8BD527A0) <-- sk = 0x8BE6B2A0
-002 |ip_local_deliver_finish(skb = 0x8BD527A0)
-003 |__netif_receive_skb_core(skb = 0x8BD527A0, ?)
-004 |netif_receive_skb(skb = 0x8BD527A0)
-005 |elk_poll(napi = 0x8C770500, budget = 64)
-006 |net_rx_action(?)
-007 |__do_softirq()
-008 |do_softirq()
-009 |local_bh_enable()
-010 |tcp_rcv_established(sk = 0x8BE6B2A0, skb = 0x87D3A9E0, th = 0x814EBE14, ?)
-011 |tcp_v4_do_rcv(sk = 0x8BE6B2A0, skb = 0x87D3A9E0)
-012 |tcp_delack_timer_handler(sk = 0x8BE6B2A0)
-013 |tcp_release_cb(sk = 0x8BE6B2A0)
-014 |release_sock(sk = 0x8BE6B2A0)
-015 |tcp_sendmsg(?, sk = 0x8BE6B2A0, ?, ?)
-016 |sock_sendmsg(sock = 0x8518C4C0, msg = 0x87D8DAA8, size = 4096)
-017 |kernel_sendmsg(?, ?, ?, ?, size = 4096)
-018 |smb_send_kvec()
-019 |smb_send_rqst(server = 0x87C4D400, rqst = 0x87D8DBA0)
-020 |cifs_call_async()
-021 |cifs_async_writev(wdata = 0x87FD6580)
-022 |cifs_writepages(mapping = 0x852096E4, wbc = 0x87D8DC88)
-023 |__writeback_single_inode(inode = 0x852095D0, wbc = 0x87D8DC88)
-024 |writeback_sb_inodes(sb = 0x87D6D800, wb = 0x87E4A9C0, work = 0x87D8DD88)
-025 |__writeback_inodes_wb(wb = 0x87E4A9C0, work = 0x87D8DD88)
-026 |wb_writeback(wb = 0x87E4A9C0, work = 0x87D8DD88)
-027 |wb_do_writeback(wb = 0x87E4A9C0, force_wait = 0)
-028 |bdi_writeback_workfn(work = 0x87E4A9CC)
-029 |process_one_work(worker = 0x8B045880, work = 0x87E4A9CC)
-030 |worker_thread(__worker = 0x8B045880)
-031 |kthread(_create = 0x87CADD90)
-032 |ret_from_kernel_thread(asm)
Bug occurs because __tcp_checksum_complete_user() enables BH, assuming
it is running from softirq context.
Lars trace involved a NIC without RX checksum support but other points
are problematic as well, like the prequeue stuff.
Problem is triggered by a timer, that found socket being owned by user.
tcp_release_cb() should call tcp_write_timer_handler() or
tcp_delack_timer_handler() in the appropriate context :
BH disabled and socket lock held, but 'owned' field cleared,
as if they were running from timer handlers.
Fixes: 6f458dfb4092 ("tcp: improve latencies of timer triggered events")
Reported-by: Lars Persson <lars.persson@axis.com>
Tested-by: Lars Persson <lars.persson@axis.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-10 20:50:11 +04:00
sock_release_ownership ( sk ) ;
2006-07-03 11:25:35 +04:00
if ( waitqueue_active ( & sk - > sk_lock . wq ) )
wake_up ( & sk - > sk_lock . wq ) ;
spin_unlock_bh ( & sk - > sk_lock . slock ) ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( release_sock ) ;
2010-05-26 23:20:18 +04:00
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * Intended for very short critical sections in which the caller will not
 * block.
 *
 * Returns false if the fast path was taken:
 *   sk_lock.slock is still held, owned == 0, BHs disabled.
 * Returns true if the slow path was taken:
 *   sk_lock.slock released, owned == 1, BHs enabled.
 *
 * The caller needs the return value to know which of the two states it
 * must undo when unlocking.
 */
bool lock_sock_fast ( struct sock * sk )
{
might_sleep ( ) ;
spin_lock_bh ( & sk - > sk_lock . slock ) ;
if ( ! sk - > sk_lock . owned )
/*
 * Fast path: nobody owns the socket. Return with the spinlock
 * still held and BHs still disabled; they are deliberately not
 * released here.
 */
return false ;
/* Slow path: same sequence as lock_sock_nested() with subclass 0. */
__lock_sock ( sk ) ;
sk - > sk_lock . owned = 1 ;
spin_unlock ( & sk - > sk_lock . slock ) ;
/*
 * The sk_lock has mutex_lock() semantics here:
 */
mutex_acquire ( & sk - > sk_lock . dep_map , 0 , 0 , _RET_IP_ ) ;
local_bh_enable ( ) ;
return true ;
}
EXPORT_SYMBOL ( lock_sock_fast ) ;
2005-04-17 02:20:36 +04:00
/*
 * sock_get_timestamp - copy the socket's last timestamp to user space
 * as a struct timeval.
 * @sk: socket to query
 * @userstamp: user-space destination
 *
 * Enables SOCK_TIMESTAMP on the socket if it was not already set.
 * Returns 0 on success, -ENOENT if sk_stamp still has tv_sec == -1 (the
 * value sock_init_data() stores), or -EFAULT if the copy to user space
 * fails.
 */
int sock_get_timestamp ( struct sock * sk , struct timeval __user * userstamp )
2007-02-09 17:24:36 +03:00
{
2007-04-20 03:16:32 +04:00
struct timeval tv ;
2005-04-17 02:20:36 +04:00
if ( ! sock_flag ( sk , SOCK_TIMESTAMP ) )
2009-02-12 08:03:38 +03:00
sock_enable_timestamp ( sk , SOCK_TIMESTAMP ) ;
2007-04-20 03:16:32 +04:00
tv = ktime_to_timeval ( sk - > sk_stamp ) ;
if ( tv . tv_sec = = - 1 )
2005-04-17 02:20:36 +04:00
return - ENOENT ;
2007-04-20 03:16:32 +04:00
/* A zero seconds field is replaced by the current real time, which is also stored back in sk_stamp. */
if ( tv . tv_sec = = 0 ) {
sk - > sk_stamp = ktime_get_real ( ) ;
tv = ktime_to_timeval ( sk - > sk_stamp ) ;
}
return copy_to_user ( userstamp , & tv , sizeof ( tv ) ) ? - EFAULT : 0 ;
2007-02-09 17:24:36 +03:00
}
2005-04-17 02:20:36 +04:00
EXPORT_SYMBOL ( sock_get_timestamp ) ;
2007-03-19 03:33:16 +03:00
/*
 * sock_get_timestampns - copy the socket's last timestamp to user space
 * as a struct timespec.
 * @sk: socket to query
 * @userstamp: user-space destination
 *
 * Same logic as sock_get_timestamp(), but converts sk_stamp with
 * ktime_to_timespec(). Returns 0 on success, -ENOENT if tv_sec == -1,
 * or -EFAULT if the copy to user space fails.
 */
int sock_get_timestampns ( struct sock * sk , struct timespec __user * userstamp )
{
struct timespec ts ;
if ( ! sock_flag ( sk , SOCK_TIMESTAMP ) )
2009-02-12 08:03:38 +03:00
sock_enable_timestamp ( sk , SOCK_TIMESTAMP ) ;
2007-03-19 03:33:16 +03:00
ts = ktime_to_timespec ( sk - > sk_stamp ) ;
if ( ts . tv_sec = = - 1 )
return - ENOENT ;
/* A zero seconds field is replaced by the current real time, which is also stored back in sk_stamp. */
if ( ts . tv_sec = = 0 ) {
sk - > sk_stamp = ktime_get_real ( ) ;
ts = ktime_to_timespec ( sk - > sk_stamp ) ;
}
return copy_to_user ( userstamp , & ts , sizeof ( ts ) ) ? - EFAULT : 0 ;
}
EXPORT_SYMBOL ( sock_get_timestampns ) ;
2009-02-12 08:03:38 +03:00
/*
 * sock_enable_timestamp - set a timestamping flag on a socket.
 * @sk: socket to modify
 * @flag: socket flag to set
 *
 * Does nothing if @flag is already set. Otherwise sets it and calls
 * net_enable_timestamp(), but only when none of the SK_FLAGS_TIMESTAMP
 * bits were set in sk_flags beforehand.
 */
void sock_enable_timestamp ( struct sock * sk , int flag )
2007-02-09 17:24:36 +03:00
{
2009-02-12 08:03:38 +03:00
if ( ! sock_flag ( sk , flag ) ) {
2011-11-28 16:04:18 +04:00
/* Snapshot the flags before setting ours, so the test below sees the prior state. */
unsigned long previous_flags = sk - > sk_flags ;
2009-02-12 08:03:38 +03:00
sock_set_flag ( sk , flag ) ;
/*
* we just set one of the two flags which require net
* time stamping , but time stamping might have been on
* already because of the other one
*/
2011-11-28 16:04:18 +04:00
if ( ! ( previous_flags & SK_FLAGS_TIMESTAMP ) )
2009-02-12 08:03:38 +03:00
net_enable_timestamp ( ) ;
2005-04-17 02:20:36 +04:00
}
}
2013-07-19 21:40:09 +04:00
int sock_recv_errqueue ( struct sock * sk , struct msghdr * msg , int len ,
int level , int type )
{
struct sock_exterr_skb * serr ;
struct sk_buff * skb , * skb2 ;
int copied , err ;
err = - EAGAIN ;
skb = skb_dequeue ( & sk - > sk_error_queue ) ;
if ( skb = = NULL )
goto out ;
copied = skb - > len ;
if ( copied > len ) {
msg - > msg_flags | = MSG_TRUNC ;
copied = len ;
}
err = skb_copy_datagram_iovec ( skb , 0 , msg - > msg_iov , copied ) ;
if ( err )
goto out_free_skb ;
sock_recv_timestamp ( msg , sk , skb ) ;
serr = SKB_EXT_ERR ( skb ) ;
put_cmsg ( msg , level , type , sizeof ( serr - > ee ) , & serr - > ee ) ;
msg - > msg_flags | = MSG_ERRQUEUE ;
err = copied ;
/* Reset and regenerate socket error */
spin_lock_bh ( & sk - > sk_error_queue . lock ) ;
sk - > sk_err = 0 ;
if ( ( skb2 = skb_peek ( & sk - > sk_error_queue ) ) ! = NULL ) {
sk - > sk_err = SKB_EXT_ERR ( skb2 ) - > ee . ee_errno ;
spin_unlock_bh ( & sk - > sk_error_queue . lock ) ;
sk - > sk_error_report ( sk ) ;
} else
spin_unlock_bh ( & sk - > sk_error_queue . lock ) ;
out_free_skb :
kfree_skb ( skb ) ;
out :
return err ;
}
EXPORT_SYMBOL ( sock_recv_errqueue ) ;
2005-04-17 02:20:36 +04:00
/*
 * Get a socket option on a socket.
 *
 * Simply forwards to the getsockopt handler of the socket's protocol.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *owner = sock->sk;
	struct proto *ops = owner->sk_prot;

	return ops->getsockopt(owner, level, optname, optval, optlen);
}
EXPORT_SYMBOL ( sock_common_getsockopt ) ;
2006-03-21 09:45:21 +03:00
# ifdef CONFIG_COMPAT
2006-03-21 09:48:35 +03:00
/*
 * Compat variant of sock_common_getsockopt(): uses the protocol's
 * compat_getsockopt handler when one is provided, and falls back to the
 * regular getsockopt handler otherwise.
 */
int compat_sock_common_getsockopt ( struct socket * sock , int level , int optname ,
char __user * optval , int __user * optlen )
2006-03-21 09:45:21 +03:00
{
struct sock * sk = sock - > sk ;
2007-03-07 00:44:06 +03:00
if ( sk - > sk_prot - > compat_getsockopt ! = NULL )
2006-03-21 09:48:35 +03:00
return sk - > sk_prot - > compat_getsockopt ( sk , level , optname ,
optval , optlen ) ;
2006-03-21 09:45:21 +03:00
return sk - > sk_prot - > getsockopt ( sk , level , optname , optval , optlen ) ;
}
EXPORT_SYMBOL ( compat_sock_common_getsockopt ) ;
# endif
2005-04-17 02:20:36 +04:00
/*
 * sock_common_recvmsg - generic recvmsg entry point.
 *
 * Splits MSG_DONTWAIT out of @flags and hands it to the protocol's
 * recvmsg handler as a separate argument. On success (non-negative
 * return) the address length reported by the protocol is stored in
 * msg->msg_namelen; on failure @msg is left untouched here.
 */
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int noblock = flags & MSG_DONTWAIT;
	int addr_len = 0;
	int ret;

	ret = sk->sk_prot->recvmsg(iocb, sk, msg, size, noblock,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (ret < 0)
		return ret;

	msg->msg_namelen = addr_len;
	return ret;
}
EXPORT_SYMBOL ( sock_common_recvmsg ) ;
/*
 * Set socket options on an inet socket.
 *
 * Forwards the request unchanged to the setsockopt handler of the
 * socket's protocol and returns its result.
 */
int sock_common_setsockopt ( struct socket * sock , int level , int optname ,
2009-10-01 03:12:20 +04:00
char __user * optval , unsigned int optlen )
2005-04-17 02:20:36 +04:00
{
struct sock * sk = sock - > sk ;
return sk - > sk_prot - > setsockopt ( sk , level , optname , optval , optlen ) ;
}
EXPORT_SYMBOL ( sock_common_setsockopt ) ;
2006-03-21 09:45:21 +03:00
# ifdef CONFIG_COMPAT
2006-03-21 09:48:35 +03:00
/*
 * Compat variant of sock_common_setsockopt(): uses the protocol's
 * compat_setsockopt handler when one is provided, and falls back to the
 * regular setsockopt handler otherwise.
 */
int compat_sock_common_setsockopt ( struct socket * sock , int level , int optname ,
2009-10-01 03:12:20 +04:00
char __user * optval , unsigned int optlen )
2006-03-21 09:45:21 +03:00
{
struct sock * sk = sock - > sk ;
2006-03-21 09:48:35 +03:00
if ( sk - > sk_prot - > compat_setsockopt ! = NULL )
return sk - > sk_prot - > compat_setsockopt ( sk , level , optname ,
optval , optlen ) ;
2006-03-21 09:45:21 +03:00
return sk - > sk_prot - > setsockopt ( sk , level , optname , optval , optlen ) ;
}
EXPORT_SYMBOL ( compat_sock_common_setsockopt ) ;
# endif
2005-04-17 02:20:36 +04:00
/*
 * sk_common_release - common socket teardown.
 * @sk: socket being released
 *
 * In order: calls the protocol's optional destroy hook, unhashes the
 * socket, orphans it, frees its xfrm policy, runs the refcount debug
 * hook, puts the page held in sk_frag (if any) and finally calls
 * sock_put().
 */
void sk_common_release ( struct sock * sk )
{
if ( sk - > sk_prot - > destroy )
sk - > sk_prot - > destroy ( sk ) ;
/*
 * Observation: when sk_common_release is called, processes have
 * no access to the socket. But the network stack still has.
 * Step one, detach it from networking:
 *
 * A. Remove from hash tables.
 */
sk - > sk_prot - > unhash ( sk ) ;
/*
 * At this point the socket cannot receive new packets, but it is possible
 * that some packets are in flight because some CPU runs the receiver and
 * did the hash table lookup before we unhashed the socket. They will reach
 * the receive queue and will be purged by the socket destructor.
 *
 * Also we still have packets pending on the receive queue and probably
 * our own packets waiting in device queues. sock_destroy will drain the
 * receive queue, but transmitted packets will delay socket destruction
 * until the last reference is released.
 */
sock_orphan ( sk ) ;
xfrm_sk_free_policy ( sk ) ;
2005-08-10 06:45:38 +04:00
sk_refcnt_debug_release ( sk ) ;
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 03:04:42 +04:00
/* Put the page still referenced by sk_frag and clear the pointer. */
if ( sk - > sk_frag . page ) {
put_page ( sk - > sk_frag . page ) ;
sk - > sk_frag . page = NULL ;
}
2005-04-17 02:20:36 +04:00
sock_put ( sk ) ;
}
EXPORT_SYMBOL ( sk_common_release ) ;
2008-03-29 02:38:17 +03:00
# ifdef CONFIG_PROC_FS
# define PROTO_INUSE_NR 64 /* should be enough for the first time */
2008-03-29 02:38:43 +03:00
/*
 * Table of in-use counters, one slot per registered protocol. Slots are
 * indexed by proto->inuse_idx; instances of this struct are per-CPU.
 */
struct prot_inuse {
int val [ PROTO_INUSE_NR ] ;
} ;
2008-03-29 02:38:17 +03:00
static DECLARE_BITMAP ( proto_inuse_idx , PROTO_INUSE_NR ) ;
2008-04-01 06:42:16 +04:00
# ifdef CONFIG_NET_NS
/*
 * Add @val to this CPU's in-use counter for @prot in the per-namespace
 * table net->core.inuse.
 * NOTE(review): __this_cpu_add() is used, so callers presumably must
 * already be non-preemptible - confirm.
 */
void sock_prot_inuse_add ( struct net * net , struct proto * prot , int val )
{
2010-07-19 14:48:49 +04:00
__this_cpu_add ( net - > core . inuse - > val [ prot - > inuse_idx ] , val ) ;
2008-04-01 06:42:16 +04:00
}
EXPORT_SYMBOL_GPL ( sock_prot_inuse_add ) ;
int sock_prot_inuse_get ( struct net * net , struct proto * prot )
{
int cpu , idx = prot - > inuse_idx ;
int res = 0 ;
for_each_possible_cpu ( cpu )
res + = per_cpu_ptr ( net - > core . inuse , cpu ) - > val [ idx ] ;
return res > = 0 ? res : 0 ;
}
EXPORT_SYMBOL_GPL ( sock_prot_inuse_get ) ;
2010-01-17 06:35:32 +03:00
/*
 * Per-namespace init: allocate the per-CPU in-use counter table.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int __net_init sock_inuse_init_net ( struct net * net )
2008-04-01 06:42:16 +04:00
{
net - > core . inuse = alloc_percpu ( struct prot_inuse ) ;
return net - > core . inuse ? 0 : - ENOMEM ;
}
2010-01-17 06:35:32 +03:00
/* Per-namespace exit: free the per-CPU in-use counter table. */
static void __net_exit sock_inuse_exit_net ( struct net * net )
2008-04-01 06:42:16 +04:00
{
free_percpu ( net - > core . inuse ) ;
}
/* Per-namespace hooks that allocate and free net->core.inuse. */
static struct pernet_operations net_inuse_ops = {
. init = sock_inuse_init_net ,
. exit = sock_inuse_exit_net ,
} ;
/*
 * Register net_inuse_ops as a pernet subsystem. A registration failure
 * is treated as fatal and panics; otherwise returns 0.
 */
static __init int net_inuse_init ( void )
{
if ( register_pernet_subsys ( & net_inuse_ops ) )
panic ( " Cannot initialize net inuse counters " ) ;
return 0 ;
}
core_initcall ( net_inuse_init ) ;
# else
2008-03-29 02:38:43 +03:00
static DEFINE_PER_CPU ( struct prot_inuse , prot_inuse ) ;
2008-04-01 06:41:46 +04:00
/*
 * Add @val to this CPU's in-use counter for @prot in the single global
 * prot_inuse table. @net is not used in this variant.
 */
void sock_prot_inuse_add ( struct net * net , struct proto * prot , int val )
2008-03-29 02:38:43 +03:00
{
2010-07-19 14:48:49 +04:00
__this_cpu_add ( prot_inuse . val [ prot - > inuse_idx ] , val ) ;
2008-03-29 02:38:43 +03:00
}
EXPORT_SYMBOL_GPL ( sock_prot_inuse_add ) ;
2008-04-01 06:41:46 +04:00
/*
 * Sum the per-CPU in-use counters of @prot from the global prot_inuse
 * table over all possible CPUs. A negative sum is reported as 0.
 * @net is not used in this variant.
 */
int sock_prot_inuse_get ( struct net * net , struct proto * prot )
2008-03-29 02:38:43 +03:00
{
int cpu , idx = prot - > inuse_idx ;
int res = 0 ;
for_each_possible_cpu ( cpu )
res + = per_cpu ( prot_inuse , cpu ) . val [ idx ] ;
return res > = 0 ? res : 0 ;
}
EXPORT_SYMBOL_GPL ( sock_prot_inuse_get ) ;
2008-04-01 06:42:16 +04:00
# endif
2008-03-29 02:38:17 +03:00
/*
 * Pick the first free slot in the proto_inuse_idx bitmap for @prot.
 *
 * The last slot (PROTO_INUSE_NR - 1) acts as an overflow marker: when it
 * is the one found, an error is logged and its bit is left clear, so any
 * later protocol that runs out of slots is given the same index.
 */
static void assign_proto_idx ( struct proto * prot )
{
prot - > inuse_idx = find_first_zero_bit ( proto_inuse_idx , PROTO_INUSE_NR ) ;
if ( unlikely ( prot - > inuse_idx = = PROTO_INUSE_NR - 1 ) ) {
2012-05-16 23:58:40 +04:00
pr_err ( " PROTO_INUSE_NR exhausted \n " ) ;
2008-03-29 02:38:17 +03:00
return ;
}
set_bit ( prot - > inuse_idx , proto_inuse_idx ) ;
}
static void release_proto_idx ( struct proto * prot )
{
if ( prot - > inuse_idx ! = PROTO_INUSE_NR - 1 )
clear_bit ( prot - > inuse_idx , proto_inuse_idx ) ;
}
# else
/* No-op stub used when CONFIG_PROC_FS is not set. */
static inline void assign_proto_idx ( struct proto * prot )
{
}
/* No-op stub used when CONFIG_PROC_FS is not set. */
static inline void release_proto_idx ( struct proto * prot )
{
}
# endif
2007-11-07 13:23:38 +03:00
int proto_register ( struct proto * prot , int alloc_slab )
{
2005-04-17 02:20:36 +04:00
if ( alloc_slab ) {
prot - > slab = kmem_cache_create ( prot - > name , prot - > obj_size , 0 ,
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the beginning. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we don't want to send the same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
SLAB_HWCACHE_ALIGN | prot - > slab_flags ,
NULL ) ;
2005-04-17 02:20:36 +04:00
if ( prot - > slab = = NULL ) {
2012-05-16 23:58:40 +04:00
pr_crit ( " %s: Can't create sock SLAB cache! \n " ,
prot - > name ) ;
2008-03-29 02:39:10 +03:00
goto out ;
2005-04-17 02:20:36 +04:00
}
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
if ( prot - > rsk_prot ! = NULL ) {
2010-02-17 12:34:12 +03:00
prot - > rsk_prot - > slab_name = kasprintf ( GFP_KERNEL , " request_sock_%s " , prot - > name ) ;
2008-11-22 03:45:22 +03:00
if ( prot - > rsk_prot - > slab_name = = NULL )
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
goto out_free_sock_slab ;
2008-11-22 03:45:22 +03:00
prot - > rsk_prot - > slab = kmem_cache_create ( prot - > rsk_prot - > slab_name ,
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
prot - > rsk_prot - > obj_size , 0 ,
2007-07-20 05:11:58 +04:00
SLAB_HWCACHE_ALIGN , NULL ) ;
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
if ( prot - > rsk_prot - > slab = = NULL ) {
2012-05-16 23:58:40 +04:00
pr_crit ( " %s: Can't create request sock SLAB cache! \n " ,
prot - > name ) ;
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basically tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
goto out_free_request_sock_slab_name ;
}
}
2005-08-10 07:09:30 +04:00
2005-12-14 10:25:19 +03:00
if ( prot - > twsk_prot ! = NULL ) {
2010-02-17 12:34:12 +03:00
prot - > twsk_prot - > twsk_slab_name = kasprintf ( GFP_KERNEL , " tw_sock_%s " , prot - > name ) ;
2005-08-10 07:09:30 +04:00
2008-11-22 03:45:22 +03:00
if ( prot - > twsk_prot - > twsk_slab_name = = NULL )
2005-08-10 07:09:30 +04:00
goto out_free_request_sock_slab ;
2005-12-14 10:25:19 +03:00
prot - > twsk_prot - > twsk_slab =
2008-11-22 03:45:22 +03:00
kmem_cache_create ( prot - > twsk_prot - > twsk_slab_name ,
2005-12-14 10:25:19 +03:00
prot - > twsk_prot - > twsk_obj_size ,
2008-11-17 06:40:17 +03:00
0 ,
SLAB_HWCACHE_ALIGN |
prot - > slab_flags ,
2007-07-20 05:11:58 +04:00
NULL ) ;
2005-12-14 10:25:19 +03:00
if ( prot - > twsk_prot - > twsk_slab = = NULL )
2005-08-10 07:09:30 +04:00
goto out_free_timewait_sock_slab_name ;
}
2005-04-17 02:20:36 +04:00
}
2011-12-16 04:51:59 +04:00
mutex_lock ( & proto_list_mutex ) ;
2005-04-17 02:20:36 +04:00
list_add ( & prot - > node , & proto_list ) ;
2008-03-29 02:38:17 +03:00
assign_proto_idx ( prot ) ;
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2007-11-07 13:23:38 +03:00
return 0 ;
2005-08-10 07:09:30 +04:00
out_free_timewait_sock_slab_name :
2008-11-22 03:45:22 +03:00
kfree ( prot - > twsk_prot - > twsk_slab_name ) ;
2005-08-10 07:09:30 +04:00
out_free_request_sock_slab :
if ( prot - > rsk_prot & & prot - > rsk_prot - > slab ) {
kmem_cache_destroy ( prot - > rsk_prot - > slab ) ;
prot - > rsk_prot - > slab = NULL ;
}
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basically tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
out_free_request_sock_slab_name :
2010-03-06 04:04:45 +03:00
if ( prot - > rsk_prot )
kfree ( prot - > rsk_prot - > slab_name ) ;
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basically tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
out_free_sock_slab :
kmem_cache_destroy ( prot - > slab ) ;
prot - > slab = NULL ;
2007-11-07 13:23:38 +03:00
out :
return - ENOBUFS ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( proto_register ) ;
/*
 * proto_unregister - undo the registration of a protocol.
 * @prot: protocol descriptor being removed.
 *
 * With proto_list_mutex held, releases the protocol index through
 * release_proto_idx() and unlinks prot->node from its list.  After the
 * mutex is dropped, each slab cache that exists is destroyed and its
 * pointer reset to NULL: prot->slab, prot->rsk_prot->slab and
 * prot->twsk_prot->twsk_slab.  The request-sock and timewait-sock cache
 * names are kfree()d together with their caches; the name pointers
 * themselves are not cleared here.
 */
void proto_unregister ( struct proto * prot )
{
/* Index release and list removal happen under proto_list_mutex. */
2011-12-16 04:51:59 +04:00
mutex_lock ( & proto_list_mutex ) ;
2008-03-29 02:38:17 +03:00
release_proto_idx ( prot ) ;
2005-09-07 06:47:50 +04:00
list_del ( & prot - > node ) ;
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2005-04-17 02:20:36 +04:00
/* Destroy the main sock cache only if one was created. */
if ( prot - > slab ! = NULL ) {
kmem_cache_destroy ( prot - > slab ) ;
prot - > slab = NULL ;
}
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
if ( prot - > rsk_prot ! = NULL & & prot - > rsk_prot - > slab ! = NULL ) {
kmem_cache_destroy ( prot - > rsk_prot - > slab ) ;
2008-11-22 03:45:22 +03:00
kfree ( prot - > rsk_prot - > slab_name ) ;
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
prot - > rsk_prot - > slab = NULL ;
}
2005-12-14 10:25:19 +03:00
/* Same teardown for the timewait-sock cache and its allocated name. */
if ( prot - > twsk_prot ! = NULL & & prot - > twsk_prot - > twsk_slab ! = NULL ) {
kmem_cache_destroy ( prot - > twsk_prot - > twsk_slab ) ;
2008-11-22 03:45:22 +03:00
kfree ( prot - > twsk_prot - > twsk_slab_name ) ;
2005-12-14 10:25:19 +03:00
prot - > twsk_prot - > twsk_slab = NULL ;
2005-08-10 07:09:30 +04:00
}
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( proto_unregister ) ;
# ifdef CONFIG_PROC_FS
/*
 * seq_file start callback for the protocol listing.
 *
 * Takes proto_list_mutex and returns whatever seq_list_start_head()
 * yields for proto_list at offset *pos.  The mutex is still held on
 * return; proto_seq_stop() is the function that drops it.
 */
static void * proto_seq_start ( struct seq_file * seq , loff_t * pos )
2011-12-16 04:51:59 +04:00
__acquires ( proto_list_mutex )
2005-04-17 02:20:36 +04:00
{
2011-12-16 04:51:59 +04:00
mutex_lock ( & proto_list_mutex ) ;
2007-07-10 00:15:14 +04:00
return seq_list_start_head ( & proto_list , * pos ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * seq_file next callback: delegates to seq_list_next() with the current
 * element v, proto_list as the list head, and the position pointer pos.
 * The seq argument is unused.
 */
static void * proto_seq_next ( struct seq_file * seq , void * v , loff_t * pos )
{
2007-07-10 00:15:14 +04:00
return seq_list_next ( v , & proto_list , pos ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * seq_file stop callback: releases proto_list_mutex, which
 * proto_seq_start() acquired.  Neither argument is used.
 */
static void proto_seq_stop ( struct seq_file * seq , void * v )
2011-12-16 04:51:59 +04:00
__releases ( proto_list_mutex )
2005-04-17 02:20:36 +04:00
{
2011-12-16 04:51:59 +04:00
mutex_unlock ( & proto_list_mutex ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * proto_method_implemented - report whether a protocol callback is set.
 * @method: callback pointer to test; may be NULL.
 *
 * Returns the single character 'y' when @method is non-NULL and 'n' when
 * it is NULL.  The result feeds the one-character "%2c" columns printed
 * by proto_seq_printf(), so it must be a plain character constant and
 * not a padded multi-character one.
 */
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
2011-12-12 01:47:02 +04:00
/*
 * Value for the memory column of the protocol listing.
 *
 * Returns proto_memory_allocated(proto) when proto->memory_allocated is
 * non-NULL, and -1L when the protocol has no such counter.
 */
static long sock_prot_memory_allocated ( struct proto * proto )
{
2012-04-25 17:47:29 +04:00
return proto - > memory_allocated ! = NULL ? proto_memory_allocated ( proto ) : - 1L ;
2011-12-12 01:47:02 +04:00
}
/*
 * Text for the press column of the protocol listing.
 *
 * When proto->memory_pressure is NULL the NI string is returned
 * (presumably short for not implemented - confirm).  Otherwise the yes
 * or no string is returned according to proto_memory_pressure(proto).
 * The result is a string literal and must not be freed or modified.
 */
static char * sock_prot_memory_pressure ( struct proto * proto )
{
return proto - > memory_pressure ! = NULL ?
proto_memory_pressure ( proto ) ? " yes " : " no " : " NI " ;
}
2005-04-17 02:20:36 +04:00
/*
 * Emit one row of the protocol listing for proto into seq.
 *
 * The row holds, in order: name, obj_size, the in-use socket count from
 * sock_prot_inuse_get() for the net namespace of seq, allocated memory,
 * memory pressure state, max_header, whether a slab cache exists, the
 * owning module name, and then one flag character per protocol method.
 */
static void proto_seq_printf ( struct seq_file * seq , struct proto * proto )
{
2011-12-12 01:47:02 +04:00
2010-11-10 02:24:26 +03:00
seq_printf ( seq , " %-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2005-04-17 02:20:36 +04:00
" %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c \n " ,
proto - > name ,
proto - > obj_size ,
2008-11-20 02:14:01 +03:00
sock_prot_inuse_get ( seq_file_net ( seq ) , proto ) ,
2011-12-12 01:47:02 +04:00
sock_prot_memory_allocated ( proto ) ,
sock_prot_memory_pressure ( proto ) ,
2005-04-17 02:20:36 +04:00
proto - > max_header ,
proto - > slab = = NULL ? " no " : " yes " ,
module_name ( proto - > owner ) ,
/*
 * One flag per method pointer, produced by proto_method_implemented().
 * The order follows the two-letter column header that proto_seq_show()
 * prints (cl co di ac io in de sh ss gs se re sp bi br ha uh gp em).
 */
proto_method_implemented ( proto - > close ) ,
proto_method_implemented ( proto - > connect ) ,
proto_method_implemented ( proto - > disconnect ) ,
proto_method_implemented ( proto - > accept ) ,
proto_method_implemented ( proto - > ioctl ) ,
proto_method_implemented ( proto - > init ) ,
proto_method_implemented ( proto - > destroy ) ,
proto_method_implemented ( proto - > shutdown ) ,
proto_method_implemented ( proto - > setsockopt ) ,
proto_method_implemented ( proto - > getsockopt ) ,
proto_method_implemented ( proto - > sendmsg ) ,
proto_method_implemented ( proto - > recvmsg ) ,
proto_method_implemented ( proto - > sendpage ) ,
proto_method_implemented ( proto - > bind ) ,
proto_method_implemented ( proto - > backlog_rcv ) ,
proto_method_implemented ( proto - > hash ) ,
proto_method_implemented ( proto - > unhash ) ,
proto_method_implemented ( proto - > get_port ) ,
proto_method_implemented ( proto - > enter_memory_pressure ) ) ;
}
/*
 * seq_file show callback for the protocol listing.
 *
 * When v is the address of proto_list itself (the list head rather than
 * an entry), the column header line is printed.  Otherwise v is taken
 * to be the node member of a struct proto and that protocol is printed
 * with proto_seq_printf().  Always returns 0.
 */
static int proto_seq_show ( struct seq_file * seq , void * v )
{
2007-07-10 00:15:14 +04:00
if ( v = = & proto_list )
2005-04-17 02:20:36 +04:00
seq_printf ( seq , " %-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s " ,
" protocol " ,
" size " ,
" sockets " ,
" memory " ,
" press " ,
" maxhdr " ,
" slab " ,
" module " ,
" cl co di ac io in de sh ss gs se re sp bi br ha uh gp em \n " ) ;
else
2007-07-10 00:15:14 +04:00
proto_seq_printf ( seq , list_entry ( v , struct proto , node ) ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2007-03-13 00:34:29 +03:00
/*
 * Iterator callbacks for the protocol listing: start takes
 * proto_list_mutex, stop releases it, next walks proto_list and show
 * prints either the header or one protocol row.
 */
static const struct seq_operations proto_seq_ops = {
2005-04-17 02:20:36 +04:00
. start = proto_seq_start ,
. next = proto_seq_next ,
. stop = proto_seq_stop ,
. show = proto_seq_show ,
} ;
/*
 * open handler for the protocol listing file.  Calls seq_open_net() with
 * proto_seq_ops and a private area of sizeof(struct seq_net_private),
 * and returns its result unchanged.
 */
static int proto_seq_open ( struct inode * inode , struct file * file )
{
2008-11-20 02:14:01 +03:00
return seq_open_net ( inode , file , & proto_seq_ops ,
sizeof ( struct seq_net_private ) ) ;
2005-04-17 02:20:36 +04:00
}
2007-02-12 11:55:35 +03:00
/*
 * File operations for the protocol listing: opened through
 * proto_seq_open(), read and seek via the generic seq_read() and
 * seq_lseek(), released with seq_release_net().
 */
static const struct file_operations proto_seq_fops = {
2005-04-17 02:20:36 +04:00
. owner = THIS_MODULE ,
. open = proto_seq_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2008-11-20 02:14:01 +03:00
. release = seq_release_net ,
} ;
/*
 * Per-namespace init hook: creates the protocols proc entry under
 * net->proc_net with mode S_IRUGO and proto_seq_fops as its operations.
 * Returns 0 on success, or -ENOMEM when proc_create() fails.
 */
static __net_init int proto_init_net ( struct net * net )
{
2013-02-18 05:34:54 +04:00
if ( ! proc_create ( " protocols " , S_IRUGO , net - > proc_net , & proto_seq_fops ) )
2008-11-20 02:14:01 +03:00
return - ENOMEM ;
return 0 ;
}
/*
 * Per-namespace exit hook: removes the protocols proc entry from
 * net->proc_net, undoing proto_init_net().
 */
static __net_exit void proto_exit_net ( struct net * net )
{
2013-02-18 05:34:56 +04:00
remove_proc_entry ( " protocols " , net - > proc_net ) ;
2008-11-20 02:14:01 +03:00
}
/*
 * Per-network-namespace operations that pair proto_init_net() with
 * proto_exit_net(); registered by proto_init().
 */
static __net_initdata struct pernet_operations proto_net_ops = {
. init = proto_init_net ,
. exit = proto_exit_net ,
2005-04-17 02:20:36 +04:00
} ;
/*
 * Boot-time setup for the protocol listing: registers proto_net_ops with
 * register_pernet_subsys() and returns its result.  Hooked in through
 * subsys_initcall() below.
 */
static int __init proto_init ( void )
{
2008-11-20 02:14:01 +03:00
return register_pernet_subsys ( & proto_net_ops ) ;
2005-04-17 02:20:36 +04:00
}
subsys_initcall ( proto_init ) ;
# endif /* PROC_FS */