2005-04-17 02:20:36 +04:00
/*
* UDP over IPv6
2007-02-09 17:24:49 +03:00
* Linux INET6 implementation
2005-04-17 02:20:36 +04:00
*
* Authors :
2007-02-09 17:24:49 +03:00
* Pedro Roque < roque @ di . fc . ul . pt >
2005-04-17 02:20:36 +04:00
*
* Based on linux / ipv4 / udp . c
*
* Fixes :
* Hideaki YOSHIFUJI : sin6_scope_id support
* YOSHIFUJI Hideaki @ USAGI and : Support IPV6_V6ONLY socket option , which
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
* a single port at the same time .
* Kazunori MIYAZAWA @ USAGI : change process style to use ip6_append_data
* YOSHIFUJI Hideaki @ USAGI : convert / proc / net / udp6 to seq_file .
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
# include <linux/errno.h>
# include <linux/types.h>
# include <linux/socket.h>
# include <linux/sockios.h>
# include <linux/net.h>
# include <linux/in6.h>
# include <linux/netdevice.h>
# include <linux/if_arp.h>
# include <linux/ipv6.h>
# include <linux/icmpv6.h>
# include <linux/init.h>
2007-12-11 22:30:32 +03:00
# include <linux/module.h>
2005-12-14 10:16:37 +03:00
# include <linux/skbuff.h>
2005-04-17 02:20:36 +04:00
# include <asm/uaccess.h>
# include <net/ndisc.h>
# include <net/protocol.h>
# include <net/transp_v6.h>
# include <net/ip6_route.h>
# include <net/raw.h>
2005-08-10 07:08:28 +04:00
# include <net/tcp_states.h>
2005-04-17 02:20:36 +04:00
# include <net/ip6_checksum.h>
# include <net/xfrm.h>
# include <linux/proc_fs.h>
# include <linux/seq_file.h>
2006-11-27 22:10:57 +03:00
# include "udp_impl.h"
2005-04-17 02:20:36 +04:00
2008-03-23 02:51:21 +03:00
int udp_v6_get_port ( struct sock * sk , unsigned short snum )
2005-04-17 02:20:36 +04:00
{
2008-03-23 02:51:21 +03:00
return udp_lib_get_port ( sk , snum , ipv6_rcv_saddr_equal ) ;
2005-04-17 02:20:36 +04:00
}
2008-10-29 11:41:45 +03:00
/*
 * Rate how well socket @sk matches an incoming datagram.
 *
 * Returns -1 when @sk cannot receive the datagram at all (wrong netns,
 * wrong hashed port, not PF_INET6, or one of its bound/connected
 * attributes contradicts the packet).  Otherwise returns the number of
 * attributes (remote port, local address, remote address, bound device)
 * that the socket has set and that agree with the packet: 0..4.
 *
 * @dport is accepted for symmetry with the callers but is not read here;
 * the local port is compared through @hnum against sk->sk_hash.
 */
static inline int compute_score(struct sock *sk, struct net *net,
				unsigned short hnum,
				struct in6_addr *saddr, __be16 sport,
				struct in6_addr *daddr, __be16 dport,
				int dif)
{
	struct ipv6_pinfo *np;
	struct inet_sock *inet;
	int score = 0;

	/* Basic eligibility: same namespace, same port hash, IPv6 socket. */
	if (!net_eq(sock_net(sk), net) || sk->sk_hash != hnum ||
	    sk->sk_family != PF_INET6)
		return -1;

	np = inet6_sk(sk);
	inet = inet_sk(sk);

	/* Connected remote port must equal the packet's source port. */
	if (inet->dport) {
		if (inet->dport != sport)
			return -1;
		score++;
	}

	/* Bound local address must equal the packet's destination. */
	if (!ipv6_addr_any(&np->rcv_saddr)) {
		if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
			return -1;
		score++;
	}

	/* Connected remote address must equal the packet's source. */
	if (!ipv6_addr_any(&np->daddr)) {
		if (!ipv6_addr_equal(&np->daddr, saddr))
			return -1;
		score++;
	}

	/* Bound device must be the receiving interface. */
	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		score++;
	}

	return score;
}
2008-01-31 16:07:57 +03:00
/*
 * Lockless (RCU) lookup of the best matching socket for a unicast
 * datagram in @udptable.
 *
 * Theory of operation (from the change that introduced the RCU lookup):
 * the chain is walked under rcu_read_lock() only, so no memory writes
 * happen in the fast path.  UDP sockets come from a SLAB_DESTROY_BY_RCU
 * cache, therefore a socket seen during the walk may be freed, reused
 * and inserted into a different chain (or even the same one) while we
 * are looking at it.  To cope with that:
 *
 *  - after the walk, the "nulls" end marker must carry the hash of the
 *    chain we started on; if it does not, we wandered onto another chain
 *    and the lookup is restarted from the beginning;
 *  - a reference is taken with atomic_inc_not_zero(); a socket whose
 *    refcount already dropped to zero is treated as "not found";
 *  - once the reference is held the candidate is scored again; if it now
 *    scores lower than when it was picked, it is released and the lookup
 *    is restarted.
 *
 * Returns the socket with a reference held (caller must sock_put()), or
 * NULL.
 */
static struct sock *__udp6_lib_lookup(struct net *net,
				      struct in6_addr *saddr, __be16 sport,
				      struct in6_addr *daddr, __be16 dport,
				      int dif, struct udp_table *udptable)
{
	unsigned short hnum = ntohs(dport);
	unsigned int slot = udp_hashfn(net, hnum);
	struct udp_hslot *hslot = &udptable->hash[slot];
	struct hlist_nulls_node *pos;
	struct sock *cur, *best;
	int cur_score, best_score;

	rcu_read_lock();
restart:
	best = NULL;
	best_score = -1;
	sk_nulls_for_each_rcu(cur, pos, &hslot->head) {
		cur_score = compute_score(cur, net, hnum, saddr, sport,
					  daddr, dport, dif);
		if (cur_score > best_score) {
			best = cur;
			best_score = cur_score;
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup: we probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(pos) != slot)
		goto restart;

	if (best) {
		if (unlikely(!atomic_inc_not_zero(&best->sk_refcnt))) {
			/* Socket is already on its way out. */
			best = NULL;
		} else if (unlikely(compute_score(best, net, hnum, saddr, sport,
						  daddr, dport, dif) < best_score)) {
			/* Socket changed under us; drop it and retry. */
			sock_put(best);
			goto restart;
		}
	}
	rcu_read_unlock();

	return best;
}
2008-10-07 23:38:32 +04:00
/*
 * Find the receiving socket for @skb.
 *
 * If a socket is already attached to the skb (skb_steal_sock() returns
 * non-NULL) it is used directly; otherwise a regular table lookup is done
 * with the addresses from the IPv6 header, the given ports, the incoming
 * interface and the netns of the skb's dst device.
 */
static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
					  __be16 sport, __be16 dport,
					  struct udp_table *udptable)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);
	struct sock *sk = skb_steal_sock(skb);

	if (unlikely(sk))
		return sk;

	return __udp6_lib_lookup(dev_net(skb->dst->dev),
				 &iph->saddr, sport,
				 &iph->daddr, dport,
				 inet6_iif(skb), udptable);
}
2005-04-17 02:20:36 +04:00
/*
 * This should be easy: if there is something there we return it,
 * otherwise we block.
 */
2006-11-27 22:10:57 +03:00
/*
 * Receive one datagram from @sk into @msg.
 *
 * @len is the room available in the caller's iovec, @noblock adds
 * MSG_DONTWAIT to @flags for the dequeue, and *@addr_len (if non-NULL) is
 * set to sizeof(struct sockaddr_in6) up front.
 *
 * With MSG_ERRQUEUE the call is redirected to ipv6_recv_error().
 *
 * Returns the number of payload bytes copied, or the full datagram
 * payload length when MSG_TRUNC is set in @flags, or a negative error.
 * A datagram failing checksum verification is dropped and the receive is
 * retried, unless MSG_DONTWAIT is set in @flags, in which case -EAGAIN is
 * returned.
 */
int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
		  struct msghdr *msg, size_t len,
		  int noblock, int flags, int *addr_len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int is_udplite = IS_UDPLITE(sk);
	unsigned int ulen, copied;
	struct sk_buff *skb;
	int is_udp4;
	int peeked;
	int err;

	if (addr_len)
		*addr_len = sizeof(struct sockaddr_in6);

	if (flags & MSG_ERRQUEUE)
		return ipv6_recv_error(sk, msg, len);

try_again:
	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
				  &peeked, &err);
	if (!skb)
		goto out;

	/* Payload length, and how much of it fits in the user buffer. */
	ulen = skb->len - sizeof(struct udphdr);
	copied = len;
	if (copied > ulen)
		copied = ulen;
	else if (copied < ulen)
		msg->msg_flags |= MSG_TRUNC;

	/* The datagram is treated as IPv4 when skb->protocol is ETH_P_IP. */
	is_udp4 = (skb->protocol == htons(ETH_P_IP));

	/*
	 * If checksum is needed at all, try to do it while copying the
	 * data.  If the data is truncated, or if we only want a partial
	 * coverage checksum (UDP-Lite), do it before the copy.
	 */
	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
		if (udp_lib_checksum_complete(skb))
			goto csum_copy_err;
	}

	if (skb_csum_unnecessary(skb)) {
		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
					      msg->msg_iov, copied);
	} else {
		err = skb_copy_and_csum_datagram_iovec(skb,
						       sizeof(struct udphdr),
						       msg->msg_iov);
		if (err == -EINVAL)
			goto csum_copy_err;
	}
	if (err)
		goto out_free;

	/* Count the datagram only when __skb_recv_datagram left peeked clear. */
	if (!peeked) {
		if (is_udp4)
			UDP_INC_STATS_USER(sock_net(sk),
					   UDP_MIB_INDATAGRAMS, is_udplite);
		else
			UDP6_INC_STATS_USER(sock_net(sk),
					    UDP_MIB_INDATAGRAMS, is_udplite);
	}

	sock_recv_timestamp(msg, sk, skb);

	/* Copy the address. */
	if (msg->msg_name) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name;

		sin6->sin6_family = AF_INET6;
		sin6->sin6_port = udp_hdr(skb)->source;
		sin6->sin6_flowinfo = 0;
		sin6->sin6_scope_id = 0;

		if (is_udp4) {
			/* Report the IPv4 source as ::ffff:a.b.c.d */
			ipv6_addr_set(&sin6->sin6_addr, 0, 0,
				      htonl(0xffff), ip_hdr(skb)->saddr);
		} else {
			ipv6_addr_copy(&sin6->sin6_addr,
				       &ipv6_hdr(skb)->saddr);
			/* Link-local sources need the interface as scope. */
			if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
				sin6->sin6_scope_id = IP6CB(skb)->iif;
		}
	}

	/* Ancillary data, per address family of the datagram. */
	if (is_udp4) {
		if (inet->cmsg_flags)
			ip_cmsg_recv(msg, skb);
	} else {
		if (np->rxopt.all)
			datagram_recv_ctl(sk, msg, skb);
	}

	err = (flags & MSG_TRUNC) ? ulen : copied;

out_free:
	lock_sock(sk);
	skb_free_datagram(sk, skb);
	release_sock(sk);
out:
	return err;

csum_copy_err:
	lock_sock(sk);
	/* Account the error only when skb_kill_datagram() returns 0. */
	if (!skb_kill_datagram(sk, skb, flags)) {
		if (is_udp4)
			UDP_INC_STATS_USER(sock_net(sk),
					   UDP_MIB_INERRORS, is_udplite);
		else
			UDP6_INC_STATS_USER(sock_net(sk),
					    UDP_MIB_INERRORS, is_udplite);
	}
	release_sock(sk);

	if (flags & MSG_DONTWAIT)
		return -EAGAIN;
	goto try_again;
}
2006-11-27 22:10:57 +03:00
/*
 * ICMPv6 error handler shared by the users of @udptable.
 *
 * skb->data points at the IPv6 header that was returned inside the ICMPv6
 * error, and @offset locates the UDP header after it.  Because that
 * packet is one we sent, the lookup is done with source and destination
 * swapped.
 *
 * The error is reported to the socket when the socket has recverr set,
 * or when icmpv6_err_convert() returns non-zero for @type/@code and the
 * socket is in TCP_ESTABLISHED state.  With recverr set the error is also
 * queued through ipv6_icmp_error().
 */
void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
		    int type, int code, int offset, __be32 info,
		    struct udp_table *udptable)
{
	struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
	struct udphdr *uh = (struct udphdr *)(skb->data + offset);
	struct in6_addr *saddr = &hdr->saddr;
	struct in6_addr *daddr = &hdr->daddr;
	struct ipv6_pinfo *np;
	struct sock *sk;
	int harderr;
	int err;

	sk = __udp6_lib_lookup(dev_net(skb->dev), daddr, uh->dest,
			       saddr, uh->source, inet6_iif(skb), udptable);
	if (sk == NULL)
		return;

	np = inet6_sk(sk);

	/* Always convert: this is what fills in err. */
	harderr = icmpv6_err_convert(type, code, &err);

	if (!np->recverr &&
	    (!harderr || sk->sk_state != TCP_ESTABLISHED))
		goto out;

	if (np->recverr)
		ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info),
				(u8 *)(uh + 1));

	sk->sk_err = err;
	sk->sk_error_report(sk);
out:
	sock_put(sk);
}
2006-11-27 22:10:57 +03:00
/*
 * ICMPv6 error entry point for plain UDP: forwards every argument to
 * __udp6_lib_err() and selects the global udp_table.
 */
static __inline__ void udpv6_err(struct sk_buff *skb,
				 struct inet6_skb_parm *opt, int type,
				 int code, int offset, __be32 info)
{
	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
}
int udpv6_queue_rcv_skb ( struct sock * sk , struct sk_buff * skb )
2005-04-17 02:20:36 +04:00
{
2006-11-27 22:10:57 +03:00
struct udp_sock * up = udp_sk ( sk ) ;
2006-08-15 11:00:09 +04:00
int rc ;
2007-12-03 14:34:16 +03:00
int is_udplite = IS_UDPLITE ( sk ) ;
2006-08-15 11:00:09 +04:00
2006-11-27 22:10:57 +03:00
if ( ! xfrm6_policy_check ( sk , XFRM_POLICY_IN , skb ) )
goto drop ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
/*
* UDP - Lite specific tests , ignored on UDP sockets ( see net / ipv4 / udp . c ) .
*/
2007-12-03 14:34:16 +03:00
if ( ( is_udplite & UDPLITE_RECV_CC ) & & UDP_SKB_CB ( skb ) - > partial_cov ) {
2006-11-27 22:10:57 +03:00
if ( up - > pcrlen = = 0 ) { /* full coverage was set */
LIMIT_NETDEBUG ( KERN_WARNING " UDPLITE6: partial coverage "
" %d while full coverage %d requested \n " ,
UDP_SKB_CB ( skb ) - > cscov , skb - > len ) ;
goto drop ;
}
if ( UDP_SKB_CB ( skb ) - > cscov < up - > pcrlen ) {
LIMIT_NETDEBUG ( KERN_WARNING " UDPLITE6: coverage %d "
" too small, need min %d \n " ,
UDP_SKB_CB ( skb ) - > cscov , up - > pcrlen ) ;
goto drop ;
}
2005-04-17 02:20:36 +04:00
}
2007-03-07 07:29:58 +03:00
if ( sk - > sk_filter ) {
if ( udp_lib_checksum_complete ( skb ) )
goto drop ;
}
2006-11-27 22:10:57 +03:00
2006-08-15 11:00:09 +04:00
if ( ( rc = sock_queue_rcv_skb ( sk , skb ) ) < 0 ) {
/* Note that an ENOMEM error is charged twice */
2008-06-18 08:04:56 +04:00
if ( rc = = - ENOMEM ) {
2008-07-06 08:19:40 +04:00
UDP6_INC_STATS_BH ( sock_net ( sk ) ,
UDP_MIB_RCVBUFERRORS , is_udplite ) ;
2008-06-18 08:04:56 +04:00
atomic_inc ( & sk - > sk_drops ) ;
}
2006-11-27 22:10:57 +03:00
goto drop ;
2005-04-17 02:20:36 +04:00
}
2007-12-03 14:33:28 +03:00
2005-04-17 02:20:36 +04:00
return 0 ;
2006-11-27 22:10:57 +03:00
drop :
2008-07-06 08:19:40 +04:00
UDP6_INC_STATS_BH ( sock_net ( sk ) , UDP_MIB_INERRORS , is_udplite ) ;
2006-11-27 22:10:57 +03:00
kfree_skb ( skb ) ;
return - 1 ;
2005-04-17 02:20:36 +04:00
}
2008-11-02 07:22:23 +03:00
/*
 * Continue walking a hash chain, starting at @sk itself, and return the
 * first socket that should receive a multicast datagram described by
 * local/remote port and address and incoming interface @dif.
 *
 * A socket qualifies when it is in @net, hashed on the local port, of
 * family PF_INET6, none of its connected/bound attributes contradicts
 * the packet, and inet6_mc_check() accepts the (group, source) pair.
 *
 * Returns the matching socket or NULL when the chain is exhausted.  No
 * reference is taken.
 */
static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
				      __be16 loc_port, struct in6_addr *loc_addr,
				      __be16 rmt_port, struct in6_addr *rmt_addr,
				      int dif)
{
	unsigned short num = ntohs(loc_port);
	struct hlist_nulls_node *node;
	struct sock *s = sk;

	sk_nulls_for_each_from(s, node) {
		struct inet_sock *inet = inet_sk(s);
		struct ipv6_pinfo *np;

		if (!net_eq(sock_net(s), net))
			continue;
		if (s->sk_hash != num || s->sk_family != PF_INET6)
			continue;

		np = inet6_sk(s);

		/* Connected remote port, if any, must match. */
		if (inet->dport && inet->dport != rmt_port)
			continue;
		/* Connected remote address, if any, must match. */
		if (!ipv6_addr_any(&np->daddr) &&
		    !ipv6_addr_equal(&np->daddr, rmt_addr))
			continue;
		/* Bound device, if any, must be the receiving one. */
		if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)
			continue;
		/* Bound local address, if any, must match. */
		if (!ipv6_addr_any(&np->rcv_saddr) &&
		    !ipv6_addr_equal(&np->rcv_saddr, loc_addr))
			continue;
		if (!inet6_mc_check(s, loc_addr, rmt_addr))
			continue;

		return s;
	}
	return NULL;
}
/*
 * Note: called only from the BH handler context. The per-slot lock
 * (hslot->lock) is still taken while the chain is walked.
 */
2008-06-17 04:12:11 +04:00
static int __udp6_lib_mcast_deliver ( struct net * net , struct sk_buff * skb ,
struct in6_addr * saddr , struct in6_addr * daddr ,
2008-10-29 11:41:45 +03:00
struct udp_table * udptable )
2005-04-17 02:20:36 +04:00
{
struct sock * sk , * sk2 ;
2007-03-13 20:28:48 +03:00
const struct udphdr * uh = udp_hdr ( skb ) ;
2008-10-29 11:41:45 +03:00
struct udp_hslot * hslot = & udptable - > hash [ udp_hashfn ( net , ntohs ( uh - > dest ) ) ] ;
2005-04-17 02:20:36 +04:00
int dif ;
2008-10-29 11:41:45 +03:00
spin_lock ( & hslot - > lock ) ;
2008-11-17 06:39:21 +03:00
sk = sk_nulls_head ( & hslot - > head ) ;
2006-11-22 04:41:56 +03:00
dif = inet6_iif ( skb ) ;
2008-11-02 07:22:23 +03:00
sk = udp_v6_mcast_next ( net , sk , uh - > dest , daddr , uh - > source , saddr , dif ) ;
2005-04-17 02:20:36 +04:00
if ( ! sk ) {
kfree_skb ( skb ) ;
goto out ;
}
sk2 = sk ;
2008-11-17 06:39:21 +03:00
while ( ( sk2 = udp_v6_mcast_next ( net , sk_nulls_next ( sk2 ) , uh - > dest , daddr ,
2005-04-17 02:20:36 +04:00
uh - > source , saddr , dif ) ) ) {
struct sk_buff * buff = skb_clone ( skb , GFP_ATOMIC ) ;
2007-12-31 11:29:24 +03:00
if ( buff ) {
2008-08-09 11:35:05 +04:00
bh_lock_sock ( sk2 ) ;
2007-12-31 11:29:24 +03:00
if ( ! sock_owned_by_user ( sk2 ) )
udpv6_queue_rcv_skb ( sk2 , buff ) ;
else
sk_add_backlog ( sk2 , buff ) ;
bh_unlock_sock ( sk2 ) ;
}
2005-04-17 02:20:36 +04:00
}
2008-08-09 11:35:05 +04:00
bh_lock_sock ( sk ) ;
2007-12-31 11:29:24 +03:00
if ( ! sock_owned_by_user ( sk ) )
udpv6_queue_rcv_skb ( sk , skb ) ;
else
sk_add_backlog ( sk , skb ) ;
bh_unlock_sock ( sk ) ;
2005-04-17 02:20:36 +04:00
out :
2008-10-29 11:41:45 +03:00
spin_unlock ( & hslot - > lock ) ;
2006-11-27 22:10:57 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
2007-03-26 07:10:56 +04:00
static inline int udp6_csum_init ( struct sk_buff * skb , struct udphdr * uh ,
int proto )
2006-11-27 22:10:57 +03:00
{
2007-03-26 07:10:56 +04:00
int err ;
UDP_SKB_CB ( skb ) - > partial_cov = 0 ;
UDP_SKB_CB ( skb ) - > cscov = skb - > len ;
2008-03-07 03:22:02 +03:00
if ( proto = = IPPROTO_UDPLITE ) {
2007-03-26 07:10:56 +04:00
err = udplite_checksum_init ( skb , uh ) ;
if ( err )
return err ;
}
2006-11-27 22:10:57 +03:00
if ( uh - > check = = 0 ) {
/* RFC 2460 section 8.1 says that we SHOULD log
this error . Well , it is reasonable .
*/
LIMIT_NETDEBUG ( KERN_INFO " IPv6: udp checksum is 0 \n " ) ;
return 1 ;
}
if ( skb - > ip_summed = = CHECKSUM_COMPLETE & &
2007-04-26 04:54:47 +04:00
! csum_ipv6_magic ( & ipv6_hdr ( skb ) - > saddr , & ipv6_hdr ( skb ) - > daddr ,
2007-03-26 07:10:56 +04:00
skb - > len , proto , skb - > csum ) )
2006-11-27 22:10:57 +03:00
skb - > ip_summed = CHECKSUM_UNNECESSARY ;
2007-04-09 22:59:39 +04:00
if ( ! skb_csum_unnecessary ( skb ) )
2007-04-26 04:54:47 +04:00
skb - > csum = ~ csum_unfold ( csum_ipv6_magic ( & ipv6_hdr ( skb ) - > saddr ,
& ipv6_hdr ( skb ) - > daddr ,
2007-03-26 07:10:56 +04:00
skb - > len , proto , 0 ) ) ;
2006-11-27 22:10:57 +03:00
2007-03-26 07:10:56 +04:00
return 0 ;
2006-11-27 22:10:57 +03:00
}
2008-10-29 11:41:45 +03:00
int __udp6_lib_rcv ( struct sk_buff * skb , struct udp_table * udptable ,
2007-03-26 07:10:56 +04:00
int proto )
2005-04-17 02:20:36 +04:00
{
struct sock * sk ;
2007-02-09 17:24:49 +03:00
struct udphdr * uh ;
2005-04-17 02:20:36 +04:00
struct net_device * dev = skb - > dev ;
struct in6_addr * saddr , * daddr ;
u32 ulen = 0 ;
2008-07-06 08:19:40 +04:00
struct net * net = dev_net ( skb - > dev ) ;
2005-04-17 02:20:36 +04:00
if ( ! pskb_may_pull ( skb , sizeof ( struct udphdr ) ) )
goto short_packet ;
2007-04-26 04:54:47 +04:00
saddr = & ipv6_hdr ( skb ) - > saddr ;
daddr = & ipv6_hdr ( skb ) - > daddr ;
2007-03-13 20:28:48 +03:00
uh = udp_hdr ( skb ) ;
2005-04-17 02:20:36 +04:00
ulen = ntohs ( uh - > len ) ;
2006-11-27 22:10:57 +03:00
if ( ulen > skb - > len )
goto short_packet ;
2005-04-17 02:20:36 +04:00
2007-03-26 07:10:56 +04:00
if ( proto = = IPPROTO_UDP ) {
/* UDP validates ulen. */
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
/* Check for jumbo payload */
if ( ulen = = 0 )
ulen = skb - > len ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
if ( ulen < sizeof ( * uh ) )
goto short_packet ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
if ( ulen < skb - > len ) {
if ( pskb_trim_rcsum ( skb , ulen ) )
goto short_packet ;
2007-04-26 04:54:47 +04:00
saddr = & ipv6_hdr ( skb ) - > saddr ;
daddr = & ipv6_hdr ( skb ) - > daddr ;
2007-03-13 20:28:48 +03:00
uh = udp_hdr ( skb ) ;
2006-11-27 22:10:57 +03:00
}
}
2005-04-17 02:20:36 +04:00
2007-03-26 07:10:56 +04:00
if ( udp6_csum_init ( skb , uh , proto ) )
goto discard ;
2007-02-09 17:24:49 +03:00
/*
* Multicast receive code
2005-04-17 02:20:36 +04:00
*/
2006-11-27 22:10:57 +03:00
if ( ipv6_addr_is_multicast ( daddr ) )
2008-06-17 04:12:11 +04:00
return __udp6_lib_mcast_deliver ( net , skb ,
saddr , daddr , udptable ) ;
2005-04-17 02:20:36 +04:00
/* Unicast */
2007-02-09 17:24:49 +03:00
/*
2005-04-17 02:20:36 +04:00
* check socket cache . . . must talk to Alan about his plans
* for sock caches . . . i ' ll skip this for now .
*/
2008-10-07 23:38:32 +04:00
sk = __udp6_lib_lookup_skb ( skb , uh - > source , uh - > dest , udptable ) ;
2005-04-17 02:20:36 +04:00
if ( sk = = NULL ) {
if ( ! xfrm6_policy_check ( NULL , XFRM_POLICY_IN , skb ) )
goto discard ;
2006-11-27 22:10:57 +03:00
if ( udp_lib_checksum_complete ( skb ) )
2005-04-17 02:20:36 +04:00
goto discard ;
2008-07-06 08:19:40 +04:00
UDP6_INC_STATS_BH ( net , UDP_MIB_NOPORTS ,
proto = = IPPROTO_UDPLITE ) ;
2005-04-17 02:20:36 +04:00
icmpv6_send ( skb , ICMPV6_DEST_UNREACH , ICMPV6_PORT_UNREACH , 0 , dev ) ;
kfree_skb ( skb ) ;
2007-03-09 07:42:35 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
2007-02-09 17:24:49 +03:00
2005-04-17 02:20:36 +04:00
/* deliver */
2007-02-09 17:24:49 +03:00
2008-08-09 11:35:05 +04:00
bh_lock_sock ( sk ) ;
2007-12-31 11:29:24 +03:00
if ( ! sock_owned_by_user ( sk ) )
udpv6_queue_rcv_skb ( sk , skb ) ;
else
sk_add_backlog ( sk , skb ) ;
bh_unlock_sock ( sk ) ;
2005-04-17 02:20:36 +04:00
sock_put ( sk ) ;
2007-03-09 07:42:35 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:49 +03:00
short_packet :
2006-11-27 22:10:57 +03:00
LIMIT_NETDEBUG ( KERN_DEBUG " UDP%sv6: short packet: %d/%u \n " ,
2008-03-07 03:22:02 +03:00
proto = = IPPROTO_UDPLITE ? " -Lite " : " " ,
2007-03-26 07:10:56 +04:00
ulen , skb - > len ) ;
2005-04-17 02:20:36 +04:00
discard :
2008-07-06 08:19:40 +04:00
UDP6_INC_STATS_BH ( net , UDP_MIB_INERRORS , proto = = IPPROTO_UDPLITE ) ;
2005-04-17 02:20:36 +04:00
kfree_skb ( skb ) ;
2007-03-09 07:42:35 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
2006-11-27 22:10:57 +03:00
2007-10-15 23:50:28 +04:00
static __inline__ int udpv6_rcv ( struct sk_buff * skb )
2006-11-27 22:10:57 +03:00
{
2008-10-29 11:41:45 +03:00
return __udp6_lib_rcv ( skb , & udp_table , IPPROTO_UDP ) ;
2006-11-27 22:10:57 +03:00
}
2005-04-17 02:20:36 +04:00
/*
* Throw away all pending data and cancel the corking . Socket is locked .
*/
static void udp_v6_flush_pending_frames ( struct sock * sk )
{
struct udp_sock * up = udp_sk ( sk ) ;
2008-06-04 15:49:07 +04:00
if ( up - > pending = = AF_INET )
udp_flush_pending_frames ( sk ) ;
else if ( up - > pending ) {
2005-04-17 02:20:36 +04:00
up - > len = 0 ;
up - > pending = 0 ;
ip6_flush_pending_frames ( sk ) ;
2007-02-09 17:24:49 +03:00
}
2005-04-17 02:20:36 +04:00
}
/*
* Sending
*/
2006-11-27 20:29:59 +03:00
static int udp_v6_push_pending_frames ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
struct sk_buff * skb ;
struct udphdr * uh ;
2006-11-27 20:29:59 +03:00
struct udp_sock * up = udp_sk ( sk ) ;
2005-04-17 02:20:36 +04:00
struct inet_sock * inet = inet_sk ( sk ) ;
struct flowi * fl = & inet - > cork . fl ;
int err = 0 ;
2007-12-03 14:34:16 +03:00
int is_udplite = IS_UDPLITE ( sk ) ;
2006-11-15 08:35:48 +03:00
__wsum csum = 0 ;
2005-04-17 02:20:36 +04:00
/* Grab the skbuff where UDP header space exists. */
if ( ( skb = skb_peek ( & sk - > sk_write_queue ) ) = = NULL )
goto out ;
/*
* Create a UDP header
*/
2007-03-13 20:28:48 +03:00
uh = udp_hdr ( skb ) ;
2005-04-17 02:20:36 +04:00
uh - > source = fl - > fl_ip_sport ;
uh - > dest = fl - > fl_ip_dport ;
uh - > len = htons ( up - > len ) ;
uh - > check = 0 ;
2007-12-03 14:34:16 +03:00
if ( is_udplite )
2006-11-27 22:10:57 +03:00
csum = udplite_csum_outgoing ( sk , skb ) ;
else
csum = udp_csum_outgoing ( sk , skb ) ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
/* add protocol-dependent pseudo-header */
uh - > check = csum_ipv6_magic ( & fl - > fl6_src , & fl - > fl6_dst ,
up - > len , fl - > proto , csum ) ;
2005-04-17 02:20:36 +04:00
if ( uh - > check = = 0 )
2006-11-16 13:36:50 +03:00
uh - > check = CSUM_MANGLED_0 ;
2005-04-17 02:20:36 +04:00
err = ip6_push_pending_frames ( sk ) ;
out :
up - > len = 0 ;
up - > pending = 0 ;
2007-09-15 04:15:01 +04:00
if ( ! err )
2008-07-06 08:19:20 +04:00
UDP6_INC_STATS_USER ( sock_net ( sk ) ,
UDP_MIB_OUTDATAGRAMS , is_udplite ) ;
2005-04-17 02:20:36 +04:00
return err ;
}
2006-11-27 22:10:57 +03:00
/*
 * Send, or queue while corked, one UDP datagram on an IPv6 socket.
 *
 * iocb - handed unchanged to udp_sendmsg() on the IPv4 paths
 * sk   - sending socket
 * msg  - destination address (msg_name), payload, ancillary data, flags
 * len  - payload length in bytes
 *
 * Returns len on success, otherwise a negative errno.  AF_INET and
 * v4-mapped destinations, and sockets that already have AF_INET data
 * pending, return whatever udp_sendmsg() returns.
 */
int udpv6_sendmsg ( struct kiocb * iocb , struct sock * sk ,
2005-04-17 02:20:36 +04:00
struct msghdr * msg , size_t len )
{
struct ipv6_txoptions opt_space ;
struct udp_sock * up = udp_sk ( sk ) ;
struct inet_sock * inet = inet_sk ( sk ) ;
struct ipv6_pinfo * np = inet6_sk ( sk ) ;
struct sockaddr_in6 * sin6 = ( struct sockaddr_in6 * ) msg - > msg_name ;
struct in6_addr * daddr , * final_p = NULL , final ;
struct ipv6_txoptions * opt = NULL ;
struct ip6_flowlabel * flowlabel = NULL ;
2006-10-04 01:34:00 +04:00
struct flowi fl ;
2005-04-17 02:20:36 +04:00
struct dst_entry * dst ;
int addr_len = msg - > msg_namelen ;
int ulen = len ;
int hlimit = - 1 ;
2005-09-08 05:19:03 +04:00
int tclass = - 1 ;
2005-04-17 02:20:36 +04:00
/* Cork when the socket's corkflag is set or this call carries MSG_MORE. */
int corkreq = up - > corkflag | | msg - > msg_flags & MSG_MORE ;
int err ;
2005-09-18 11:30:08 +04:00
/* Set only for a send to the connected peer; gates route caching below. */
int connected = 0 ;
2007-12-03 14:34:16 +03:00
int is_udplite = IS_UDPLITE ( sk ) ;
2006-11-27 22:10:57 +03:00
int ( * getfrag ) ( void * , char * , int , int , int , struct sk_buff * ) ;
2005-04-17 02:20:36 +04:00
/* destination address check */
if ( sin6 ) {
if ( addr_len < offsetof ( struct sockaddr , sa_data ) )
return - EINVAL ;
switch ( sin6 - > sin6_family ) {
case AF_INET6 :
if ( addr_len < SIN6_LEN_RFC2133 )
return - EINVAL ;
daddr = & sin6 - > sin6_addr ;
break ;
/* IPv4 address on this socket: hand over to udp_sendmsg(). */
case AF_INET :
goto do_udp_sendmsg ;
/* AF_UNSPEC is treated as if no address had been supplied. */
case AF_UNSPEC :
msg - > msg_name = sin6 = NULL ;
msg - > msg_namelen = addr_len = 0 ;
daddr = NULL ;
break ;
default :
return - EINVAL ;
}
} else if ( ! up - > pending ) {
/* No address and nothing corked: only valid on a connected socket. */
if ( sk - > sk_state ! = TCP_ESTABLISHED )
return - EDESTADDRREQ ;
daddr = & np - > daddr ;
2007-02-09 17:24:49 +03:00
} else
2005-04-17 02:20:36 +04:00
/* No address but frames are pending: no destination to examine here. */
daddr = NULL ;
/*
 * A v4-mapped destination is rewritten as a sockaddr_in and sent by
 * udp_sendmsg(), unless the socket is IPv6-only.  The AF_INET case
 * above jumps straight to do_udp_sendmsg with the caller's msg_name.
 */
if ( daddr ) {
2007-08-25 10:16:08 +04:00
if ( ipv6_addr_v4mapped ( daddr ) ) {
2005-04-17 02:20:36 +04:00
struct sockaddr_in sin ;
sin . sin_family = AF_INET ;
sin . sin_port = sin6 ? sin6 - > sin6_port : inet - > dport ;
sin . sin_addr . s_addr = daddr - > s6_addr32 [ 3 ] ;
msg - > msg_name = & sin ;
msg - > msg_namelen = sizeof ( sin ) ;
do_udp_sendmsg :
if ( __ipv6_only_sock ( sk ) )
return - ENETUNREACH ;
return udp_sendmsg ( iocb , sk , msg , len ) ;
}
}
/* Frames already pending with family AF_INET: keep using udp_sendmsg(). */
if ( up - > pending = = AF_INET )
return udp_sendmsg ( iocb , sk , msg , len ) ;
/*
 * Rough check on arithmetic overflow; a better check is made in
 * ip6_append_data().  The length is also carried in an int (ulen and
 * the return value), so it must not be allowed to exceed INT_MAX once
 * the UDP header has been added.
 */
if ( len > INT_MAX - sizeof ( struct udphdr ) )
return - EMSGSIZE ;
2007-02-09 17:24:49 +03:00
2005-04-17 02:20:36 +04:00
if ( up - > pending ) {
/*
* There are pending frames .
* The socket lock must be held while it ' s corked .
*/
lock_sock ( sk ) ;
/* Re-check now that the lock is held. */
if ( likely ( up - > pending ) ) {
if ( unlikely ( up - > pending ! = AF_INET6 ) ) {
release_sock ( sk ) ;
return - EAFNOSUPPORT ;
}
/*
 * Append to the existing cork.  The lock taken above stays held and
 * is released after do_append_data.  NOTE(review): fl has not been
 * initialised on this path (the memset is further down) and opt,
 * hlimit and tclass still hold their defaults; presumably
 * ip6_append_data() ignores them while the queue is non-empty -
 * confirm.
 */
dst = NULL ;
goto do_append_data ;
}
release_sock ( sk ) ;
}
/* Not appending to an existing cork: the UDP header counts towards ulen. */
ulen + = sizeof ( struct udphdr ) ;
2006-10-04 01:34:00 +04:00
memset ( & fl , 0 , sizeof ( fl ) ) ;
2005-04-17 02:20:36 +04:00
/* Build the flow: explicit destination first, connected peer otherwise. */
if ( sin6 ) {
if ( sin6 - > sin6_port = = 0 )
return - EINVAL ;
2006-10-04 01:34:00 +04:00
fl . fl_ip_dport = sin6 - > sin6_port ;
2005-04-17 02:20:36 +04:00
daddr = & sin6 - > sin6_addr ;
/*
 * With sndflow set, a non-zero flow label in sin6_flowinfo must be
 * found by fl6_sock_lookup(); its destination then replaces daddr.
 */
if ( np - > sndflow ) {
2006-10-04 01:34:00 +04:00
fl . fl6_flowlabel = sin6 - > sin6_flowinfo & IPV6_FLOWINFO_MASK ;
if ( fl . fl6_flowlabel & IPV6_FLOWLABEL_MASK ) {
flowlabel = fl6_sock_lookup ( sk , fl . fl6_flowlabel ) ;
2005-04-17 02:20:36 +04:00
if ( flowlabel = = NULL )
return - EINVAL ;
daddr = & flowlabel - > dst ;
}
}
/*
* Otherwise it will be difficult to maintain
* sk - > sk_dst_cache .
*/
if ( sk - > sk_state = = TCP_ESTABLISHED & &
ipv6_addr_equal ( daddr , & np - > daddr ) )
daddr = & np - > daddr ;
/*
 * The scope id selects the output interface only for a link-local
 * destination, and only when the caller's sockaddr is long enough to
 * contain the field.
 */
if ( addr_len > = sizeof ( struct sockaddr_in6 ) & &
sin6 - > sin6_scope_id & &
ipv6_addr_type ( daddr ) & IPV6_ADDR_LINKLOCAL )
2006-10-04 01:34:00 +04:00
fl . oif = sin6 - > sin6_scope_id ;
2005-04-17 02:20:36 +04:00
} else {
/* No address: use the connected peer; only this path sets connected. */
if ( sk - > sk_state ! = TCP_ESTABLISHED )
return - EDESTADDRREQ ;
2006-10-04 01:34:00 +04:00
fl . fl_ip_dport = inet - > dport ;
2005-04-17 02:20:36 +04:00
daddr = & np - > daddr ;
2006-10-04 01:34:00 +04:00
fl . fl6_flowlabel = np - > flow_label ;
2005-09-18 11:30:08 +04:00
connected = 1 ;
2005-04-17 02:20:36 +04:00
}
2006-10-04 01:34:00 +04:00
/* Output interface fallbacks: bound device, then sticky pktinfo. */
if ( ! fl . oif )
fl . oif = sk - > sk_bound_dev_if ;
2005-04-17 02:20:36 +04:00
2008-12-16 13:08:29 +03:00
if ( ! fl . oif )
fl . oif = np - > sticky_pktinfo . ipi6_ifindex ;
2005-04-17 02:20:36 +04:00
/*
 * Ancillary data is parsed by datagram_send_ctl(), which is given fl,
 * the option buffer, hlimit and tclass to fill in.  Any control data
 * clears connected, so the route is not cached for this send.
 */
if ( msg - > msg_controllen ) {
opt = & opt_space ;
memset ( opt , 0 , sizeof ( struct ipv6_txoptions ) ) ;
opt - > tot_len = sizeof ( * opt ) ;
2008-06-04 08:02:49 +04:00
err = datagram_send_ctl ( sock_net ( sk ) , msg , & fl , opt , & hlimit , & tclass ) ;
2005-04-17 02:20:36 +04:00
if ( err < 0 ) {
fl6_sock_release ( flowlabel ) ;
return err ;
}
2006-10-04 01:34:00 +04:00
/* flowlabel is still NULL here, so the early return leaks nothing. */
if ( ( fl . fl6_flowlabel & IPV6_FLOWLABEL_MASK ) & & ! flowlabel ) {
flowlabel = fl6_sock_lookup ( sk , fl . fl6_flowlabel ) ;
2005-04-17 02:20:36 +04:00
if ( flowlabel = = NULL )
return - EINVAL ;
}
if ( ! ( opt - > opt_nflen | opt - > opt_flen ) )
opt = NULL ;
2005-09-18 11:30:08 +04:00
connected = 0 ;
2005-04-17 02:20:36 +04:00
}
/* Fall back to the socket's own options, then merge in the flow label's. */
if ( opt = = NULL )
opt = np - > opt ;
2005-11-20 06:23:18 +03:00
if ( flowlabel )
opt = fl6_merge_options ( & opt_space , flowlabel , opt ) ;
opt = ipv6_fixup_options ( & opt_space , opt ) ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
fl . proto = sk - > sk_protocol ;
2008-04-11 08:38:24 +04:00
if ( ! ipv6_addr_any ( daddr ) )
ipv6_addr_copy ( & fl . fl6_dst , daddr ) ;
else
fl . fl6_dst . s6_addr [ 15 ] = 0x1 ; /* :: means loopback (BSD'ism) */
2006-10-04 01:34:00 +04:00
/* Use the socket's source address unless fl already carries one. */
if ( ipv6_addr_any ( & fl . fl6_src ) & & ! ipv6_addr_any ( & np - > saddr ) )
ipv6_addr_copy ( & fl . fl6_src , & np - > saddr ) ;
fl . fl_ip_sport = inet - > sport ;
2007-02-09 17:24:49 +03:00
2005-04-17 02:20:36 +04:00
/* merge ip6_build_xmit from ip6_output */
/*
 * Routing header present: look the route up towards rt0->addr and
 * remember the real destination in final so it can be restored after
 * the lookup.
 */
if ( opt & & opt - > srcrt ) {
struct rt0_hdr * rt0 = ( struct rt0_hdr * ) opt - > srcrt ;
2006-10-04 01:34:00 +04:00
ipv6_addr_copy ( & final , & fl . fl6_dst ) ;
ipv6_addr_copy ( & fl . fl6_dst , rt0 - > addr ) ;
2005-04-17 02:20:36 +04:00
final_p = & final ;
2005-09-18 11:30:08 +04:00
connected = 0 ;
2005-04-17 02:20:36 +04:00
}
2006-10-04 01:34:00 +04:00
/* Multicast without an explicit interface uses the socket's mcast_oif. */
if ( ! fl . oif & & ipv6_addr_is_multicast ( & fl . fl6_dst ) ) {
fl . oif = np - > mcast_oif ;
2005-09-18 11:30:08 +04:00
connected = 0 ;
}
2005-04-17 02:20:36 +04:00
2006-10-04 01:34:00 +04:00
security_sk_classify_flow ( sk , & fl ) ;
2006-08-05 10:12:42 +04:00
2006-10-04 01:34:00 +04:00
err = ip6_sk_dst_lookup ( sk , & dst , & fl ) ;
2005-04-17 02:20:36 +04:00
if ( err )
goto out ;
if ( final_p )
2006-10-04 01:34:00 +04:00
ipv6_addr_copy ( & fl . fl6_dst , final_p ) ;
2005-04-17 02:20:36 +04:00
2008-11-26 04:35:18 +03:00
/* -EREMOTE from the xfrm lookup is turned into a blackhole route. */
err = __xfrm_lookup ( sock_net ( sk ) , & dst , & fl , sk , XFRM_LOOKUP_WAIT ) ;
if ( err < 0 ) {
2007-05-25 05:17:54 +04:00
if ( err = = - EREMOTE )
err = ip6_dst_blackhole ( sk , & dst , & fl ) ;
if ( err < 0 )
goto out ;
}
2005-04-17 02:20:36 +04:00
/*
 * Hop limit not set via control data: use the socket's multicast or
 * unicast setting, and if that is negative too, the route's.
 */
if ( hlimit < 0 ) {
2006-10-04 01:34:00 +04:00
if ( ipv6_addr_is_multicast ( & fl . fl6_dst ) )
2005-04-17 02:20:36 +04:00
hlimit = np - > mcast_hops ;
else
hlimit = np - > hop_limit ;
if ( hlimit < 0 )
2008-03-10 13:00:30 +03:00
hlimit = ip6_dst_hoplimit ( dst ) ;
2005-04-17 02:20:36 +04:00
}
2005-09-08 05:19:03 +04:00
/* Traffic class not set via control data: socket value, else 0. */
if ( tclass < 0 ) {
tclass = np - > tclass ;
if ( tclass < 0 )
tclass = 0 ;
}
2005-04-17 02:20:36 +04:00
/* MSG_CONFIRM: confirm the route first (do_confirm at the end). */
if ( msg - > msg_flags & MSG_CONFIRM )
goto do_confirm ;
back_from_confirm :
lock_sock ( sk ) ;
if ( unlikely ( up - > pending ) ) {
/* The socket is already corked while preparing it. */
/* ... which is an evident application bug. --ANK */
release_sock ( sk ) ;
2005-08-10 07:50:53 +04:00
LIMIT_NETDEBUG ( KERN_DEBUG " udp cork app bug 2 \n " ) ;
2005-04-17 02:20:36 +04:00
err = - EINVAL ;
goto out ;
}
/* Mark the cork as carrying IPv6 data. */
up - > pending = AF_INET6 ;
/*
 * Reached with the socket locked, either from just above or from the
 * "frames already pending" path near the top of the function.
 */
do_append_data :
up - > len + = ulen ;
2006-11-27 22:10:57 +03:00
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag ;
err = ip6_append_data ( sk , getfrag , msg - > msg_iov , ulen ,
2006-10-04 01:34:00 +04:00
sizeof ( struct udphdr ) , hlimit , tclass , opt , & fl ,
2005-09-08 05:19:03 +04:00
( struct rt6_info * ) dst ,
corkreq ? msg - > msg_flags | MSG_MORE : msg - > msg_flags ) ;
2005-04-17 02:20:36 +04:00
/*
 * On failure drop everything queued; when not corking, send now.
 * The last branch clears pending if corking left the queue empty.
 */
if ( err )
udp_v6_flush_pending_frames ( sk ) ;
else if ( ! corkreq )
2006-11-27 20:29:59 +03:00
err = udp_v6_push_pending_frames ( sk ) ;
2006-10-04 01:35:49 +04:00
else if ( unlikely ( skb_queue_empty ( & sk - > sk_write_queue ) ) )
up - > pending = 0 ;
2005-04-17 02:20:36 +04:00
2005-10-04 01:21:58 +04:00
/*
 * Store the route on the socket only for a plain connected send;
 * otherwise drop our reference.  dst is cleared afterwards so that the
 * dst_release() at "out" does not release it a second time.
 */
if ( dst ) {
if ( connected ) {
ip6_dst_store ( sk , dst ,
2006-10-04 01:34:00 +04:00
ipv6_addr_equal ( & fl . fl6_dst , & np - > daddr ) ?
2006-08-30 04:15:09 +04:00
& np - > daddr : NULL ,
# ifdef CONFIG_IPV6_SUBTREES
2006-10-04 01:34:00 +04:00
ipv6_addr_equal ( & fl . fl6_src , & np - > saddr ) ?
2006-08-30 04:15:09 +04:00
& np - > saddr :
# endif
NULL ) ;
2005-10-04 01:21:58 +04:00
} else {
dst_release ( dst ) ;
}
2008-06-03 20:30:25 +04:00
dst = NULL ;
2005-10-04 01:21:58 +04:00
}
2005-04-17 02:20:36 +04:00
/*
 * A positive result is reported through net_xmit_errno() only when
 * recverr is set; otherwise it is treated as success.
 */
if ( err > 0 )
err = np - > recverr ? net_xmit_errno ( err ) : 0 ;
release_sock ( sk ) ;
/* Also the target of error paths that never took the socket lock. */
out :
2008-06-03 20:30:25 +04:00
dst_release ( dst ) ;
2005-04-17 02:20:36 +04:00
fl6_sock_release ( flowlabel ) ;
2007-09-15 04:15:01 +04:00
if ( ! err )
2005-04-17 02:20:36 +04:00
return len ;
2006-08-15 11:00:09 +04:00
/*
* ENOBUFS = no kernel mem , SOCK_NOSPACE = no sndbuf space . Reporting
* ENOBUFS might not be good ( it ' s not tunable per se ) , but otherwise
* we don ' t have a good statistic ( IpOutDiscards but it can be too many
* things ) . We could add another new stat but at least for now that
* seems like overkill .
*/
if ( err = = - ENOBUFS | | test_bit ( SOCK_NOSPACE , & sk - > sk_socket - > flags ) ) {
2008-07-06 08:19:20 +04:00
UDP6_INC_STATS_USER ( sock_net ( sk ) ,
UDP_MIB_SNDBUFERRORS , is_udplite ) ;
2006-08-15 11:00:09 +04:00
}
2005-04-17 02:20:36 +04:00
return err ;
/*
 * MSG_CONFIRM handling.  A send with MSG_PROBE and no payload stops
 * here without transmitting; anything else resumes the normal path.
 */
do_confirm :
dst_confirm ( dst ) ;
if ( ! ( msg - > msg_flags & MSG_PROBE ) | | len )
goto back_from_confirm ;
err = 0 ;
goto out ;
}
2008-06-15 04:04:49 +04:00
/*
 * Socket teardown: with the socket locked, flush any frames still
 * pending on the cork, then run inet6_destroy_sock().
 */
void udpv6_destroy_sock(struct sock *sk)
{
	lock_sock(sk);
	udp_v6_flush_pending_frames(sk);
	release_sock(sk);

	inet6_destroy_sock(sk);
}
/*
* Socket option code for UDP
*/
2006-11-27 22:10:57 +03:00
/*
 * setsockopt() entry point.  Options at level SOL_UDP or SOL_UDPLITE go
 * to udp_lib_setsockopt(), which is given udp_v6_push_pending_frames()
 * as its push callback; every other level goes to ipv6_setsockopt().
 * Returns whatever the chosen handler returns.
 */
int udpv6_setsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return ipv6_setsockopt(sk, level, optname, optval, optlen);

	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
				  udp_v6_push_pending_frames);
}
# ifdef CONFIG_COMPAT
2006-11-27 22:10:57 +03:00
/*
 * Compat setsockopt() entry point.  SOL_UDP and SOL_UDPLITE options are
 * handled by udp_lib_setsockopt() with udp_v6_push_pending_frames() as
 * the push callback; all other levels go to compat_ipv6_setsockopt().
 * Returns whatever the chosen handler returns.
 */
int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return compat_ipv6_setsockopt(sk, level, optname,
					      optval, optlen);

	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
				  udp_v6_push_pending_frames);
}
# endif
2006-11-27 22:10:57 +03:00
/*
 * getsockopt() entry point.  Options at level SOL_UDP or SOL_UDPLITE go
 * to udp_lib_getsockopt(); every other level goes to ipv6_getsockopt().
 * Returns whatever the chosen handler returns.
 */
int udpv6_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return ipv6_getsockopt(sk, level, optname, optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
# ifdef CONFIG_COMPAT
2006-11-27 22:10:57 +03:00
/*
 * Compat getsockopt() entry point.  SOL_UDP and SOL_UDPLITE options are
 * handled by udp_lib_getsockopt(); all other levels go to
 * compat_ipv6_getsockopt().  Returns whatever the chosen handler returns.
 */
int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return compat_ipv6_getsockopt(sk, level, optname,
					      optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
# endif
2005-04-17 02:20:36 +04:00
/*
 * inet6 protocol hooks for UDP: udpv6_rcv() as the receive handler and
 * udpv6_err() as the error handler.  Registered and removed for
 * IPPROTO_UDP by udpv6_init() / udpv6_exit().
 */
static struct inet6_protocol udpv6_protocol = {
	.handler	= udpv6_rcv,
	.err_handler	= udpv6_err,
	.flags		= INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
};
/* ------------------------------------------------------------------------ */
# ifdef CONFIG_PROC_FS
/*
 * Print one socket as a single line of the udp6 seq_file.
 *
 * seq    - seq_file being written
 * sp     - socket to describe
 * bucket - value printed in the first column
 *
 * The line holds, in order: bucket, local address (rcv_saddr) and port,
 * remote address (daddr) and port, socket state, sk_wmem_alloc,
 * sk_rmem_alloc, three constant zeros, uid, one constant zero, inode,
 * reference count, the socket pointer and sk_drops.
 */
static void udp6_sock_seq_show ( struct seq_file * seq , struct sock * sp , int bucket )
{
struct inet_sock * inet = inet_sk ( sp ) ;
struct ipv6_pinfo * np = inet6_sk ( sp ) ;
struct in6_addr * dest , * src ;
__u16 destp , srcp ;
dest = & np - > daddr ;
src = & np - > rcv_saddr ;
/* Ports are run through ntohs() before being printed. */
destp = ntohs ( inet - > dport ) ;
srcp = ntohs ( inet - > sport ) ;
/* Addresses are printed as four raw s6_addr32 words each. */
seq_printf ( seq ,
" %4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
2008-06-18 08:04:56 +04:00
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d \n " ,
2005-04-17 02:20:36 +04:00
bucket ,
src - > s6_addr32 [ 0 ] , src - > s6_addr32 [ 1 ] ,
src - > s6_addr32 [ 2 ] , src - > s6_addr32 [ 3 ] , srcp ,
dest - > s6_addr32 [ 0 ] , dest - > s6_addr32 [ 1 ] ,
dest - > s6_addr32 [ 2 ] , dest - > s6_addr32 [ 3 ] , destp ,
2007-02-09 17:24:49 +03:00
sp - > sk_state ,
2005-04-17 02:20:36 +04:00
atomic_read ( & sp - > sk_wmem_alloc ) ,
atomic_read ( & sp - > sk_rmem_alloc ) ,
0 , 0L , 0 ,
sock_i_uid ( sp ) , 0 ,
sock_i_ino ( sp ) ,
2008-06-18 08:04:56 +04:00
atomic_read ( & sp - > sk_refcnt ) , sp ,
atomic_read ( & sp - > sk_drops ) ) ;
2005-04-17 02:20:36 +04:00
}
2006-11-27 22:10:57 +03:00
/*
 * seq_file show callback for the udp6 listing.
 *
 * seq - seq_file being written; seq->private is read as a
 *       struct udp_iter_state to obtain the current bucket
 * v   - SEQ_START_TOKEN for the header line, otherwise the entry that
 *       is handed to udp6_sock_seq_show() as the socket
 *
 * Always returns 0.
 */
int udp6_seq_show ( struct seq_file * seq , void * v )
2005-04-17 02:20:36 +04:00
{
/* The start token produces the column header instead of a socket line. */
if ( v = = SEQ_START_TOKEN )
seq_printf ( seq ,
" sl "
" local_address "
" remote_address "
" st tx_queue rx_queue tr tm->when retrnsmt "
2008-06-18 08:04:56 +04:00
" uid timeout inode ref pointer drops \n " ) ;
2005-04-17 02:20:36 +04:00
else
udp6_sock_seq_show ( seq , v , ( ( struct udp_iter_state * ) seq - > private ) - > bucket ) ;
return 0 ;
}
/*
 * Description of the udp6 seq_file passed to udp_proc_register() and
 * udp_proc_unregister(): address family AF_INET6, the shared udp_table,
 * and udp6_seq_show() as the show callback.
 */
static struct udp_seq_afinfo udp6_seq_afinfo = {
. name = " udp6 " ,
. family = AF_INET6 ,
2008-10-29 11:41:45 +03:00
. udp_table = & udp_table ,
2008-03-29 04:25:53 +03:00
. seq_fops = {
. owner = THIS_MODULE ,
} ,
2008-03-29 04:24:26 +03:00
. seq_ops = {
. show = udp6_seq_show ,
} ,
2005-04-17 02:20:36 +04:00
} ;
2008-03-21 14:14:17 +03:00
int udp6_proc_init ( struct net * net )
2005-04-17 02:20:36 +04:00
{
2008-03-21 14:14:17 +03:00
return udp_proc_register ( net , & udp6_seq_afinfo ) ;
2005-04-17 02:20:36 +04:00
}
2008-03-21 14:14:17 +03:00
/* Undo udp6_proc_init(): unregister udp6_seq_afinfo for @net. */
void udp6_proc_exit(struct net *net)
{
	udp_proc_unregister(net, &udp6_seq_afinfo);
}
# endif /* CONFIG_PROC_FS */
/* ------------------------------------------------------------------------ */
struct proto udpv6_prot = {
2006-03-21 09:48:35 +03:00
. name = " UDPv6 " ,
. owner = THIS_MODULE ,
2006-11-27 22:10:57 +03:00
. close = udp_lib_close ,
2006-03-21 09:48:35 +03:00
. connect = ip6_datagram_connect ,
. disconnect = udp_disconnect ,
. ioctl = udp_ioctl ,
. destroy = udpv6_destroy_sock ,
. setsockopt = udpv6_setsockopt ,
. getsockopt = udpv6_getsockopt ,
. sendmsg = udpv6_sendmsg ,
. recvmsg = udpv6_recvmsg ,
. backlog_rcv = udpv6_queue_rcv_skb ,
2006-11-27 22:10:57 +03:00
. hash = udp_lib_hash ,
. unhash = udp_lib_unhash ,
2006-03-21 09:48:35 +03:00
. get_port = udp_v6_get_port ,
2007-12-31 11:29:24 +03:00
. memory_allocated = & udp_memory_allocated ,
. sysctl_mem = sysctl_udp_mem ,
. sysctl_wmem = & sysctl_udp_wmem_min ,
. sysctl_rmem = & sysctl_udp_rmem_min ,
2006-03-21 09:48:35 +03:00
. obj_size = sizeof ( struct udp6_sock ) ,
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the begining. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we dont want to send same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
. slab_flags = SLAB_DESTROY_BY_RCU ,
2008-10-29 11:41:45 +03:00
. h . udp_table = & udp_table ,
2006-03-21 09:45:21 +03:00
# ifdef CONFIG_COMPAT
2006-03-21 09:48:35 +03:00
. compat_setsockopt = compat_udpv6_setsockopt ,
. compat_getsockopt = compat_udpv6_getsockopt ,
2006-03-21 09:45:21 +03:00
# endif
2005-04-17 02:20:36 +04:00
} ;
/*
 * Socket-layer switch entry tying SOCK_DGRAM / IPPROTO_UDP on inet6 to
 * udpv6_prot and inet6_dgram_ops.  Registered by udpv6_init() and
 * removed by udpv6_exit().
 */
static struct inet_protosw udpv6_protosw = {
	.type		= SOCK_DGRAM,
	.protocol	= IPPROTO_UDP,
	.prot		= &udpv6_prot,
	.ops		= &inet6_dgram_ops,
	.capability	= -1,
	.no_check	= UDP_CSUM_DEFAULT,
	.flags		= INET_PROTOSW_PERMANENT,
};
2007-12-11 13:25:35 +03:00
/*
 * Register UDP with the IPv6 stack: first the protocol handler for
 * IPPROTO_UDP, then the socket switch entry.  If the second step fails
 * the protocol handler is removed again.
 *
 * Returns 0 on success or the error from the step that failed.
 */
int __init udpv6_init(void)
{
	int ret;

	ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP);
	if (ret)
		return ret;

	ret = inet6_register_protosw(&udpv6_protosw);
	if (ret)
		inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);

	return ret;
}
2007-12-13 16:34:58 +03:00
void udpv6_exit ( void )
2007-12-11 13:25:35 +03:00
{
inet6_unregister_protosw ( & udpv6_protosw ) ;
inet6_del_protocol ( & udpv6_protocol , IPPROTO_UDP ) ;
2005-04-17 02:20:36 +04:00
}