2005-04-17 02:20:36 +04:00
/*
* UDP over IPv6
2007-02-09 17:24:49 +03:00
* Linux INET6 implementation
2005-04-17 02:20:36 +04:00
*
* Authors :
2007-02-09 17:24:49 +03:00
* Pedro Roque < roque @ di . fc . ul . pt >
2005-04-17 02:20:36 +04:00
*
* Based on linux / ipv4 / udp . c
*
* Fixes :
* Hideaki YOSHIFUJI : sin6_scope_id support
* YOSHIFUJI Hideaki @ USAGI and : Support IPV6_V6ONLY socket option , which
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
* a single port at the same time .
* Kazunori MIYAZAWA @ USAGI : change process style to use ip6_append_data
* YOSHIFUJI Hideaki @ USAGI : convert / proc / net / udp6 to seq_file .
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
# include <linux/errno.h>
# include <linux/types.h>
# include <linux/socket.h>
# include <linux/sockios.h>
# include <linux/net.h>
# include <linux/in6.h>
# include <linux/netdevice.h>
# include <linux/if_arp.h>
# include <linux/ipv6.h>
# include <linux/icmpv6.h>
# include <linux/init.h>
2007-12-11 22:30:32 +03:00
# include <linux/module.h>
2005-12-14 10:16:37 +03:00
# include <linux/skbuff.h>
2005-04-17 02:20:36 +04:00
# include <asm/uaccess.h>
# include <net/ndisc.h>
# include <net/protocol.h>
# include <net/transp_v6.h>
# include <net/ip6_route.h>
# include <net/raw.h>
2005-08-10 07:08:28 +04:00
# include <net/tcp_states.h>
2005-04-17 02:20:36 +04:00
# include <net/ip6_checksum.h>
# include <net/xfrm.h>
# include <linux/proc_fs.h>
# include <linux/seq_file.h>
2006-11-27 22:10:57 +03:00
# include "udp_impl.h"
2005-04-17 02:20:36 +04:00
2009-03-24 19:24:51 +03:00
int ipv6_rcv_saddr_equal ( const struct sock * sk , const struct sock * sk2 )
{
const struct in6_addr * sk_rcv_saddr6 = & inet6_sk ( sk ) - > rcv_saddr ;
const struct in6_addr * sk2_rcv_saddr6 = inet6_rcv_saddr ( sk2 ) ;
2009-10-15 10:30:45 +04:00
__be32 sk1_rcv_saddr = inet_sk ( sk ) - > inet_rcv_saddr ;
2009-04-09 21:37:33 +04:00
__be32 sk2_rcv_saddr = inet_rcv_saddr ( sk2 ) ;
2009-03-24 19:24:51 +03:00
int sk_ipv6only = ipv6_only_sock ( sk ) ;
int sk2_ipv6only = inet_v6_ipv6only ( sk2 ) ;
int addr_type = ipv6_addr_type ( sk_rcv_saddr6 ) ;
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type ( sk2_rcv_saddr6 ) : IPV6_ADDR_MAPPED ;
/* if both are mapped, treat as IPv4 */
if ( addr_type = = IPV6_ADDR_MAPPED & & addr_type2 = = IPV6_ADDR_MAPPED )
2009-04-09 21:37:33 +04:00
return ( ! sk2_ipv6only & &
2009-10-15 10:30:45 +04:00
( ! sk1_rcv_saddr | | ! sk2_rcv_saddr | |
sk1_rcv_saddr = = sk2_rcv_saddr ) ) ;
2009-03-24 19:24:51 +03:00
if ( addr_type2 = = IPV6_ADDR_ANY & &
! ( sk2_ipv6only & & addr_type = = IPV6_ADDR_MAPPED ) )
return 1 ;
if ( addr_type = = IPV6_ADDR_ANY & &
! ( sk_ipv6only & & addr_type2 = = IPV6_ADDR_MAPPED ) )
return 1 ;
if ( sk2_rcv_saddr6 & &
ipv6_addr_equal ( sk_rcv_saddr6 , sk2_rcv_saddr6 ) )
return 1 ;
return 0 ;
}
2009-11-08 13:17:30 +03:00
static unsigned int udp6_portaddr_hash ( struct net * net ,
const struct in6_addr * addr6 ,
unsigned int port )
{
unsigned int hash , mix = net_hash_mix ( net ) ;
if ( ipv6_addr_any ( addr6 ) )
hash = jhash_1word ( 0 , mix ) ;
2009-11-09 15:05:53 +03:00
else if ( ipv6_addr_v4mapped ( addr6 ) )
2009-11-08 13:17:30 +03:00
hash = jhash_1word ( addr6 - > s6_addr32 [ 3 ] , mix ) ;
else
hash = jhash2 ( addr6 - > s6_addr32 , 4 , mix ) ;
return hash ^ port ;
}
2008-03-23 02:51:21 +03:00
int udp_v6_get_port ( struct sock * sk , unsigned short snum )
2005-04-17 02:20:36 +04:00
{
2009-11-09 08:26:33 +03:00
unsigned int hash2_nulladdr =
udp6_portaddr_hash ( sock_net ( sk ) , & in6addr_any , snum ) ;
unsigned int hash2_partial =
udp6_portaddr_hash ( sock_net ( sk ) , & inet6_sk ( sk ) - > rcv_saddr , 0 ) ;
2009-11-08 13:17:30 +03:00
/* precompute partial secondary hash */
2009-11-09 08:26:33 +03:00
udp_sk ( sk ) - > udp_portaddr_hash = hash2_partial ;
return udp_lib_get_port ( sk , snum , ipv6_rcv_saddr_equal , hash2_nulladdr ) ;
2005-04-17 02:20:36 +04:00
}
2008-10-29 11:41:45 +03:00
/*
 * Rate how well socket @sk matches an incoming packet.
 *
 * Returns -1 when @sk cannot receive the packet: wrong namespace, wrong
 * local port, not a PF_INET6 socket, or any bound/connected attribute
 * that disagrees with the packet.  Otherwise returns the number of
 * attributes (peer port, local address, peer address, bound device)
 * that are set on the socket and match; 0 means a pure wildcard match.
 *
 * @dport is accepted for signature symmetry but is not examined here;
 * the local port is compared through @hnum.
 */
static inline int compute_score(struct sock *sk, struct net *net,
				unsigned short hnum,
				struct in6_addr *saddr, __be16 sport,
				struct in6_addr *daddr, __be16 dport,
				int dif)
{
	struct ipv6_pinfo *np;
	struct inet_sock *inet;
	int matched = 0;

	if (!net_eq(sock_net(sk), net) ||
	    udp_sk(sk)->udp_port_hash != hnum ||
	    sk->sk_family != PF_INET6)
		return -1;

	np = inet6_sk(sk);
	inet = inet_sk(sk);

	/* Connected to a peer port: the packet must come from it. */
	if (inet->inet_dport) {
		if (inet->inet_dport != sport)
			return -1;
		matched++;
	}

	/* Bound to a local address: the packet must be addressed to it. */
	if (!ipv6_addr_any(&np->rcv_saddr)) {
		if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
			return -1;
		matched++;
	}

	/* Connected to a peer address: the packet must come from it. */
	if (!ipv6_addr_any(&np->daddr)) {
		if (!ipv6_addr_equal(&np->daddr, saddr))
			return -1;
		matched++;
	}

	/* Bound to a device: the packet must have arrived on it. */
	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		matched++;
	}

	return matched;
}
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 13:18:30 +03:00
/* Highest score compute_score2() can return: one point per optional match. */
#define SCORE2_MAX (1 + 1 + 1)

/*
 * Rate how well socket @sk matches a packet during a secondary
 * (port, address) hash chain walk.
 *
 * Unlike compute_score(), the socket's receive address must be exactly
 * equal to @daddr; that match is mandatory and earns no point.  Returns
 * -1 on any mismatch, otherwise the number of matching optional
 * attributes (peer port, peer address, bound device), at most SCORE2_MAX.
 */
static inline int compute_score2(struct sock *sk, struct net *net,
				 const struct in6_addr *saddr, __be16 sport,
				 const struct in6_addr *daddr,
				 unsigned short hnum, int dif)
{
	struct ipv6_pinfo *np;
	struct inet_sock *inet;
	int matched = 0;

	if (!net_eq(sock_net(sk), net) ||
	    udp_sk(sk)->udp_port_hash != hnum ||
	    sk->sk_family != PF_INET6)
		return -1;

	np = inet6_sk(sk);
	inet = inet_sk(sk);

	/* Mandatory: receive address must equal the looked-up address. */
	if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
		return -1;

	if (inet->inet_dport) {
		if (inet->inet_dport != sport)
			return -1;
		matched++;
	}

	if (!ipv6_addr_any(&np->daddr)) {
		if (!ipv6_addr_equal(&np->daddr, saddr))
			return -1;
		matched++;
	}

	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		matched++;
	}

	return matched;
}
/*
 * Walk one secondary (port, address) hash chain and return the socket
 * with the highest compute_score2() score, or NULL if none matches.
 *
 * Must be called with rcu_read_lock() held.  On success a reference has
 * been taken on the returned socket (sk_refcnt incremented).
 */
static struct sock *udp6_lib_lookup2(struct net *net,
		const struct in6_addr *saddr, __be16 sport,
		const struct in6_addr *daddr, unsigned int hnum, int dif,
		struct udp_hslot *hslot2, unsigned int slot2)
{
	struct sock *sk, *best;
	struct hlist_nulls_node *node;
	int score, best_score;

restart:
	best = NULL;
	best_score = -1;
	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
		score = compute_score2(sk, net, saddr, sport,
				       daddr, hnum, dif);
		if (score > best_score) {
			best = sk;
			best_score = score;
			/* Nothing can beat a full score: stop walking. */
			if (score == SCORE2_MAX)
				goto take_ref;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot2)
		goto restart;

	if (!best)
		return NULL;

take_ref:
	/* The socket may be on its way out: refcount already zero. */
	if (unlikely(!atomic_inc_not_zero(&best->sk_refcnt)))
		return NULL;

	/*
	 * Re-score now that the reference is held; if the socket no
	 * longer matches as well as it did, drop it and search again.
	 */
	if (unlikely(compute_score2(best, net, saddr, sport,
				    daddr, hnum, dif) < best_score)) {
		sock_put(best);
		goto restart;
	}

	return best;
}
2008-01-31 16:07:57 +03:00
/*
 * Find the UDPv6 socket that best matches a packet.
 *
 * @net:      network namespace to search in
 * @saddr:    packet source address
 * @sport:    packet source port (network byte order)
 * @daddr:    packet destination address
 * @dport:    packet destination port (network byte order)
 * @dif:      index of the interface the packet arrived on
 * @udptable: hash table to search
 *
 * The primary (port) hash chain is located first.  If it holds more than
 * 10 sockets, the secondary (port, address) chain for @daddr is tried
 * instead, provided it is not longer than the primary chain; if that
 * finds nothing, the (port, in6addr_any) chain is tried so that sockets
 * not bound to a particular address are still found.  Whenever a
 * secondary chain turns out to be longer than the primary one, the
 * primary chain is walked instead.
 *
 * The whole lookup runs under rcu_read_lock().  Returns NULL when no
 * socket matches; otherwise the returned socket has had its sk_refcnt
 * incremented and the caller must drop it with sock_put().
 */
static struct sock *__udp6_lib_lookup(struct net *net,
				      struct in6_addr *saddr, __be16 sport,
				      struct in6_addr *daddr, __be16 dport,
				      int dif, struct udp_table *udptable)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned short hnum = ntohs(dport);
	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
	int score, badness;

	rcu_read_lock();
	if (hslot->count > 10) {
		hash2 = udp6_portaddr_hash(net, daddr, hnum);
		slot2 = hash2 & udptable->mask;
		hslot2 = &udptable->hash2[slot2];
		if (hslot->count < hslot2->count)
			goto begin;

		result = udp6_lib_lookup2(net, saddr, sport,
					  daddr, hnum, dif,
					  hslot2, slot2);
		if (!result) {
			hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
			slot2 = hash2 & udptable->mask;
			hslot2 = &udptable->hash2[slot2];
			if (hslot->count < hslot2->count)
				goto begin;

			/*
			 * This chain holds sockets whose receive address is
			 * the wildcard, so the wildcard must be passed as the
			 * destination address to match (compute_score2()
			 * requires rcv_saddr == daddr).  The real source
			 * address is still needed to match connected sockets.
			 */
			result = udp6_lib_lookup2(net, saddr, sport,
						  &in6addr_any, hnum, dif,
						  hslot2, slot2);
		}
		rcu_read_unlock();
		return result;
	}
begin:
	result = NULL;
	badness = -1;
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
		if (score > badness) {
			result = sk;
			badness = score;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;

	if (result) {
		/* Socket is being freed: treat it as not found. */
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		/*
		 * Re-check the score with the reference held; the socket
		 * may have been reused for something else in the meantime.
		 */
		else if (unlikely(compute_score(result, net, hnum, saddr, sport,
						daddr, dport, dif) < badness)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
2008-10-07 23:38:32 +04:00
/*
 * Find the receiving socket for @skb.
 *
 * If skb_steal_sock() yields a socket, that socket is returned directly.
 * Otherwise a regular __udp6_lib_lookup() is performed using the
 * addresses from the IPv6 header, the given ports, the namespace of the
 * skb's dst device and the incoming interface index.
 */
static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
					  __be16 sport, __be16 dport,
					  struct udp_table *udptable)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);
	struct sock *sk = skb_steal_sock(skb);

	if (unlikely(sk))
		return sk;

	return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev),
				 &iph->saddr, sport,
				 &iph->daddr, dport,
				 inet6_iif(skb), udptable);
}
2005-04-17 02:20:36 +04:00
/*
* This should be easy , if there is something there we
* return it , otherwise we block .
*/
2006-11-27 22:10:57 +03:00
int udpv6_recvmsg ( struct kiocb * iocb , struct sock * sk ,
2005-04-17 02:20:36 +04:00
struct msghdr * msg , size_t len ,
int noblock , int flags , int * addr_len )
{
struct ipv6_pinfo * np = inet6_sk ( sk ) ;
struct inet_sock * inet = inet_sk ( sk ) ;
2007-02-09 17:24:49 +03:00
struct sk_buff * skb ;
2007-03-26 07:10:56 +04:00
unsigned int ulen , copied ;
2007-12-05 12:53:40 +03:00
int peeked ;
2007-03-26 07:10:56 +04:00
int err ;
int is_udplite = IS_UDPLITE ( sk ) ;
2008-11-02 19:11:01 +03:00
int is_udp4 ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:49 +03:00
if ( addr_len )
* addr_len = sizeof ( struct sockaddr_in6 ) ;
2005-04-17 02:20:36 +04:00
if ( flags & MSG_ERRQUEUE )
return ipv6_recv_error ( sk , msg , len ) ;
try_again :
2007-12-05 12:53:40 +03:00
skb = __skb_recv_datagram ( sk , flags | ( noblock ? MSG_DONTWAIT : 0 ) ,
& peeked , & err ) ;
2005-04-17 02:20:36 +04:00
if ( ! skb )
goto out ;
2007-03-26 07:10:56 +04:00
ulen = skb - > len - sizeof ( struct udphdr ) ;
copied = len ;
if ( copied > ulen )
copied = ulen ;
else if ( copied < ulen )
2007-02-09 17:24:49 +03:00
msg - > msg_flags | = MSG_TRUNC ;
2005-04-17 02:20:36 +04:00
2008-11-02 19:11:01 +03:00
is_udp4 = ( skb - > protocol = = htons ( ETH_P_IP ) ) ;
2006-11-27 22:10:57 +03:00
/*
2007-03-26 07:10:56 +04:00
* If checksum is needed at all , try to do it while copying the
* data . If the data is truncated , or if we only want a partial
* coverage checksum ( UDP - Lite ) , do it before the copy .
2006-11-27 22:10:57 +03:00
*/
2007-03-26 07:10:56 +04:00
if ( copied < ulen | | UDP_SKB_CB ( skb ) - > partial_cov ) {
if ( udp_lib_checksum_complete ( skb ) )
2005-04-17 02:20:36 +04:00
goto csum_copy_err ;
2006-11-27 22:10:57 +03:00
}
2007-04-09 22:59:39 +04:00
if ( skb_csum_unnecessary ( skb ) )
2006-11-27 22:10:57 +03:00
err = skb_copy_datagram_iovec ( skb , sizeof ( struct udphdr ) ,
msg - > msg_iov , copied ) ;
else {
2005-04-17 02:20:36 +04:00
err = skb_copy_and_csum_datagram_iovec ( skb , sizeof ( struct udphdr ) , msg - > msg_iov ) ;
if ( err = = - EINVAL )
goto csum_copy_err ;
}
if ( err )
goto out_free ;
2008-11-02 19:11:01 +03:00
if ( ! peeked ) {
if ( is_udp4 )
UDP_INC_STATS_USER ( sock_net ( sk ) ,
UDP_MIB_INDATAGRAMS , is_udplite ) ;
else
UDP6_INC_STATS_USER ( sock_net ( sk ) ,
UDP_MIB_INDATAGRAMS , is_udplite ) ;
}
2007-12-03 14:33:28 +03:00
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
sock_recv_ts_and_drops ( msg , sk , skb ) ;
2005-04-17 02:20:36 +04:00
/* Copy the address. */
if ( msg - > msg_name ) {
struct sockaddr_in6 * sin6 ;
2007-02-09 17:24:49 +03:00
2005-04-17 02:20:36 +04:00
sin6 = ( struct sockaddr_in6 * ) msg - > msg_name ;
sin6 - > sin6_family = AF_INET6 ;
2007-03-13 20:28:48 +03:00
sin6 - > sin6_port = udp_hdr ( skb ) - > source ;
2005-04-17 02:20:36 +04:00
sin6 - > sin6_flowinfo = 0 ;
sin6 - > sin6_scope_id = 0 ;
2008-11-02 19:11:01 +03:00
if ( is_udp4 )
2009-10-08 00:58:25 +04:00
ipv6_addr_set_v4mapped ( ip_hdr ( skb ) - > saddr ,
& sin6 - > sin6_addr ) ;
2005-04-17 02:20:36 +04:00
else {
2007-04-26 04:54:47 +04:00
ipv6_addr_copy ( & sin6 - > sin6_addr ,
& ipv6_hdr ( skb ) - > saddr ) ;
2005-04-17 02:20:36 +04:00
if ( ipv6_addr_type ( & sin6 - > sin6_addr ) & IPV6_ADDR_LINKLOCAL )
sin6 - > sin6_scope_id = IP6CB ( skb ) - > iif ;
}
}
2008-11-02 19:11:01 +03:00
if ( is_udp4 ) {
2005-04-17 02:20:36 +04:00
if ( inet - > cmsg_flags )
ip_cmsg_recv ( msg , skb ) ;
} else {
if ( np - > rxopt . all )
datagram_recv_ctl ( sk , msg , skb ) ;
2007-02-09 17:24:49 +03:00
}
2005-04-17 02:20:36 +04:00
err = copied ;
if ( flags & MSG_TRUNC )
2007-03-26 07:10:56 +04:00
err = ulen ;
2005-04-17 02:20:36 +04:00
out_free :
2009-10-30 08:03:53 +03:00
skb_free_datagram_locked ( sk , skb ) ;
2005-04-17 02:20:36 +04:00
out :
return err ;
csum_copy_err :
2007-12-31 11:29:24 +03:00
lock_sock ( sk ) ;
2008-11-02 19:14:27 +03:00
if ( ! skb_kill_datagram ( sk , skb , flags ) ) {
if ( is_udp4 )
UDP_INC_STATS_USER ( sock_net ( sk ) ,
UDP_MIB_INERRORS , is_udplite ) ;
else
UDP6_INC_STATS_USER ( sock_net ( sk ) ,
UDP_MIB_INERRORS , is_udplite ) ;
}
2007-12-31 11:29:24 +03:00
release_sock ( sk ) ;
2005-04-17 02:20:36 +04:00
2007-11-06 08:29:17 +03:00
if ( flags & MSG_DONTWAIT )
2005-04-17 02:20:36 +04:00
return - EAGAIN ;
goto try_again ;
}
2006-11-27 22:10:57 +03:00
/*
 * Handle an ICMPv6 error that refers to a UDP datagram.
 *
 * @skb->data points at the IPv6 header quoted in the error and @offset
 * is the offset of the quoted UDP header from it.  The socket is looked
 * up with the quoted addresses and ports swapped relative to a normal
 * receive lookup.  If one is found, the error is queued on its error
 * queue when the socket has recverr enabled, and sk_err is set and
 * reported.  Without recverr, the error is only reported when
 * icmpv6_err_convert() returns non-zero and the socket is in
 * TCP_ESTABLISHED state.
 */
void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
		    u8 type, u8 code, int offset, __be32 info,
		    struct udp_table *udptable)
{
	struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
	struct udphdr *uh = (struct udphdr *)(skb->data + offset);
	struct in6_addr *saddr = &hdr->saddr;
	struct in6_addr *daddr = &hdr->daddr;
	struct ipv6_pinfo *np;
	struct sock *sk;
	int converted;
	int err;

	sk = __udp6_lib_lookup(dev_net(skb->dev), daddr, uh->dest,
			       saddr, uh->source, inet6_iif(skb), udptable);
	if (sk == NULL)
		return;

	np = inet6_sk(sk);

	/* Always run the conversion: it is what fills in err. */
	converted = icmpv6_err_convert(type, code, &err);

	if (np->recverr) {
		ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info),
				(u8 *)(uh + 1));
	} else if (!converted || sk->sk_state != TCP_ESTABLISHED) {
		goto out;
	}

	sk->sk_err = err;
	sk->sk_error_report(sk);
out:
	/* Drop the reference taken by the lookup. */
	sock_put(sk);
}
2006-11-27 22:10:57 +03:00
/*
 * ICMPv6 error handler for plain UDP: forwards to __udp6_lib_err()
 * using the global udp_table.
 */
static __inline__ void udpv6_err(struct sk_buff *skb,
				 struct inet6_skb_parm *opt,
				 u8 type, u8 code,
				 int offset, __be32 info)
{
	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
}
int udpv6_queue_rcv_skb ( struct sock * sk , struct sk_buff * skb )
2005-04-17 02:20:36 +04:00
{
2006-11-27 22:10:57 +03:00
struct udp_sock * up = udp_sk ( sk ) ;
2006-08-15 11:00:09 +04:00
int rc ;
2007-12-03 14:34:16 +03:00
int is_udplite = IS_UDPLITE ( sk ) ;
2006-08-15 11:00:09 +04:00
2006-11-27 22:10:57 +03:00
if ( ! xfrm6_policy_check ( sk , XFRM_POLICY_IN , skb ) )
goto drop ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
/*
* UDP - Lite specific tests , ignored on UDP sockets ( see net / ipv4 / udp . c ) .
*/
2007-12-03 14:34:16 +03:00
if ( ( is_udplite & UDPLITE_RECV_CC ) & & UDP_SKB_CB ( skb ) - > partial_cov ) {
2006-11-27 22:10:57 +03:00
if ( up - > pcrlen = = 0 ) { /* full coverage was set */
LIMIT_NETDEBUG ( KERN_WARNING " UDPLITE6: partial coverage "
" %d while full coverage %d requested \n " ,
UDP_SKB_CB ( skb ) - > cscov , skb - > len ) ;
goto drop ;
}
if ( UDP_SKB_CB ( skb ) - > cscov < up - > pcrlen ) {
LIMIT_NETDEBUG ( KERN_WARNING " UDPLITE6: coverage %d "
" too small, need min %d \n " ,
UDP_SKB_CB ( skb ) - > cscov , up - > pcrlen ) ;
goto drop ;
}
2005-04-17 02:20:36 +04:00
}
2007-03-07 07:29:58 +03:00
if ( sk - > sk_filter ) {
if ( udp_lib_checksum_complete ( skb ) )
goto drop ;
}
2006-11-27 22:10:57 +03:00
2009-10-15 07:40:11 +04:00
if ( ( rc = sock_queue_rcv_skb ( sk , skb ) ) < 0 ) {
2006-08-15 11:00:09 +04:00
/* Note that an ENOMEM error is charged twice */
2009-10-15 07:40:11 +04:00
if ( rc = = - ENOMEM )
2008-07-06 08:19:40 +04:00
UDP6_INC_STATS_BH ( sock_net ( sk ) ,
UDP_MIB_RCVBUFERRORS , is_udplite ) ;
2009-10-15 04:12:40 +04:00
goto drop_no_sk_drops_inc ;
2005-04-17 02:20:36 +04:00
}
2007-12-03 14:33:28 +03:00
2005-04-17 02:20:36 +04:00
return 0 ;
2006-11-27 22:10:57 +03:00
drop :
2009-10-15 04:12:40 +04:00
atomic_inc ( & sk - > sk_drops ) ;
drop_no_sk_drops_inc :
2008-07-06 08:19:40 +04:00
UDP6_INC_STATS_BH ( sock_net ( sk ) , UDP_MIB_INERRORS , is_udplite ) ;
2006-11-27 22:10:57 +03:00
kfree_skb ( skb ) ;
return - 1 ;
2005-04-17 02:20:36 +04:00
}
2008-11-02 07:22:23 +03:00
/*
 * Starting at @sk, walk the rest of the hash chain and return the first
 * IPv6 socket that should receive a multicast datagram addressed to
 * loc_addr/loc_port from rmt_addr/rmt_port on interface @dif, or NULL if
 * the chain holds no further match.
 */
static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
				      __be16 loc_port, struct in6_addr *loc_addr,
				      __be16 rmt_port, struct in6_addr *rmt_addr,
				      int dif)
{
	struct hlist_nulls_node *node;
	struct sock *s = sk;
	unsigned short hnum = ntohs(loc_port);

	sk_nulls_for_each_from(s, node) {
		struct inet_sock *inet = inet_sk(s);
		struct ipv6_pinfo *np;

		if (!net_eq(sock_net(s), net))
			continue;

		if (udp_sk(s)->udp_port_hash != hnum ||
		    s->sk_family != PF_INET6)
			continue;

		np = inet6_sk(s);

		/* A set remote port or address must match the sender. */
		if (inet->inet_dport && inet->inet_dport != rmt_port)
			continue;
		if (!ipv6_addr_any(&np->daddr) &&
		    !ipv6_addr_equal(&np->daddr, rmt_addr))
			continue;

		/* A device binding must match the receiving interface. */
		if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)
			continue;

		/* A specific bound local address must match the destination. */
		if (!ipv6_addr_any(&np->rcv_saddr) &&
		    !ipv6_addr_equal(&np->rcv_saddr, loc_addr))
			continue;

		if (!inet6_mc_check(s, loc_addr, rmt_addr))
			continue;

		return s;
	}
	return NULL;
}
2009-11-08 13:18:52 +03:00
static void flush_stack ( struct sock * * stack , unsigned int count ,
struct sk_buff * skb , unsigned int final )
{
unsigned int i ;
struct sock * sk ;
struct sk_buff * skb1 ;
for ( i = 0 ; i < count ; i + + ) {
skb1 = ( i = = final ) ? skb : skb_clone ( skb , GFP_ATOMIC ) ;
2009-11-08 13:20:19 +03:00
sk = stack [ i ] ;
2009-11-08 13:18:52 +03:00
if ( skb1 ) {
bh_lock_sock ( sk ) ;
if ( ! sock_owned_by_user ( sk ) )
udpv6_queue_rcv_skb ( sk , skb1 ) ;
else
sk_add_backlog ( sk , skb1 ) ;
bh_unlock_sock ( sk ) ;
2009-11-08 13:20:19 +03:00
} else {
atomic_inc ( & sk - > sk_drops ) ;
UDP6_INC_STATS_BH ( sock_net ( sk ) ,
UDP_MIB_RCVBUFERRORS , IS_UDPLITE ( sk ) ) ;
UDP6_INC_STATS_BH ( sock_net ( sk ) ,
UDP_MIB_INERRORS , IS_UDPLITE ( sk ) ) ;
2009-11-08 13:18:52 +03:00
}
}
}
2005-04-17 02:20:36 +04:00
/*
* Note : called only from the BH handler context ,
* so we don ' t need to lock the hashes .
*/
2008-06-17 04:12:11 +04:00
static int __udp6_lib_mcast_deliver ( struct net * net , struct sk_buff * skb ,
struct in6_addr * saddr , struct in6_addr * daddr ,
2008-10-29 11:41:45 +03:00
struct udp_table * udptable )
2005-04-17 02:20:36 +04:00
{
2009-11-08 13:18:52 +03:00
struct sock * sk , * stack [ 256 / sizeof ( struct sock * ) ] ;
2007-03-13 20:28:48 +03:00
const struct udphdr * uh = udp_hdr ( skb ) ;
2009-10-07 04:37:59 +04:00
struct udp_hslot * hslot = udp_hashslot ( udptable , net , ntohs ( uh - > dest ) ) ;
2005-04-17 02:20:36 +04:00
int dif ;
2009-11-08 13:18:52 +03:00
unsigned int i , count = 0 ;
2005-04-17 02:20:36 +04:00
2008-10-29 11:41:45 +03:00
spin_lock ( & hslot - > lock ) ;
2008-11-17 06:39:21 +03:00
sk = sk_nulls_head ( & hslot - > head ) ;
2006-11-22 04:41:56 +03:00
dif = inet6_iif ( skb ) ;
2008-11-02 07:22:23 +03:00
sk = udp_v6_mcast_next ( net , sk , uh - > dest , daddr , uh - > source , saddr , dif ) ;
2009-11-08 13:18:52 +03:00
while ( sk ) {
stack [ count + + ] = sk ;
sk = udp_v6_mcast_next ( net , sk_nulls_next ( sk ) , uh - > dest , daddr ,
uh - > source , saddr , dif ) ;
if ( unlikely ( count = = ARRAY_SIZE ( stack ) ) ) {
if ( ! sk )
break ;
flush_stack ( stack , count , skb , ~ 0 ) ;
count = 0 ;
2007-12-31 11:29:24 +03:00
}
2005-04-17 02:20:36 +04:00
}
2009-11-08 13:18:52 +03:00
/*
* before releasing the lock , we must take reference on sockets
*/
for ( i = 0 ; i < count ; i + + )
sock_hold ( stack [ i ] ) ;
2008-10-29 11:41:45 +03:00
spin_unlock ( & hslot - > lock ) ;
2009-11-08 13:18:52 +03:00
if ( count ) {
flush_stack ( stack , count , skb , count - 1 ) ;
for ( i = 0 ; i < count ; i + + )
sock_put ( stack [ i ] ) ;
} else {
kfree_skb ( skb ) ;
}
2006-11-27 22:10:57 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
2007-03-26 07:10:56 +04:00
static inline int udp6_csum_init ( struct sk_buff * skb , struct udphdr * uh ,
int proto )
2006-11-27 22:10:57 +03:00
{
2007-03-26 07:10:56 +04:00
int err ;
UDP_SKB_CB ( skb ) - > partial_cov = 0 ;
UDP_SKB_CB ( skb ) - > cscov = skb - > len ;
2008-03-07 03:22:02 +03:00
if ( proto = = IPPROTO_UDPLITE ) {
2007-03-26 07:10:56 +04:00
err = udplite_checksum_init ( skb , uh ) ;
if ( err )
return err ;
}
2006-11-27 22:10:57 +03:00
if ( uh - > check = = 0 ) {
/* RFC 2460 section 8.1 says that we SHOULD log
this error . Well , it is reasonable .
*/
LIMIT_NETDEBUG ( KERN_INFO " IPv6: udp checksum is 0 \n " ) ;
return 1 ;
}
if ( skb - > ip_summed = = CHECKSUM_COMPLETE & &
2007-04-26 04:54:47 +04:00
! csum_ipv6_magic ( & ipv6_hdr ( skb ) - > saddr , & ipv6_hdr ( skb ) - > daddr ,
2007-03-26 07:10:56 +04:00
skb - > len , proto , skb - > csum ) )
2006-11-27 22:10:57 +03:00
skb - > ip_summed = CHECKSUM_UNNECESSARY ;
2007-04-09 22:59:39 +04:00
if ( ! skb_csum_unnecessary ( skb ) )
2007-04-26 04:54:47 +04:00
skb - > csum = ~ csum_unfold ( csum_ipv6_magic ( & ipv6_hdr ( skb ) - > saddr ,
& ipv6_hdr ( skb ) - > daddr ,
2007-03-26 07:10:56 +04:00
skb - > len , proto , 0 ) ) ;
2006-11-27 22:10:57 +03:00
2007-03-26 07:10:56 +04:00
return 0 ;
2006-11-27 22:10:57 +03:00
}
2008-10-29 11:41:45 +03:00
int __udp6_lib_rcv ( struct sk_buff * skb , struct udp_table * udptable ,
2007-03-26 07:10:56 +04:00
int proto )
2005-04-17 02:20:36 +04:00
{
struct sock * sk ;
2007-02-09 17:24:49 +03:00
struct udphdr * uh ;
2005-04-17 02:20:36 +04:00
struct net_device * dev = skb - > dev ;
struct in6_addr * saddr , * daddr ;
u32 ulen = 0 ;
2008-07-06 08:19:40 +04:00
struct net * net = dev_net ( skb - > dev ) ;
2005-04-17 02:20:36 +04:00
if ( ! pskb_may_pull ( skb , sizeof ( struct udphdr ) ) )
goto short_packet ;
2007-04-26 04:54:47 +04:00
saddr = & ipv6_hdr ( skb ) - > saddr ;
daddr = & ipv6_hdr ( skb ) - > daddr ;
2007-03-13 20:28:48 +03:00
uh = udp_hdr ( skb ) ;
2005-04-17 02:20:36 +04:00
ulen = ntohs ( uh - > len ) ;
2006-11-27 22:10:57 +03:00
if ( ulen > skb - > len )
goto short_packet ;
2005-04-17 02:20:36 +04:00
2007-03-26 07:10:56 +04:00
if ( proto = = IPPROTO_UDP ) {
/* UDP validates ulen. */
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
/* Check for jumbo payload */
if ( ulen = = 0 )
ulen = skb - > len ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
if ( ulen < sizeof ( * uh ) )
goto short_packet ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
if ( ulen < skb - > len ) {
if ( pskb_trim_rcsum ( skb , ulen ) )
goto short_packet ;
2007-04-26 04:54:47 +04:00
saddr = & ipv6_hdr ( skb ) - > saddr ;
daddr = & ipv6_hdr ( skb ) - > daddr ;
2007-03-13 20:28:48 +03:00
uh = udp_hdr ( skb ) ;
2006-11-27 22:10:57 +03:00
}
}
2005-04-17 02:20:36 +04:00
2007-03-26 07:10:56 +04:00
if ( udp6_csum_init ( skb , uh , proto ) )
goto discard ;
2007-02-09 17:24:49 +03:00
/*
* Multicast receive code
2005-04-17 02:20:36 +04:00
*/
2006-11-27 22:10:57 +03:00
if ( ipv6_addr_is_multicast ( daddr ) )
2008-06-17 04:12:11 +04:00
return __udp6_lib_mcast_deliver ( net , skb ,
saddr , daddr , udptable ) ;
2005-04-17 02:20:36 +04:00
/* Unicast */
2007-02-09 17:24:49 +03:00
/*
2005-04-17 02:20:36 +04:00
* check socket cache . . . must talk to Alan about his plans
* for sock caches . . . i ' ll skip this for now .
*/
2008-10-07 23:38:32 +04:00
sk = __udp6_lib_lookup_skb ( skb , uh - > source , uh - > dest , udptable ) ;
2005-04-17 02:20:36 +04:00
if ( sk = = NULL ) {
if ( ! xfrm6_policy_check ( NULL , XFRM_POLICY_IN , skb ) )
goto discard ;
2006-11-27 22:10:57 +03:00
if ( udp_lib_checksum_complete ( skb ) )
2005-04-17 02:20:36 +04:00
goto discard ;
2008-07-06 08:19:40 +04:00
UDP6_INC_STATS_BH ( net , UDP_MIB_NOPORTS ,
proto = = IPPROTO_UDPLITE ) ;
2005-04-17 02:20:36 +04:00
icmpv6_send ( skb , ICMPV6_DEST_UNREACH , ICMPV6_PORT_UNREACH , 0 , dev ) ;
kfree_skb ( skb ) ;
2007-03-09 07:42:35 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
2007-02-09 17:24:49 +03:00
2005-04-17 02:20:36 +04:00
/* deliver */
2007-02-09 17:24:49 +03:00
2008-08-09 11:35:05 +04:00
bh_lock_sock ( sk ) ;
2007-12-31 11:29:24 +03:00
if ( ! sock_owned_by_user ( sk ) )
udpv6_queue_rcv_skb ( sk , skb ) ;
else
sk_add_backlog ( sk , skb ) ;
bh_unlock_sock ( sk ) ;
2005-04-17 02:20:36 +04:00
sock_put ( sk ) ;
2007-03-09 07:42:35 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:49 +03:00
short_packet :
2006-11-27 22:10:57 +03:00
LIMIT_NETDEBUG ( KERN_DEBUG " UDP%sv6: short packet: %d/%u \n " ,
2008-03-07 03:22:02 +03:00
proto = = IPPROTO_UDPLITE ? " -Lite " : " " ,
2007-03-26 07:10:56 +04:00
ulen , skb - > len ) ;
2005-04-17 02:20:36 +04:00
discard :
2008-07-06 08:19:40 +04:00
UDP6_INC_STATS_BH ( net , UDP_MIB_INERRORS , proto = = IPPROTO_UDPLITE ) ;
2005-04-17 02:20:36 +04:00
kfree_skb ( skb ) ;
2007-03-09 07:42:35 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
2006-11-27 22:10:57 +03:00
2007-10-15 23:50:28 +04:00
static __inline__ int udpv6_rcv ( struct sk_buff * skb )
2006-11-27 22:10:57 +03:00
{
2008-10-29 11:41:45 +03:00
return __udp6_lib_rcv ( skb , & udp_table , IPPROTO_UDP ) ;
2006-11-27 22:10:57 +03:00
}
2005-04-17 02:20:36 +04:00
/*
* Throw away all pending data and cancel the corking . Socket is locked .
*/
static void udp_v6_flush_pending_frames ( struct sock * sk )
{
struct udp_sock * up = udp_sk ( sk ) ;
2008-06-04 15:49:07 +04:00
if ( up - > pending = = AF_INET )
udp_flush_pending_frames ( sk ) ;
else if ( up - > pending ) {
2005-04-17 02:20:36 +04:00
up - > len = 0 ;
up - > pending = 0 ;
ip6_flush_pending_frames ( sk ) ;
2007-02-09 17:24:49 +03:00
}
2005-04-17 02:20:36 +04:00
}
2009-07-09 12:09:54 +04:00
/**
* udp6_hwcsum_outgoing - handle outgoing HW checksumming
* @ sk : socket we are sending on
* @ skb : sk_buff containing the filled - in UDP header
* ( checksum field must be zeroed out )
*/
static void udp6_hwcsum_outgoing ( struct sock * sk , struct sk_buff * skb ,
const struct in6_addr * saddr ,
const struct in6_addr * daddr , int len )
{
unsigned int offset ;
struct udphdr * uh = udp_hdr ( skb ) ;
__wsum csum = 0 ;
if ( skb_queue_len ( & sk - > sk_write_queue ) = = 1 ) {
/* Only one fragment on the socket. */
skb - > csum_start = skb_transport_header ( skb ) - skb - > head ;
skb - > csum_offset = offsetof ( struct udphdr , check ) ;
uh - > check = ~ csum_ipv6_magic ( saddr , daddr , len , IPPROTO_UDP , 0 ) ;
} else {
/*
* HW - checksum won ' t work as there are two or more
* fragments on the socket so that all csums of sk_buffs
* should be together
*/
offset = skb_transport_offset ( skb ) ;
skb - > csum = skb_checksum ( skb , offset , skb - > len - offset , 0 ) ;
skb - > ip_summed = CHECKSUM_NONE ;
skb_queue_walk ( & sk - > sk_write_queue , skb ) {
csum = csum_add ( csum , skb - > csum ) ;
}
uh - > check = csum_ipv6_magic ( saddr , daddr , len , IPPROTO_UDP ,
csum ) ;
if ( uh - > check = = 0 )
uh - > check = CSUM_MANGLED_0 ;
}
}
2005-04-17 02:20:36 +04:00
/*
* Sending
*/
2006-11-27 20:29:59 +03:00
static int udp_v6_push_pending_frames ( struct sock * sk )
2005-04-17 02:20:36 +04:00
{
struct sk_buff * skb ;
struct udphdr * uh ;
2006-11-27 20:29:59 +03:00
struct udp_sock * up = udp_sk ( sk ) ;
2005-04-17 02:20:36 +04:00
struct inet_sock * inet = inet_sk ( sk ) ;
struct flowi * fl = & inet - > cork . fl ;
int err = 0 ;
2007-12-03 14:34:16 +03:00
int is_udplite = IS_UDPLITE ( sk ) ;
2006-11-15 08:35:48 +03:00
__wsum csum = 0 ;
2005-04-17 02:20:36 +04:00
/* Grab the skbuff where UDP header space exists. */
if ( ( skb = skb_peek ( & sk - > sk_write_queue ) ) = = NULL )
goto out ;
/*
* Create a UDP header
*/
2007-03-13 20:28:48 +03:00
uh = udp_hdr ( skb ) ;
2005-04-17 02:20:36 +04:00
uh - > source = fl - > fl_ip_sport ;
uh - > dest = fl - > fl_ip_dport ;
uh - > len = htons ( up - > len ) ;
uh - > check = 0 ;
2007-12-03 14:34:16 +03:00
if ( is_udplite )
2006-11-27 22:10:57 +03:00
csum = udplite_csum_outgoing ( sk , skb ) ;
2009-07-09 12:09:54 +04:00
else if ( skb - > ip_summed = = CHECKSUM_PARTIAL ) { /* UDP hardware csum */
udp6_hwcsum_outgoing ( sk , skb , & fl - > fl6_src , & fl - > fl6_dst ,
up - > len ) ;
goto send ;
} else
2006-11-27 22:10:57 +03:00
csum = udp_csum_outgoing ( sk , skb ) ;
2005-04-17 02:20:36 +04:00
2006-11-27 22:10:57 +03:00
/* add protocol-dependent pseudo-header */
uh - > check = csum_ipv6_magic ( & fl - > fl6_src , & fl - > fl6_dst ,
up - > len , fl - > proto , csum ) ;
2005-04-17 02:20:36 +04:00
if ( uh - > check = = 0 )
2006-11-16 13:36:50 +03:00
uh - > check = CSUM_MANGLED_0 ;
2005-04-17 02:20:36 +04:00
2009-07-09 12:09:54 +04:00
send :
2005-04-17 02:20:36 +04:00
err = ip6_push_pending_frames ( sk ) ;
ip: Report qdisc packet drops
Christoph Lameter pointed out that packet drops at qdisc level where not
accounted in SNMP counters. Only if application sets IP_RECVERR, drops
are reported to user (-ENOBUFS errors) and SNMP counters updated.
IP_RECVERR is used to enable extended reliable error message passing,
but these are not needed to update system wide SNMP stats.
This patch changes things a bit to allow SNMP counters to be updated,
regardless of IP_RECVERR being set or not on the socket.
Example after an UDP tx flood
# netstat -s
...
IP:
1487048 outgoing packets dropped
...
Udp:
...
SndbufErrors: 1487048
send() syscalls, do however still return an OK status, to not
break applications.
Note : send() manual page explicitly says for -ENOBUFS error :
"The output queue for a network interface was full.
This generally indicates that the interface has stopped sending,
but may be caused by transient congestion.
(Normally, this does not occur in Linux. Packets are just silently
dropped when a device queue overflows.) "
This is not true for IP_RECVERR enabled sockets : a send() syscall
that hit a qdisc drop returns an ENOBUFS error.
Many thanks to Christoph, David, and last but not least, Alexey !
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-09-03 05:05:33 +04:00
if ( err ) {
if ( err = = - ENOBUFS & & ! inet6_sk ( sk ) - > recverr ) {
UDP6_INC_STATS_USER ( sock_net ( sk ) ,
UDP_MIB_SNDBUFERRORS , is_udplite ) ;
err = 0 ;
}
} else
UDP6_INC_STATS_USER ( sock_net ( sk ) ,
UDP_MIB_OUTDATAGRAMS , is_udplite ) ;
2005-04-17 02:20:36 +04:00
out :
up - > len = 0 ;
up - > pending = 0 ;
return err ;
}
2006-11-27 22:10:57 +03:00
/**
 * udpv6_sendmsg - send a datagram on a UDP(-Lite) IPv6 socket
 * @iocb: kiocb, only passed through to udp_sendmsg()
 * @sk: sending socket
 * @msg: message header carrying destination, control data and payload
 * @len: payload length in bytes
 *
 * IPv4 and v4-mapped destinations, and sockets with pending IPv4 corked
 * data, are handed to udp_sendmsg().  Returns @len on success or a
 * negative errno.
 */
int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
		  struct msghdr *msg, size_t len)
{
	struct ipv6_txoptions opt_space;
	struct udp_sock *up = udp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name;
	struct in6_addr *daddr, *final_p = NULL, final;
	struct ipv6_txoptions *opt = NULL;
	struct ip6_flowlabel *flowlabel = NULL;
	struct flowi fl;
	struct dst_entry *dst;
	int addr_len = msg->msg_namelen;
	int ulen = len;
	int hlimit = -1;
	int tclass = -1;
	int corkreq = up->corkflag || msg->msg_flags & MSG_MORE;
	int err;
	int connected = 0;
	int is_udplite = IS_UDPLITE(sk);
	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);

	/* destination address check */
	if (sin6) {
		if (addr_len < offsetof(struct sockaddr, sa_data))
			return -EINVAL;

		switch (sin6->sin6_family) {
		case AF_INET6:
			if (addr_len < SIN6_LEN_RFC2133)
				return -EINVAL;
			daddr = &sin6->sin6_addr;
			break;
		case AF_INET:
			goto do_udp_sendmsg;
		case AF_UNSPEC:
			/* Treat as if no address had been supplied. */
			msg->msg_name = sin6 = NULL;
			msg->msg_namelen = addr_len = 0;
			daddr = NULL;
			break;
		default:
			return -EINVAL;
		}
	} else if (!up->pending) {
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;
		daddr = &np->daddr;
	} else {
		daddr = NULL;
	}

	if (daddr && ipv6_addr_v4mapped(daddr)) {
		struct sockaddr_in sin;

		/* Rewrite the destination as IPv4 and use the IPv4 path. */
		sin.sin_family = AF_INET;
		sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport;
		sin.sin_addr.s_addr = daddr->s6_addr32[3];
		msg->msg_name = &sin;
		msg->msg_namelen = sizeof(sin);
do_udp_sendmsg:
		if (__ipv6_only_sock(sk))
			return -ENETUNREACH;
		return udp_sendmsg(iocb, sk, msg, len);
	}

	if (up->pending == AF_INET)
		return udp_sendmsg(iocb, sk, msg, len);

	/* Rough check on arithmetic overflow,
	   better check is made in ip6_append_data().
	   */
	if (len > INT_MAX - sizeof(struct udphdr))
		return -EMSGSIZE;

	if (up->pending) {
		/*
		 * There are pending frames.
		 * The socket lock must be held while it's corked.
		 */
		lock_sock(sk);
		if (likely(up->pending)) {
			if (unlikely(up->pending != AF_INET6)) {
				release_sock(sk);
				return -EAFNOSUPPORT;
			}
			dst = NULL;
			goto do_append_data;
		}
		release_sock(sk);
	}

	/* Not appending to a cork: this datagram carries the UDP header. */
	ulen += sizeof(struct udphdr);

	memset(&fl, 0, sizeof(fl));

	if (sin6) {
		if (sin6->sin6_port == 0)
			return -EINVAL;

		fl.fl_ip_dport = sin6->sin6_port;
		daddr = &sin6->sin6_addr;

		if (np->sndflow) {
			fl.fl6_flowlabel = sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK;
			if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
				flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
				if (flowlabel == NULL)
					return -EINVAL;
				daddr = &flowlabel->dst;
			}
		}

		/*
		 * Otherwise it will be difficult to maintain
		 * sk->sk_dst_cache.
		 */
		if (sk->sk_state == TCP_ESTABLISHED &&
		    ipv6_addr_equal(daddr, &np->daddr))
			daddr = &np->daddr;

		if (addr_len >= sizeof(struct sockaddr_in6) &&
		    sin6->sin6_scope_id &&
		    ipv6_addr_type(daddr) & IPV6_ADDR_LINKLOCAL)
			fl.oif = sin6->sin6_scope_id;
	} else {
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;

		fl.fl_ip_dport = inet->inet_dport;
		daddr = &np->daddr;
		fl.fl6_flowlabel = np->flow_label;
		connected = 1;
	}

	/* Output interface: scope id, then bound device, then sticky pktinfo. */
	if (!fl.oif)
		fl.oif = sk->sk_bound_dev_if;

	if (!fl.oif)
		fl.oif = np->sticky_pktinfo.ipi6_ifindex;

	fl.mark = sk->sk_mark;

	if (msg->msg_controllen) {
		opt = &opt_space;
		memset(opt, 0, sizeof(struct ipv6_txoptions));
		opt->tot_len = sizeof(*opt);

		err = datagram_send_ctl(sock_net(sk), msg, &fl, opt,
					&hlimit, &tclass);
		if (err < 0) {
			fl6_sock_release(flowlabel);
			return err;
		}
		if ((fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) {
			flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
			if (flowlabel == NULL)
				return -EINVAL;
		}
		if (!(opt->opt_nflen | opt->opt_flen))
			opt = NULL;
		connected = 0;
	}
	if (!opt)
		opt = np->opt;
	if (flowlabel)
		opt = fl6_merge_options(&opt_space, flowlabel, opt);
	opt = ipv6_fixup_options(&opt_space, opt);

	fl.proto = sk->sk_protocol;
	if (ipv6_addr_any(daddr))
		fl.fl6_dst.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
	else
		ipv6_addr_copy(&fl.fl6_dst, daddr);
	if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
	fl.fl_ip_sport = inet->inet_sport;

	/* merge ip6_build_xmit from ip6_output */
	if (opt && opt->srcrt) {
		struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;

		/* Route via the first hop; the real destination is restored below. */
		ipv6_addr_copy(&final, &fl.fl6_dst);
		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
		final_p = &final;
		connected = 0;
	}

	if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) {
		fl.oif = np->mcast_oif;
		connected = 0;
	}

	security_sk_classify_flow(sk, &fl);

	err = ip6_sk_dst_lookup(sk, &dst, &fl);
	if (err)
		goto out;
	if (final_p)
		ipv6_addr_copy(&fl.fl6_dst, final_p);

	err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT);
	if (err < 0) {
		if (err == -EREMOTE)
			err = ip6_dst_blackhole(sk, &dst, &fl);
		if (err < 0)
			goto out;
	}

	if (hlimit < 0) {
		hlimit = ipv6_addr_is_multicast(&fl.fl6_dst) ?
				np->mcast_hops : np->hop_limit;
		if (hlimit < 0)
			hlimit = ip6_dst_hoplimit(dst);
	}

	if (tclass < 0)
		tclass = np->tclass;

	if (msg->msg_flags & MSG_CONFIRM)
		goto do_confirm;
back_from_confirm:

	lock_sock(sk);
	if (unlikely(up->pending)) {
		/* The socket is already corked while preparing it. */
		/* ... which is an evident application bug. --ANK */
		release_sock(sk);

		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
		err = -EINVAL;
		goto out;
	}

	up->pending = AF_INET6;

do_append_data:
	up->len += ulen;
	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
	err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
		sizeof(struct udphdr), hlimit, tclass, opt, &fl,
		(struct rt6_info *)dst,
		corkreq ? msg->msg_flags | MSG_MORE : msg->msg_flags);
	if (err)
		udp_v6_flush_pending_frames(sk);
	else if (!corkreq)
		err = udp_v6_push_pending_frames(sk);
	else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
		up->pending = 0;

	if (dst) {
		if (connected) {
			/* Cache the route on the socket; it takes over dst. */
			ip6_dst_store(sk, dst,
				      ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ?
				      &np->daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
				      ipv6_addr_equal(&fl.fl6_src, &np->saddr) ?
				      &np->saddr :
#endif
				      NULL);
		} else {
			dst_release(dst);
		}
		/* Either way the release at "out" must see NULL. */
		dst = NULL;
	}

	if (err > 0)
		err = np->recverr ? net_xmit_errno(err) : 0;
	release_sock(sk);
out:
	dst_release(dst);
	fl6_sock_release(flowlabel);
	if (!err)
		return len;
	/*
	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
	 * we don't have a good statistic (IpOutDiscards but it can be too many
	 * things).  We could add another new stat but at least for now that
	 * seems like overkill.
	 */
	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		UDP6_INC_STATS_USER(sock_net(sk),
				    UDP_MIB_SNDBUFERRORS, is_udplite);
	}
	return err;

do_confirm:
	dst_confirm(dst);
	/* A zero-length MSG_PROBE only confirms the route; nothing is sent. */
	if (!(msg->msg_flags & MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto out;
}
2008-06-15 04:04:49 +04:00
/*
 * Socket teardown: drop any corked data under the socket lock, then run
 * the generic IPv6 socket destructor.
 */
void udpv6_destroy_sock(struct sock *sk)
{
	/* The flush helper expects the socket to be locked. */
	lock_sock(sk);
	udp_v6_flush_pending_frames(sk);
	release_sock(sk);

	inet6_destroy_sock(sk);
}
/*
* Socket option code for UDP
*/
2006-11-27 22:10:57 +03:00
int udpv6_setsockopt ( struct sock * sk , int level , int optname ,
2009-10-01 03:12:20 +04:00
char __user * optval , unsigned int optlen )
2006-03-21 09:45:21 +03:00
{
2008-03-07 03:22:02 +03:00
if ( level = = SOL_UDP | | level = = SOL_UDPLITE )
2006-11-27 20:29:59 +03:00
return udp_lib_setsockopt ( sk , level , optname , optval , optlen ,
udp_v6_push_pending_frames ) ;
2006-11-27 22:10:57 +03:00
return ipv6_setsockopt ( sk , level , optname , optval , optlen ) ;
2006-03-21 09:45:21 +03:00
}
#ifdef CONFIG_COMPAT
/*
 * Compat variant of udpv6_setsockopt(): same dispatch, but non-UDP levels
 * go to the compat IPv6 handler.
 */
int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, unsigned int optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return compat_ipv6_setsockopt(sk, level, optname,
					      optval, optlen);

	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
				  udp_v6_push_pending_frames);
}
#endif
2006-11-27 22:10:57 +03:00
/*
 * SOL_UDP and SOL_UDPLITE options are read by the shared UDP handler;
 * every other level goes to the IPv6 handler.
 */
int udpv6_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return ipv6_getsockopt(sk, level, optname, optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
#ifdef CONFIG_COMPAT
/*
 * Compat variant of udpv6_getsockopt(): same dispatch, but non-UDP levels
 * go to the compat IPv6 handler.
 */
int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return compat_ipv6_getsockopt(sk, level, optname,
					      optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
#endif
2009-07-09 12:10:04 +04:00
static int udp6_ufo_send_check ( struct sk_buff * skb )
{
struct ipv6hdr * ipv6h ;
struct udphdr * uh ;
if ( ! pskb_may_pull ( skb , sizeof ( * uh ) ) )
return - EINVAL ;
ipv6h = ipv6_hdr ( skb ) ;
uh = udp_hdr ( skb ) ;
uh - > check = ~ csum_ipv6_magic ( & ipv6h - > saddr , & ipv6h - > daddr , skb - > len ,
IPPROTO_UDP , 0 ) ;
skb - > csum_start = skb_transport_header ( skb ) - skb - > head ;
skb - > csum_offset = offsetof ( struct udphdr , check ) ;
skb - > ip_summed = CHECKSUM_PARTIAL ;
return 0 ;
}
static struct sk_buff * udp6_ufo_fragment ( struct sk_buff * skb , int features )
{
struct sk_buff * segs = ERR_PTR ( - EINVAL ) ;
unsigned int mss ;
unsigned int unfrag_ip6hlen , unfrag_len ;
struct frag_hdr * fptr ;
u8 * mac_start , * prevhdr ;
u8 nexthdr ;
u8 frag_hdr_sz = sizeof ( struct frag_hdr ) ;
int offset ;
__wsum csum ;
mss = skb_shinfo ( skb ) - > gso_size ;
if ( unlikely ( skb - > len < = mss ) )
goto out ;
if ( skb_gso_ok ( skb , features | NETIF_F_GSO_ROBUST ) ) {
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo ( skb ) - > gso_type ;
if ( unlikely ( type & ~ ( SKB_GSO_UDP | SKB_GSO_DODGY ) | |
! ( type & ( SKB_GSO_UDP ) ) ) )
goto out ;
skb_shinfo ( skb ) - > gso_segs = DIV_ROUND_UP ( skb - > len , mss ) ;
segs = NULL ;
goto out ;
}
/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
* do checksum of UDP packets sent as multiple IP fragments .
*/
offset = skb - > csum_start - skb_headroom ( skb ) ;
csum = skb_checksum ( skb , offset , skb - > len - offset , 0 ) ;
offset + = skb - > csum_offset ;
* ( __sum16 * ) ( skb - > data + offset ) = csum_fold ( csum ) ;
skb - > ip_summed = CHECKSUM_NONE ;
/* Check if there is enough headroom to insert fragment header. */
if ( ( skb_headroom ( skb ) < frag_hdr_sz ) & &
pskb_expand_head ( skb , frag_hdr_sz , 0 , GFP_ATOMIC ) )
goto out ;
/* Find the unfragmentable header and shift it left by frag_hdr_sz
* bytes to insert fragment header .
*/
unfrag_ip6hlen = ip6_find_1stfragopt ( skb , & prevhdr ) ;
nexthdr = * prevhdr ;
* prevhdr = NEXTHDR_FRAGMENT ;
unfrag_len = skb_network_header ( skb ) - skb_mac_header ( skb ) +
unfrag_ip6hlen ;
mac_start = skb_mac_header ( skb ) ;
memmove ( mac_start - frag_hdr_sz , mac_start , unfrag_len ) ;
skb - > mac_header - = frag_hdr_sz ;
skb - > network_header - = frag_hdr_sz ;
fptr = ( struct frag_hdr * ) ( skb_network_header ( skb ) + unfrag_ip6hlen ) ;
fptr - > nexthdr = nexthdr ;
fptr - > reserved = 0 ;
ipv6_select_ident ( fptr ) ;
/* Fragment the skb. ipv6 header and the remaining fields of the
* fragment header are updated in ipv6_gso_segment ( )
*/
segs = skb_segment ( skb , features ) ;
out :
return segs ;
}
2009-09-14 16:22:28 +04:00
static const struct inet6_protocol udpv6_protocol = {
2005-04-17 02:20:36 +04:00
. handler = udpv6_rcv ,
. err_handler = udpv6_err ,
2009-07-09 12:10:04 +04:00
. gso_send_check = udp6_ufo_send_check ,
. gso_segment = udp6_ufo_fragment ,
2005-04-17 02:20:36 +04:00
. flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL ,
} ;
/* ------------------------------------------------------------------------ */
# ifdef CONFIG_PROC_FS
static void udp6_sock_seq_show ( struct seq_file * seq , struct sock * sp , int bucket )
{
struct inet_sock * inet = inet_sk ( sp ) ;
struct ipv6_pinfo * np = inet6_sk ( sp ) ;
struct in6_addr * dest , * src ;
__u16 destp , srcp ;
dest = & np - > daddr ;
src = & np - > rcv_saddr ;
2009-10-15 10:30:45 +04:00
destp = ntohs ( inet - > inet_dport ) ;
srcp = ntohs ( inet - > inet_sport ) ;
2005-04-17 02:20:36 +04:00
seq_printf ( seq ,
2009-10-07 04:37:59 +04:00
" %5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
2008-06-18 08:04:56 +04:00
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d \n " ,
2005-04-17 02:20:36 +04:00
bucket ,
src - > s6_addr32 [ 0 ] , src - > s6_addr32 [ 1 ] ,
src - > s6_addr32 [ 2 ] , src - > s6_addr32 [ 3 ] , srcp ,
dest - > s6_addr32 [ 0 ] , dest - > s6_addr32 [ 1 ] ,
dest - > s6_addr32 [ 2 ] , dest - > s6_addr32 [ 3 ] , destp ,
2007-02-09 17:24:49 +03:00
sp - > sk_state ,
2009-06-18 06:05:41 +04:00
sk_wmem_alloc_get ( sp ) ,
sk_rmem_alloc_get ( sp ) ,
2005-04-17 02:20:36 +04:00
0 , 0L , 0 ,
sock_i_uid ( sp ) , 0 ,
sock_i_ino ( sp ) ,
2008-06-18 08:04:56 +04:00
atomic_read ( & sp - > sk_refcnt ) , sp ,
atomic_read ( & sp - > sk_drops ) ) ;
2005-04-17 02:20:36 +04:00
}
2006-11-27 22:10:57 +03:00
int udp6_seq_show ( struct seq_file * seq , void * v )
2005-04-17 02:20:36 +04:00
{
if ( v = = SEQ_START_TOKEN )
seq_printf ( seq ,
" sl "
" local_address "
" remote_address "
" st tx_queue rx_queue tr tm->when retrnsmt "
2008-06-18 08:04:56 +04:00
" uid timeout inode ref pointer drops \n " ) ;
2005-04-17 02:20:36 +04:00
else
udp6_sock_seq_show ( seq , v , ( ( struct udp_iter_state * ) seq - > private ) - > bucket ) ;
return 0 ;
}
static struct udp_seq_afinfo udp6_seq_afinfo = {
. name = " udp6 " ,
. family = AF_INET6 ,
2008-10-29 11:41:45 +03:00
. udp_table = & udp_table ,
2008-03-29 04:25:53 +03:00
. seq_fops = {
. owner = THIS_MODULE ,
} ,
2008-03-29 04:24:26 +03:00
. seq_ops = {
. show = udp6_seq_show ,
} ,
2005-04-17 02:20:36 +04:00
} ;
2008-03-21 14:14:17 +03:00
int udp6_proc_init ( struct net * net )
2005-04-17 02:20:36 +04:00
{
2008-03-21 14:14:17 +03:00
return udp_proc_register ( net , & udp6_seq_afinfo ) ;
2005-04-17 02:20:36 +04:00
}
2008-03-21 14:14:17 +03:00
/*
 * Remove the udp6 proc entry from network namespace @net; counterpart of
 * udp6_proc_init().
 */
void udp6_proc_exit(struct net *net)
{
	udp_proc_unregister(net, &udp6_seq_afinfo);
}
# endif /* CONFIG_PROC_FS */
/* ------------------------------------------------------------------------ */
struct proto udpv6_prot = {
2006-03-21 09:48:35 +03:00
. name = " UDPv6 " ,
. owner = THIS_MODULE ,
2006-11-27 22:10:57 +03:00
. close = udp_lib_close ,
2006-03-21 09:48:35 +03:00
. connect = ip6_datagram_connect ,
. disconnect = udp_disconnect ,
. ioctl = udp_ioctl ,
. destroy = udpv6_destroy_sock ,
. setsockopt = udpv6_setsockopt ,
. getsockopt = udpv6_getsockopt ,
. sendmsg = udpv6_sendmsg ,
. recvmsg = udpv6_recvmsg ,
. backlog_rcv = udpv6_queue_rcv_skb ,
2006-11-27 22:10:57 +03:00
. hash = udp_lib_hash ,
. unhash = udp_lib_unhash ,
2006-03-21 09:48:35 +03:00
. get_port = udp_v6_get_port ,
2007-12-31 11:29:24 +03:00
. memory_allocated = & udp_memory_allocated ,
. sysctl_mem = sysctl_udp_mem ,
. sysctl_wmem = & sysctl_udp_wmem_min ,
. sysctl_rmem = & sysctl_udp_rmem_min ,
2006-03-21 09:48:35 +03:00
. obj_size = sizeof ( struct udp6_sock ) ,
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the begining. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we dont want to send same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
. slab_flags = SLAB_DESTROY_BY_RCU ,
2008-10-29 11:41:45 +03:00
. h . udp_table = & udp_table ,
2006-03-21 09:45:21 +03:00
# ifdef CONFIG_COMPAT
2006-03-21 09:48:35 +03:00
. compat_setsockopt = compat_udpv6_setsockopt ,
. compat_getsockopt = compat_udpv6_getsockopt ,
2006-03-21 09:45:21 +03:00
# endif
2005-04-17 02:20:36 +04:00
} ;
/*
 * Socket-switch entry tying SOCK_DGRAM/IPPROTO_UDP to udpv6_prot and the
 * inet6 datagram socket operations.  Registered by udpv6_init() and
 * unregistered by udpv6_exit().
 */
static struct inet_protosw udpv6_protosw = {
	.type		= SOCK_DGRAM,
	.protocol	= IPPROTO_UDP,
	.prot		= &udpv6_prot,
	.ops		= &inet6_dgram_ops,
	.no_check	= UDP_CSUM_DEFAULT,
	.flags		= INET_PROTOSW_PERMANENT,
};
2007-12-11 13:25:35 +03:00
/*
 * Register UDP with the IPv6 stack: first the protocol handler for
 * IPPROTO_UDP, then the socket-switch entry.
 *
 * Returns 0 on success.  On failure the error from the failing
 * registration is returned; if the socket-switch registration fails, the
 * protocol handler is removed again before returning.
 */
int __init udpv6_init(void)
{
	int ret;

	ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP);
	if (ret)
		return ret;

	ret = inet6_register_protosw(&udpv6_protosw);
	if (ret)
		inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);

	return ret;
}
2007-12-13 16:34:58 +03:00
void udpv6_exit ( void )
2007-12-11 13:25:35 +03:00
{
inet6_unregister_protosw ( & udpv6_protosw ) ;
inet6_del_protocol ( & udpv6_protocol , IPPROTO_UDP ) ;
2005-04-17 02:20:36 +04:00
}