2019-05-19 15:08:20 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2012-08-13 09:53:28 +04:00
# include <linux/module.h>
# include <linux/sock_diag.h>
# include <linux/net.h>
2012-08-13 09:57:44 +04:00
# include <linux/netdevice.h>
2012-08-13 09:53:28 +04:00
# include <linux/packet_diag.h>
packet: use percpu mmap tx frame pending refcount
In PF_PACKET's packet mmap(), we can avoid using one atomic_inc()
and one atomic_dec() call in skb destructor and use a percpu
reference count instead in order to determine if packets are
still pending to be sent out. Micro-benchmark with [1] that has
been slightly modified (that is, protocol = 0 in socket(2) and
bind(2)), example on a rather crappy testing machine; I expect
it to scale and have even better results on bigger machines:
./packet_mm_tx -s7000 -m7200 -z700000 em1, avg over 2500 runs:
With patch: 4,022,015 cyc
Without patch: 4,812,994 cyc
time ./packet_mm_tx -s64 -c10000000 em1 > /dev/null, stable:
With patch:
real 1m32.241s
user 0m0.287s
sys 1m29.316s
Without patch:
real 1m38.386s
user 0m0.265s
sys 1m35.572s
In function tpacket_snd(), it is okay to use packet_read_pending()
since in fast-path we short-circuit the condition already with
ph != NULL, since we have next frames to process. In case we have
MSG_DONTWAIT, we also do not execute this path as need_wait is
false here anyway, and in case of _no_ MSG_DONTWAIT flag, it is
okay to call a packet_read_pending(), because when we ever reach
that path, we're done processing outgoing frames anyway and only
look if there are skbs still outstanding to be orphaned. We can
stay lockless in this percpu counter since it's acceptable when we
reach this path for the sum to be imprecise first, but we'll level
out at 0 after all pending frames have reached the skb destructor
eventually through tx reclaim. When people pin a tx process to
particular CPUs, we expect overflows to happen in the reference
counter as on one CPU we expect heavy increase; and distributed
through ksoftirqd on all CPUs a decrease, for example. As
David Laight points out, since the C language doesn't define the
result of signed int overflow (i.e. rather than wrap, it is
allowed to saturate as a possible outcome), we have to use
unsigned int as reference count. The sum over all CPUs when tx
is complete will result in 0 again.
The BUG_ON() in tpacket_destruct_skb() we can remove as well. It
can _only_ be set from inside tpacket_snd() path and we made sure
to increase tx_ring.pending in any case before we called po->xmit(skb).
So testing for tx_ring.pending == 0 is not too useful. Instead, it
would rather have been useful to test if lower layers didn't orphan
the skb so that we're missing ring slots being put back to
TP_STATUS_AVAILABLE. But such a bug will be caught in user space
already as we end up realizing that we do not have any
TP_STATUS_AVAILABLE slots left anymore. Therefore, we're all set.
Btw, in case of RX_RING path, we do not make use of the pending
member, therefore we also don't need to use up any percpu memory
here. Also note that __alloc_percpu() already returns a zero-filled
percpu area, so initialization is done already.
[1] http://wiki.ipxwarzone.com/index.php5?title=Linux_packet_mmap
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-15 19:25:36 +04:00
# include <linux/percpu.h>
2012-08-13 09:53:28 +04:00
# include <net/net_namespace.h>
# include <net/sock.h>
# include "internal.h"
2012-08-13 09:55:46 +04:00
static int pdiag_put_info ( const struct packet_sock * po , struct sk_buff * nlskb )
{
struct packet_diag_info pinfo ;
pinfo . pdi_index = po - > ifindex ;
pinfo . pdi_version = po - > tp_version ;
pinfo . pdi_reserve = po - > tp_reserve ;
2024-03-08 16:06:36 +03:00
pinfo . pdi_copy_thresh = READ_ONCE ( po - > copy_thresh ) ;
2023-03-16 04:10:09 +03:00
pinfo . pdi_tstamp = READ_ONCE ( po - > tp_tstamp ) ;
2012-08-13 09:55:46 +04:00
pinfo . pdi_flags = 0 ;
2023-03-16 04:10:13 +03:00
if ( packet_sock_flag ( po , PACKET_SOCK_RUNNING ) )
2012-08-13 09:55:46 +04:00
pinfo . pdi_flags | = PDI_RUNNING ;
2023-03-16 04:10:08 +03:00
if ( packet_sock_flag ( po , PACKET_SOCK_AUXDATA ) )
2012-08-13 09:55:46 +04:00
pinfo . pdi_flags | = PDI_AUXDATA ;
2023-03-16 04:10:07 +03:00
if ( packet_sock_flag ( po , PACKET_SOCK_ORIGDEV ) )
2012-08-13 09:55:46 +04:00
pinfo . pdi_flags | = PDI_ORIGDEV ;
net/packet: support mergeable feature of virtio
Packet sockets, like tap, can be used as the backend for kernel vhost.
In packet sockets, virtio net header size is currently hardcoded to be
the size of struct virtio_net_hdr, which is 10 bytes; however, it is not
always the case: some virtio features, such as mrg_rxbuf, need virtio
net header to be 12-byte long.
Mergeable buffers, as a virtio feature, is worthy of supporting: packets
that are larger than one-mbuf size will be dropped in vhost worker's
handle_rx if mrg_rxbuf feature is not used, but large packets
cannot be avoided and increasing mbuf's size is not economical.
With this virtio feature enabled by virtio-user, packet sockets with
hardcoded 10-byte virtio net header will parse mac head incorrectly in
packet_snd by taking the last two bytes of virtio net header as part of
mac header.
This incorrect mac header parsing will cause packet to be dropped due to
invalid ether head checking in later under-layer device packet receiving.
By adding extra field vnet_hdr_sz with utilizing holes in struct
packet_sock to record currently used virtio net header size and supporting
extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet
sockets can know the exact length of virtio net header that virtio user
gives.
In packet_snd, tpacket_snd and packet_recvmsg, instead of using
hardcoded virtio net header size, it can get the exact vnet_hdr_sz from
corresponding packet_sock, and parse mac header correctly based on this
information to avoid the packets being mistakenly dropped.
Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com>
Co-developed-by: Anqi Shen <amy.saq@antgroup.com>
Signed-off-by: Anqi Shen <amy.saq@antgroup.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-04-19 10:24:16 +03:00
if ( READ_ONCE ( po - > vnet_hdr_sz ) )
2012-08-13 09:55:46 +04:00
pinfo . pdi_flags | = PDI_VNETHDR ;
2023-03-16 04:10:11 +03:00
if ( packet_sock_flag ( po , PACKET_SOCK_TP_LOSS ) )
2012-08-13 09:55:46 +04:00
pinfo . pdi_flags | = PDI_LOSS ;
return nla_put ( nlskb , PACKET_DIAG_INFO , sizeof ( pinfo ) , & pinfo ) ;
}
2012-08-13 09:57:44 +04:00
static int pdiag_put_mclist ( const struct packet_sock * po , struct sk_buff * nlskb )
{
struct nlattr * mca ;
struct packet_mclist * ml ;
2019-04-26 12:13:06 +03:00
mca = nla_nest_start_noflag ( nlskb , PACKET_DIAG_MCLIST ) ;
2012-08-13 09:57:44 +04:00
if ( ! mca )
return - EMSGSIZE ;
rtnl_lock ( ) ;
for ( ml = po - > mclist ; ml ; ml = ml - > next ) {
struct packet_diag_mclist * dml ;
dml = nla_reserve_nohdr ( nlskb , sizeof ( * dml ) ) ;
if ( ! dml ) {
rtnl_unlock ( ) ;
nla_nest_cancel ( nlskb , mca ) ;
return - EMSGSIZE ;
}
dml - > pdmc_index = ml - > ifindex ;
dml - > pdmc_type = ml - > type ;
dml - > pdmc_alen = ml - > alen ;
dml - > pdmc_count = ml - > count ;
BUILD_BUG_ON ( sizeof ( dml - > pdmc_addr ) ! = sizeof ( ml - > addr ) ) ;
memcpy ( dml - > pdmc_addr , ml - > addr , sizeof ( ml - > addr ) ) ;
}
rtnl_unlock ( ) ;
nla_nest_end ( nlskb , mca ) ;
return 0 ;
}
2012-08-16 09:34:22 +04:00
static int pdiag_put_ring ( struct packet_ring_buffer * ring , int ver , int nl_type ,
struct sk_buff * nlskb )
{
struct packet_diag_ring pdr ;
2017-01-10 18:47:15 +03:00
if ( ! ring - > pg_vec )
2012-08-16 09:34:22 +04:00
return 0 ;
pdr . pdr_block_size = ring - > pg_vec_pages < < PAGE_SHIFT ;
pdr . pdr_block_nr = ring - > pg_vec_len ;
pdr . pdr_frame_size = ring - > frame_size ;
pdr . pdr_frame_nr = ring - > frame_max + 1 ;
if ( ver > TPACKET_V2 ) {
pdr . pdr_retire_tmo = ring - > prb_bdqc . retire_blk_tov ;
pdr . pdr_sizeof_priv = ring - > prb_bdqc . blk_sizeof_priv ;
pdr . pdr_features = ring - > prb_bdqc . feature_req_word ;
} else {
pdr . pdr_retire_tmo = 0 ;
pdr . pdr_sizeof_priv = 0 ;
pdr . pdr_features = 0 ;
}
return nla_put ( nlskb , nl_type , sizeof ( pdr ) , & pdr ) ;
}
static int pdiag_put_rings_cfg ( struct packet_sock * po , struct sk_buff * skb )
{
int ret ;
mutex_lock ( & po - > pg_vec_lock ) ;
ret = pdiag_put_ring ( & po - > rx_ring , po - > tp_version ,
PACKET_DIAG_RX_RING , skb ) ;
if ( ! ret )
ret = pdiag_put_ring ( & po - > tx_ring , po - > tp_version ,
PACKET_DIAG_TX_RING , skb ) ;
mutex_unlock ( & po - > pg_vec_lock ) ;
return ret ;
}
2012-08-16 09:36:48 +04:00
static int pdiag_put_fanout ( struct packet_sock * po , struct sk_buff * nlskb )
{
int ret = 0 ;
mutex_lock ( & fanout_mutex ) ;
if ( po - > fanout ) {
u32 val ;
val = ( u32 ) po - > fanout - > id | ( ( u32 ) po - > fanout - > type < < 16 ) ;
ret = nla_put_u32 ( nlskb , PACKET_DIAG_FANOUT , val ) ;
}
mutex_unlock ( & fanout_mutex ) ;
return ret ;
}
2013-04-25 10:53:52 +04:00
static int sk_diag_fill ( struct sock * sk , struct sk_buff * skb ,
struct packet_diag_req * req ,
2014-04-24 01:26:25 +04:00
bool may_report_filterinfo ,
2013-04-25 10:53:52 +04:00
struct user_namespace * user_ns ,
u32 portid , u32 seq , u32 flags , int sk_ino )
2012-08-13 09:53:28 +04:00
{
struct nlmsghdr * nlh ;
struct packet_diag_msg * rp ;
2012-08-16 09:34:22 +04:00
struct packet_sock * po = pkt_sk ( sk ) ;
2012-08-13 09:53:28 +04:00
2012-09-08 00:12:54 +04:00
nlh = nlmsg_put ( skb , portid , seq , SOCK_DIAG_BY_FAMILY , sizeof ( * rp ) , flags ) ;
2012-08-13 09:53:28 +04:00
if ( ! nlh )
return - EMSGSIZE ;
rp = nlmsg_data ( nlh ) ;
rp - > pdiag_family = AF_PACKET ;
rp - > pdiag_type = sk - > sk_type ;
2023-05-25 02:29:34 +03:00
rp - > pdiag_num = ntohs ( READ_ONCE ( po - > num ) ) ;
2012-08-13 09:53:28 +04:00
rp - > pdiag_ino = sk_ino ;
sock_diag_save_cookie ( sk , rp - > pdiag_cookie ) ;
2012-08-13 09:55:46 +04:00
if ( ( req - > pdiag_show & PACKET_SHOW_INFO ) & &
pdiag_put_info ( po , skb ) )
goto out_nlmsg_trim ;
2013-04-25 10:53:52 +04:00
if ( ( req - > pdiag_show & PACKET_SHOW_INFO ) & &
nla_put_u32 ( skb , PACKET_DIAG_UID ,
from_kuid_munged ( user_ns , sock_i_uid ( sk ) ) ) )
goto out_nlmsg_trim ;
2012-08-13 09:57:44 +04:00
if ( ( req - > pdiag_show & PACKET_SHOW_MCLIST ) & &
pdiag_put_mclist ( po , skb ) )
goto out_nlmsg_trim ;
2012-08-16 09:34:22 +04:00
if ( ( req - > pdiag_show & PACKET_SHOW_RING_CFG ) & &
pdiag_put_rings_cfg ( po , skb ) )
goto out_nlmsg_trim ;
2012-08-16 09:36:48 +04:00
if ( ( req - > pdiag_show & PACKET_SHOW_FANOUT ) & &
pdiag_put_fanout ( po , skb ) )
goto out_nlmsg_trim ;
2013-04-25 10:53:53 +04:00
if ( ( req - > pdiag_show & PACKET_SHOW_MEMINFO ) & &
sock_diag_put_meminfo ( sk , skb , PACKET_DIAG_MEMINFO ) )
goto out_nlmsg_trim ;
2013-04-25 10:53:54 +04:00
if ( ( req - > pdiag_show & PACKET_SHOW_FILTER ) & &
2014-04-24 01:26:25 +04:00
sock_diag_put_filterinfo ( may_report_filterinfo , sk , skb ,
PACKET_DIAG_FILTER ) )
2013-04-25 10:53:54 +04:00
goto out_nlmsg_trim ;
2015-01-17 00:09:00 +03:00
nlmsg_end ( skb , nlh ) ;
return 0 ;
2012-08-13 09:55:46 +04:00
out_nlmsg_trim :
nlmsg_cancel ( skb , nlh ) ;
return - EMSGSIZE ;
2012-08-13 09:53:28 +04:00
}
static int packet_diag_dump ( struct sk_buff * skb , struct netlink_callback * cb )
{
int num = 0 , s_num = cb - > args [ 0 ] ;
struct packet_diag_req * req ;
struct net * net ;
struct sock * sk ;
2014-04-24 01:26:25 +04:00
bool may_report_filterinfo ;
2012-08-13 09:53:28 +04:00
net = sock_net ( skb - > sk ) ;
req = nlmsg_data ( cb - > nlh ) ;
2014-04-24 01:29:27 +04:00
may_report_filterinfo = netlink_net_capable ( cb - > skb , CAP_NET_ADMIN ) ;
2012-08-13 09:53:28 +04:00
2012-08-21 05:06:47 +04:00
mutex_lock ( & net - > packet . sklist_lock ) ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
sk_for_each ( sk , & net - > packet . sklist ) {
2012-08-13 09:53:28 +04:00
if ( ! net_eq ( sock_net ( sk ) , net ) )
continue ;
if ( num < s_num )
goto next ;
2013-04-25 10:53:52 +04:00
if ( sk_diag_fill ( sk , skb , req ,
2014-04-24 01:26:25 +04:00
may_report_filterinfo ,
2013-04-25 10:53:52 +04:00
sk_user_ns ( NETLINK_CB ( cb - > skb ) . sk ) ,
NETLINK_CB ( cb - > skb ) . portid ,
cb - > nlh - > nlmsg_seq , NLM_F_MULTI ,
sock_i_ino ( sk ) ) < 0 )
2012-08-13 09:53:28 +04:00
goto done ;
next :
num + + ;
}
done :
2012-08-21 05:06:47 +04:00
mutex_unlock ( & net - > packet . sklist_lock ) ;
2012-08-13 09:53:28 +04:00
cb - > args [ 0 ] = num ;
return skb - > len ;
}
static int packet_diag_handler_dump ( struct sk_buff * skb , struct nlmsghdr * h )
{
int hdrlen = sizeof ( struct packet_diag_req ) ;
struct net * net = sock_net ( skb - > sk ) ;
struct packet_diag_req * req ;
if ( nlmsg_len ( h ) < hdrlen )
return - EINVAL ;
req = nlmsg_data ( h ) ;
/* Make it possible to support protocol filtering later */
if ( req - > sdiag_protocol )
return - EINVAL ;
if ( h - > nlmsg_flags & NLM_F_DUMP ) {
struct netlink_dump_control c = {
. dump = packet_diag_dump ,
} ;
return netlink_dump_start ( net - > diag_nlsk , skb , h , & c ) ;
} else
return - EOPNOTSUPP ;
}
static const struct sock_diag_handler packet_diag_handler = {
2024-01-22 14:25:59 +03:00
. owner = THIS_MODULE ,
2012-08-13 09:53:28 +04:00
. family = AF_PACKET ,
. dump = packet_diag_handler_dump ,
} ;
/* Module load: register the AF_PACKET handler with sock_diag. */
static int __init packet_diag_init(void)
{
	int err = sock_diag_register(&packet_diag_handler);

	return err;
}
/* Module unload: remove the AF_PACKET handler from sock_diag. */
static void __exit packet_diag_exit(void)
{
	const struct sock_diag_handler *handler = &packet_diag_handler;

	sock_diag_unregister(handler);
}
module_init ( packet_diag_init ) ;
module_exit ( packet_diag_exit ) ;
MODULE_LICENSE ( " GPL " ) ;
2023-11-19 06:30:06 +03:00
MODULE_DESCRIPTION ( " PACKET socket monitoring via SOCK_DIAG " ) ;
2012-08-13 09:53:28 +04:00
MODULE_ALIAS_NET_PF_PROTO_TYPE ( PF_NETLINK , NETLINK_SOCK_DIAG , 17 /* AF_PACKET */ ) ;