diff --git a/MAINTAINERS b/MAINTAINERS index 3c84a8fecc09..d76fccd09266 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9957,6 +9957,13 @@ L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/block/ps3vram.c +PSAMPLE PACKET SAMPLING SUPPORT: +M: Yotam Gigi +S: Maintained +F: net/psample +F: include/net/psample.h +F: include/uapi/linux/psample.h + PSTORE FILESYSTEM M: Anton Vorontsov M: Colin Cross diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 1357fe04391b..9fb0316b3a30 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -4965,6 +4965,46 @@ static inline void mlxsw_reg_mlcr_pack(char *payload, u8 local_port, MLXSW_REG_MLCR_DURATION_MAX : 0); } +/* MPSC - Monitoring Packet Sampling Configuration Register + * -------------------------------------------------------- + * MPSC Register is used to configure the Packet Sampling mechanism. + */ +#define MLXSW_REG_MPSC_ID 0x9080 +#define MLXSW_REG_MPSC_LEN 0x1C + +MLXSW_REG_DEFINE(mpsc, MLXSW_REG_MPSC_ID, MLXSW_REG_MPSC_LEN); + +/* reg_mpsc_local_port + * Local port number + * Not supported for CPU port + * Access: Index + */ +MLXSW_ITEM32(reg, mpsc, local_port, 0x00, 16, 8); + +/* reg_mpsc_e + * Enable sampling on port local_port + * Access: RW + */ +MLXSW_ITEM32(reg, mpsc, e, 0x04, 30, 1); + +#define MLXSW_REG_MPSC_RATE_MAX 3500000000UL + +/* reg_mpsc_rate + * Sampling rate = 1 out of rate packets (with randomization around + * the point). Valid values are: 1 to MLXSW_REG_MPSC_RATE_MAX + * Access: RW + */ +MLXSW_ITEM32(reg, mpsc, rate, 0x08, 0, 32); + +static inline void mlxsw_reg_mpsc_pack(char *payload, u8 local_port, bool e, + u32 rate) +{ + MLXSW_REG_ZERO(mpsc, payload); + mlxsw_reg_mpsc_local_port_set(payload, local_port); + mlxsw_reg_mpsc_e_set(payload, e); + mlxsw_reg_mpsc_rate_set(payload, rate); +} + /* SBPR - Shared Buffer Pools Register * ----------------------------------- * The SBPR configures and retrieves the shared buffer pools and configuration. @@ -5429,6 +5469,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { MLXSW_REG(mpat), MLXSW_REG(mpar), MLXSW_REG(mlcr), + MLXSW_REG(mpsc), MLXSW_REG(sbpr), MLXSW_REG(sbcm), MLXSW_REG(sbpm), diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 3dbd82e636f8..467aa5288935 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "spectrum.h" #include "pci.h" @@ -469,6 +470,16 @@ static void mlxsw_sp_span_mirror_remove(struct mlxsw_sp_port *from, mlxsw_sp_span_inspected_port_unbind(from, span_entry, type); } +static int mlxsw_sp_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port, + bool enable, u32 rate) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char mpsc_pl[MLXSW_REG_MPSC_LEN]; + + mlxsw_reg_mpsc_pack(mpsc_pl, mlxsw_sp_port->local_port, enable, rate); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpsc), mpsc_pl); +} + static int mlxsw_sp_port_admin_status_set(struct mlxsw_sp_port *mlxsw_sp_port, bool is_up) { @@ -1218,6 +1229,51 @@ mlxsw_sp_port_del_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_sp_span_mirror_remove(mlxsw_sp_port, to_port, span_type); } +static int +mlxsw_sp_port_add_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *cls, + const struct tc_action *a, + bool ingress) +{ + int err; + + if (!mlxsw_sp_port->sample) + return -EOPNOTSUPP; + if (rtnl_dereference(mlxsw_sp_port->sample->psample_group)) { + netdev_err(mlxsw_sp_port->dev, "sample already active\n"); + return -EEXIST; + } + if (tcf_sample_rate(a) > MLXSW_REG_MPSC_RATE_MAX) { + netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n"); + return -EOPNOTSUPP; + } + + rcu_assign_pointer(mlxsw_sp_port->sample->psample_group, + tcf_sample_psample_group(a)); + mlxsw_sp_port->sample->truncate = tcf_sample_truncate(a); + mlxsw_sp_port->sample->trunc_size = tcf_sample_trunc_size(a); + mlxsw_sp_port->sample->rate = tcf_sample_rate(a); + + err = mlxsw_sp_port_sample_set(mlxsw_sp_port, true, tcf_sample_rate(a)); + if (err) + goto err_port_sample_set; + return 0; + +err_port_sample_set: + RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); + return err; +} + +static void +mlxsw_sp_port_del_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port) +{ + if (!mlxsw_sp_port->sample) + return; + + mlxsw_sp_port_sample_set(mlxsw_sp_port, false, 1); + RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); +} + static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, __be16 protocol, struct tc_cls_matchall_offload *cls, @@ -1248,6 +1304,10 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, mirror = &mall_tc_entry->mirror; err = mlxsw_sp_port_add_cls_matchall_mirror(mlxsw_sp_port, mirror, a, ingress); + } else if (is_tcf_sample(a) && protocol == htons(ETH_P_ALL)) { + mall_tc_entry->type = MLXSW_SP_PORT_MALL_SAMPLE; + err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, cls, + a, ingress); } else { err = -EOPNOTSUPP; } @@ -1281,6 +1341,9 @@ static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_sp_port_del_cls_matchall_mirror(mlxsw_sp_port, &mall_tc_entry->mirror); break; + case MLXSW_SP_PORT_MALL_SAMPLE: + mlxsw_sp_port_del_cls_matchall_sample(mlxsw_sp_port); + break; default: WARN_ON(1); } @@ -2259,6 +2322,13 @@ static int __mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, goto err_alloc_stats; } + mlxsw_sp_port->sample = kzalloc(sizeof(*mlxsw_sp_port->sample), + GFP_KERNEL); + if (!mlxsw_sp_port->sample) { + err = -ENOMEM; + goto err_alloc_sample; + } + mlxsw_sp_port->hw_stats.cache = kzalloc(sizeof(*mlxsw_sp_port->hw_stats.cache), GFP_KERNEL); @@ -2387,6 +2457,8 @@ err_dev_addr_init: err_port_swid_set: kfree(mlxsw_sp_port->hw_stats.cache); err_alloc_hw_stats: + kfree(mlxsw_sp_port->sample); +err_alloc_sample: free_percpu(mlxsw_sp_port->pcpu_stats); err_alloc_stats: kfree(mlxsw_sp_port->untagged_vlans); @@ -2433,6 +2505,7 @@ static void __mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port) mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT); mlxsw_sp_port_module_unmap(mlxsw_sp, mlxsw_sp_port->local_port); kfree(mlxsw_sp_port->hw_stats.cache); + kfree(mlxsw_sp_port->sample); free_percpu(mlxsw_sp_port->pcpu_stats); kfree(mlxsw_sp_port->untagged_vlans); kfree(mlxsw_sp_port->active_vlans); @@ -2734,6 +2807,41 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port, return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); } +static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, + void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; + struct psample_group *psample_group; + u32 size; + + if (unlikely(!mlxsw_sp_port)) { + dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received for non-existent port\n", + local_port); + goto out; + } + if (unlikely(!mlxsw_sp_port->sample)) { + dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received on unsupported port\n", + local_port); + goto out; + } + + size = mlxsw_sp_port->sample->truncate ? + mlxsw_sp_port->sample->trunc_size : skb->len; + + rcu_read_lock(); + psample_group = rcu_dereference(mlxsw_sp_port->sample->psample_group); + if (!psample_group) + goto out_unlock; + psample_sample_packet(psample_group, skb, size, + mlxsw_sp_port->dev->ifindex, 0, + mlxsw_sp_port->sample->rate); +out_unlock: + rcu_read_unlock(); +out: + consume_skb(skb); +} + #define MLXSW_SP_RXL_NO_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ MLXSW_RXL(mlxsw_sp_rx_listener_no_mark_func, _trap_id, _action, \ _is_ctrl, SP_##_trap_group, DISCARD) @@ -2769,6 +2877,9 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { MLXSW_SP_RXL_NO_MARK(RTR_INGRESS0, TRAP_TO_CPU, REMOTE_ROUTE, false), MLXSW_SP_RXL_NO_MARK(HOST_MISS_IPV4, TRAP_TO_CPU, ARP_MISS, false), MLXSW_SP_RXL_NO_MARK(BGP_IPV4, TRAP_TO_CPU, BGP_IPV4, false), + /* PKT Sample trap */ + MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU, + false, SP_IP2ME, DISCARD) }; static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index cc1af19d699a..bc3efe1629c4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -46,6 +46,7 @@ #include #include #include +#include #include "port.h" #include "core.h" @@ -229,6 +230,7 @@ struct mlxsw_sp_span_entry { enum mlxsw_sp_port_mall_action_type { MLXSW_SP_PORT_MALL_MIRROR, + MLXSW_SP_PORT_MALL_SAMPLE, }; struct mlxsw_sp_port_mall_mirror_tc_entry { @@ -315,6 +317,13 @@ struct mlxsw_sp_port_pcpu_stats { u32 tx_dropped; }; +struct mlxsw_sp_port_sample { + struct psample_group __rcu *psample_group; + u32 trunc_size; + u32 rate; + bool truncate; +}; + struct mlxsw_sp_port { struct net_device *dev; struct mlxsw_sp_port_pcpu_stats __percpu *pcpu_stats; @@ -361,6 +370,7 @@ struct mlxsw_sp_port { struct rtnl_link_stats64 *cache; struct delayed_work update_dw; } hw_stats; + struct mlxsw_sp_port_sample *sample; }; struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev); diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h index 7ab275deacac..02ea48b15eb5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/trap.h +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -54,6 +54,7 @@ enum { MLXSW_TRAP_ID_IGMP_V2_REPORT = 0x32, MLXSW_TRAP_ID_IGMP_V2_LEAVE = 0x33, MLXSW_TRAP_ID_IGMP_V3_REPORT = 0x34, + MLXSW_TRAP_ID_PKT_SAMPLE = 0x38, MLXSW_TRAP_ID_ARPBC = 0x50, MLXSW_TRAP_ID_ARPUC = 0x51, MLXSW_TRAP_ID_MTUERROR = 0x52, diff --git a/include/net/psample.h b/include/net/psample.h new file mode 100644 index 000000000000..8888b0e1a82e --- /dev/null +++ b/include/net/psample.h @@ -0,0 +1,36 @@ +#ifndef __NET_PSAMPLE_H +#define __NET_PSAMPLE_H + +#include +#include +#include + +struct psample_group { + struct list_head list; + struct net *net; + u32 group_num; + u32 refcount; + u32 seq; +}; + +struct psample_group *psample_group_get(struct net *net, u32 group_num); +void psample_group_put(struct psample_group *group); + +#if IS_ENABLED(CONFIG_PSAMPLE) + +void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, + u32 trunc_size, int in_ifindex, int out_ifindex, + u32 sample_rate); + +#else + +static inline void psample_sample_packet(struct psample_group *group, + struct sk_buff *skb, u32 trunc_size, + int in_ifindex, int out_ifindex, + u32 sample_rate) +{ +} + +#endif + +#endif /* __NET_PSAMPLE_H */ diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h new file mode 100644 index 000000000000..89e9305be880 --- /dev/null +++ b/include/net/tc_act/tc_sample.h @@ -0,0 +1,50 @@ +#ifndef __NET_TC_SAMPLE_H +#define __NET_TC_SAMPLE_H + +#include +#include +#include + +struct tcf_sample { + struct tc_action common; + u32 rate; + bool truncate; + u32 trunc_size; + struct psample_group __rcu *psample_group; + u32 psample_group_num; + struct list_head tcfm_list; + struct rcu_head rcu; +}; +#define to_sample(a) ((struct tcf_sample *)a) + +static inline bool is_tcf_sample(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + return a->ops && a->ops->type == TCA_ACT_SAMPLE; +#else + return false; +#endif +} + +static inline __u32 tcf_sample_rate(const struct tc_action *a) +{ + return to_sample(a)->rate; +} + +static inline bool tcf_sample_truncate(const struct tc_action *a) +{ + return to_sample(a)->truncate; +} + +static inline int tcf_sample_trunc_size(const struct tc_action *a) +{ + return to_sample(a)->trunc_size; +} + +static inline struct psample_group * +tcf_sample_psample_group(const struct tc_action *a) +{ + return rcu_dereference(to_sample(a)->psample_group); +} + +#endif /* __NET_TC_SAMPLE_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index e600b50be77e..80ad741a42fa 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -305,6 +305,7 @@ header-y += netrom.h header-y += net_namespace.h header-y += net_tstamp.h header-y += nfc.h +header-y += psample.h header-y += nfs2.h header-y += nfs3.h header-y += nfs4.h diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h new file mode 100644 index 000000000000..ed48996ec0e8 --- /dev/null +++ b/include/uapi/linux/psample.h @@ -0,0 +1,35 @@ +#ifndef __UAPI_PSAMPLE_H +#define __UAPI_PSAMPLE_H + +enum { + /* sampled packet metadata */ + PSAMPLE_ATTR_IIFINDEX, + PSAMPLE_ATTR_OIFINDEX, + PSAMPLE_ATTR_ORIGSIZE, + PSAMPLE_ATTR_SAMPLE_GROUP, + PSAMPLE_ATTR_GROUP_SEQ, + PSAMPLE_ATTR_SAMPLE_RATE, + PSAMPLE_ATTR_DATA, + + /* commands attributes */ + PSAMPLE_ATTR_GROUP_REFCOUNT, + + __PSAMPLE_ATTR_MAX +}; + +enum psample_command { + PSAMPLE_CMD_SAMPLE, + PSAMPLE_CMD_GET_GROUP, + PSAMPLE_CMD_NEW_GROUP, + PSAMPLE_CMD_DEL_GROUP, +}; + +/* Can be overridden at runtime by module option */ +#define PSAMPLE_ATTR_MAX (__PSAMPLE_ATTR_MAX - 1) + +#define PSAMPLE_NL_MCGRP_CONFIG_NAME "config" +#define PSAMPLE_NL_MCGRP_SAMPLE_NAME "packets" +#define PSAMPLE_GENL_NAME "psample" +#define PSAMPLE_GENL_VERSION 1 + +#endif diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index e3db7403296f..ba62ddf0e58a 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -4,6 +4,7 @@ header-y += tc_defact.h header-y += tc_gact.h header-y += tc_ipt.h header-y += tc_mirred.h +header-y += tc_sample.h header-y += tc_nat.h header-y += tc_pedit.h header-y += tc_skbedit.h diff --git a/include/uapi/linux/tc_act/tc_sample.h b/include/uapi/linux/tc_act/tc_sample.h new file mode 100644 index 000000000000..edc9058bb30d --- /dev/null +++ b/include/uapi/linux/tc_act/tc_sample.h @@ -0,0 +1,26 @@ +#ifndef __LINUX_TC_SAMPLE_H +#define __LINUX_TC_SAMPLE_H + +#include +#include +#include + +#define TCA_ACT_SAMPLE 26 + +struct tc_sample { + tc_gen; +}; + +enum { + TCA_SAMPLE_UNSPEC, + TCA_SAMPLE_TM, + TCA_SAMPLE_PARMS, + TCA_SAMPLE_RATE, + TCA_SAMPLE_TRUNC_SIZE, + TCA_SAMPLE_PSAMPLE_GROUP, + TCA_SAMPLE_PAD, + __TCA_SAMPLE_MAX +}; +#define TCA_SAMPLE_MAX (__TCA_SAMPLE_MAX - 1) + +#endif diff --git a/net/Kconfig b/net/Kconfig index 92ae1500d9e1..ce4aee69fc0d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -390,6 +390,7 @@ source "net/9p/Kconfig" source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +source "net/psample/Kconfig" config LWTUNNEL bool "Network light weight tunnels" diff --git a/net/Makefile b/net/Makefile index 5d6e0e5ff7f8..7d41de48310e 100644 --- a/net/Makefile +++ b/net/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ +obj-$(CONFIG_PSAMPLE) += psample/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ obj-$(CONFIG_MPLS) += mpls/ diff --git a/net/psample/Kconfig b/net/psample/Kconfig new file mode 100644 index 000000000000..d850246a6059 --- /dev/null +++ b/net/psample/Kconfig @@ -0,0 +1,15 @@ +# +# psample packet sampling configuration +# + +menuconfig PSAMPLE + depends on NET + tristate "Packet-sampling netlink channel" + default n + help + Say Y here to add support for packet-sampling netlink channel + This netlink channel allows transferring packets alongside some + metadata to userspace. + + To compile this support as a module, choose M here: the module will + be called psample. diff --git a/net/psample/Makefile b/net/psample/Makefile new file mode 100644 index 000000000000..609b0a79c9f3 --- /dev/null +++ b/net/psample/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the psample netlink channel +# + +obj-$(CONFIG_PSAMPLE) += psample.o diff --git a/net/psample/psample.c b/net/psample/psample.c new file mode 100644 index 000000000000..8aa58a918783 --- /dev/null +++ b/net/psample/psample.c @@ -0,0 +1,301 @@ +/* + * net/psample/psample.c - Netlink channel for packet sampling + * Copyright (c) 2017 Yotam Gigi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PSAMPLE_MAX_PACKET_SIZE 0xffff + +static LIST_HEAD(psample_groups_list); +static DEFINE_SPINLOCK(psample_groups_lock); + +/* multicast groups */ +enum psample_nl_multicast_groups { + PSAMPLE_NL_MCGRP_CONFIG, + PSAMPLE_NL_MCGRP_SAMPLE, +}; + +static const struct genl_multicast_group psample_nl_mcgrps[] = { + [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, + [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME }, +}; + +static struct genl_family psample_nl_family __ro_after_init; + +static int psample_group_nl_fill(struct sk_buff *msg, + struct psample_group *group, + enum psample_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int ret; + + hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); + if (ret < 0) + goto error; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount); + if (ret < 0) + goto error; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq); + if (ret < 0) + goto error; + + genlmsg_end(msg, hdr); + return 0; + +error: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct psample_group *group; + int start = cb->args[0]; + int idx = 0; + int err; + + spin_lock(&psample_groups_lock); + list_for_each_entry(group, &psample_groups_list, list) { + if (!net_eq(group->net, sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + break; + idx++; + } + + spin_unlock(&psample_groups_lock); + cb->args[0] = idx; + return msg->len; +} + +static const struct genl_ops psample_nl_ops[] = { + { + .cmd = PSAMPLE_CMD_GET_GROUP, + .dumpit = psample_nl_cmd_get_group_dumpit, + /* can be retrieved by unprivileged users */ + } +}; + +static struct genl_family psample_nl_family __ro_after_init = { + .name = PSAMPLE_GENL_NAME, + .version = PSAMPLE_GENL_VERSION, + .maxattr = PSAMPLE_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .mcgrps = psample_nl_mcgrps, + .ops = psample_nl_ops, + .n_ops = ARRAY_SIZE(psample_nl_ops), + .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps), +}; + +static void psample_group_notify(struct psample_group *group, + enum psample_command cmd) +{ + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return; + + err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI); + if (!err) + genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0, + PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC); + else + nlmsg_free(msg); +} + +static struct psample_group *psample_group_create(struct net *net, + u32 group_num) +{ + struct psample_group *group; + + group = kzalloc(sizeof(*group), GFP_ATOMIC); + if (!group) + return NULL; + + group->net = net; + group->group_num = group_num; + list_add_tail(&group->list, &psample_groups_list); + + psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP); + return group; +} + +static void psample_group_destroy(struct psample_group *group) +{ + psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP); + list_del(&group->list); + kfree(group); +} + +static struct psample_group * +psample_group_lookup(struct net *net, u32 group_num) +{ + struct psample_group *group; + + list_for_each_entry(group, &psample_groups_list, list) + if ((group->group_num == group_num) && (group->net == net)) + return group; + return NULL; +} + +struct psample_group *psample_group_get(struct net *net, u32 group_num) +{ + struct psample_group *group; + + spin_lock(&psample_groups_lock); + + group = psample_group_lookup(net, group_num); + if (!group) { + group = psample_group_create(net, group_num); + if (!group) + goto out; + } + group->refcount++; + +out: + spin_unlock(&psample_groups_lock); + return group; +} +EXPORT_SYMBOL_GPL(psample_group_get); + +void psample_group_put(struct psample_group *group) +{ + spin_lock(&psample_groups_lock); + + if (--group->refcount == 0) + psample_group_destroy(group); + + spin_unlock(&psample_groups_lock); +} +EXPORT_SYMBOL_GPL(psample_group_put); + +void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, + u32 trunc_size, int in_ifindex, int out_ifindex, + u32 sample_rate) +{ + struct sk_buff *nl_skb; + int data_len; + int meta_len; + void *data; + int ret; + + meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + + (out_ifindex ? nla_total_size(sizeof(u16)) : 0) + + nla_total_size(sizeof(u32)) + /* sample_rate */ + nla_total_size(sizeof(u32)) + /* orig_size */ + nla_total_size(sizeof(u32)) + /* group_num */ + nla_total_size(sizeof(u32)); /* seq */ + + data_len = min(skb->len, trunc_size); + if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) + data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN + - NLA_ALIGNTO; + + nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC); + if (unlikely(!nl_skb)) + return; + + data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0, + PSAMPLE_CMD_SAMPLE); + if (unlikely(!data)) + goto error; + + if (in_ifindex) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex); + if (unlikely(ret < 0)) + goto error; + } + + if (out_ifindex) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex); + if (unlikely(ret < 0)) + goto error; + } + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++); + if (unlikely(ret < 0)) + goto error; + + if (data_len) { + int nla_len = nla_total_size(data_len); + struct nlattr *nla; + + nla = (struct nlattr *)skb_put(nl_skb, nla_len); + nla->nla_type = PSAMPLE_ATTR_DATA; + nla->nla_len = nla_attr_size(data_len); + + if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) + goto error; + } + + genlmsg_end(nl_skb, data); + genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, + PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); + + return; +error: + pr_err_ratelimited("Could not create psample log message\n"); + nlmsg_free(nl_skb); +} +EXPORT_SYMBOL_GPL(psample_sample_packet); + +static int __init psample_module_init(void) +{ + return genl_register_family(&psample_nl_family); +} + +static void __exit psample_module_exit(void) +{ + genl_unregister_family(&psample_nl_family); +} + +module_init(psample_module_init); +module_exit(psample_module_exit); + +MODULE_AUTHOR("Yotam Gigi "); +MODULE_DESCRIPTION("netlink channel for packet sampling"); +MODULE_LICENSE("GPL v2"); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a9aa38d43fa7..72cfa3a6bac0 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -650,6 +650,18 @@ config NET_ACT_MIRRED To compile this code as a module, choose M here: the module will be called act_mirred. +config NET_ACT_SAMPLE + tristate "Traffic Sampling" + depends on NET_CLS_ACT + select PSAMPLE + ---help--- + Say Y here to allow packet sampling tc action. The packet sample + action consists of statistically choosing packets and sampling + them using the psample module. + + To compile this code as a module, choose M here: the + module will be called act_sample. + config NET_ACT_IPT tristate "IPtables targets" depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES diff --git a/net/sched/Makefile b/net/sched/Makefile index 4bdda3634e0b..7b915d226de7 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += act_police.o obj-$(CONFIG_NET_ACT_GACT) += act_gact.o obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o +obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c new file mode 100644 index 000000000000..39229756de07 --- /dev/null +++ b/net/sched/act_sample.c @@ -0,0 +1,274 @@ +/* + * net/sched/act_sample.c - Packet sampling tc action + * Copyright (c) 2017 Yotam Gigi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define SAMPLE_TAB_MASK 7 +static unsigned int sample_net_id; +static struct tc_action_ops act_sample_ops; + +static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { + [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) }, + [TCA_SAMPLE_RATE] = { .type = NLA_U32 }, + [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 }, + [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 }, +}; + +static int tcf_sample_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + struct nlattr *tb[TCA_SAMPLE_MAX + 1]; + struct psample_group *psample_group; + struct tc_sample *parm; + struct tcf_sample *s; + bool exists = false; + int ret; + + if (!nla) + return -EINVAL; + ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy); + if (ret < 0) + return ret; + if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] || + !tb[TCA_SAMPLE_PSAMPLE_GROUP]) + return -EINVAL; + + parm = nla_data(tb[TCA_SAMPLE_PARMS]); + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_sample_ops, bind, false); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + s = to_sample(*a); + + ASSERT_RTNL(); + s->tcf_action = parm->action; + s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); + s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); + psample_group = psample_group_get(net, s->psample_group_num); + if (!psample_group) + return -ENOMEM; + RCU_INIT_POINTER(s->psample_group, psample_group); + + if (tb[TCA_SAMPLE_TRUNC_SIZE]) { + s->truncate = true; + s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); + } + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + return ret; +} + +static void tcf_sample_cleanup_rcu(struct rcu_head *rcu) +{ + struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu); + struct psample_group *psample_group; + + psample_group = rcu_dereference_protected(s->psample_group, 1); + RCU_INIT_POINTER(s->psample_group, NULL); + psample_group_put(psample_group); +} + +static void tcf_sample_cleanup(struct tc_action *a, int bind) +{ + struct tcf_sample *s = to_sample(a); + + call_rcu(&s->rcu, tcf_sample_cleanup_rcu); +} + +static bool tcf_sample_dev_ok_push(struct net_device *dev) +{ + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + return false; + default: + return true; + } +} + +static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_sample *s = to_sample(a); + struct psample_group *psample_group; + int retval; + int size; + int iif; + int oif; + + tcf_lastuse_update(&s->tcf_tm); + bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); + retval = READ_ONCE(s->tcf_action); + + rcu_read_lock(); + psample_group = rcu_dereference(s->psample_group); + + /* randomly sample packets according to rate */ + if (psample_group && (prandom_u32() % s->rate == 0)) { + if (!skb_at_tc_ingress(skb)) { + iif = skb->skb_iif; + oif = skb->dev->ifindex; + } else { + iif = skb->dev->ifindex; + oif = 0; + } + + /* on ingress, the mac header gets popped, so push it back */ + if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) + skb_push(skb, skb->mac_len); + + size = s->truncate ? s->trunc_size : skb->len; + psample_sample_packet(psample_group, skb, size, iif, oif, + s->rate); + + if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) + skb_pull(skb, skb->mac_len); + } + + rcu_read_unlock(); + return retval; +} + +static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_sample *s = to_sample(a); + struct tc_sample opt = { + .index = s->tcf_index, + .action = s->tcf_action, + .refcnt = s->tcf_refcnt - ref, + .bindcnt = s->tcf_bindcnt - bind, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + tcf_tm_dump(&t, &s->tcf_tm); + if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate)) + goto nla_put_failure; + + if (s->truncate) + if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_sample_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_sample_ops = { + .kind = "sample", + .type = TCA_ACT_SAMPLE, + .owner = THIS_MODULE, + .act = tcf_sample_act, + .dump = tcf_sample_dump, + .init = tcf_sample_init, + .cleanup = tcf_sample_cleanup, + .walk = tcf_sample_walker, + .lookup = tcf_sample_search, + .size = sizeof(struct tcf_sample), +}; + +static __net_init int sample_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK); +} + +static void __net_exit sample_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations sample_net_ops = { + .init = sample_init_net, + .exit = sample_exit_net, + .id = &sample_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init sample_init_module(void) +{ + return tcf_register_action(&act_sample_ops, &sample_net_ops); +} + +static void __exit sample_cleanup_module(void) +{ + tcf_unregister_action(&act_sample_ops, &sample_net_ops); +} + +module_init(sample_init_module); +module_exit(sample_cleanup_module); + +MODULE_AUTHOR("Yotam Gigi "); +MODULE_DESCRIPTION("Packet sampling action"); +MODULE_LICENSE("GPL v2");