linux/drivers/net/ethernet/ti/cpsw_new.c

2131 lines
52 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Texas Instruments Ethernet Switch Driver
*
* Copyright (C) 2019 Texas Instruments
*/
#include <linux/io.h>
#include <linux/clk.h>
#include <linux/platform_device.h>
#include <linux/timer.h>
#include <linux/module.h>
#include <linux/irqreturn.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/etherdevice.h>
#include <linux/net_tstamp.h>
#include <linux/phy.h>
#include <linux/phy/phy.h>
#include <linux/delay.h>
#include <linux/pinctrl/consumer.h>
#include <linux/pm_runtime.h>
#include <linux/gpio/consumer.h>
#include <linux/of.h>
#include <linux/of_mdio.h>
#include <linux/of_net.h>
#include <linux/of_platform.h>
#include <linux/if_vlan.h>
#include <linux/kmemleak.h>
#include <linux/sys_soc.h>
net: make switchdev_bridge_port_{,unoffload} loosely coupled with the bridge With the introduction of explicit offloading API in switchdev in commit 2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge ports are offloaded"), we started having Ethernet switch drivers calling directly into a function exported by net/bridge/br_switchdev.c, which is a function exported by the bridge driver. This means that drivers that did not have an explicit dependency on the bridge before, like cpsw and am65-cpsw, now do - otherwise it is not possible to call a symbol exported by a driver that can be built as module unless you are a module too. There was an attempt to solve the dependency issue in the form of commit b0e81817629a ("net: build all switchdev drivers as modules when the bridge is a module"). Grygorii Strashko, however, says about it: | In my opinion, the problem is a bit bigger here than just fixing the | build :( | | In case, of ^cpsw the switchdev mode is kinda optional and in many | cases (especially for testing purposes, NFS) the multi-mac mode is | still preferable mode. | | There were no such tight dependency between switchdev drivers and | bridge core before and switchdev serviced as independent, notification | based layer between them, so ^cpsw still can be "Y" and bridge can be | "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE | will need to be set as "Y", or we will have to update drivers to | support build with BRIDGE=n and maintain separate builds for | networking vs non-networking testing. But is this enough? Wouldn't | it cause 'chain reaction' required to add more and more "Y" options | (like CONFIG_VLAN_8021Q)? | | PS. Just to be sure we on the same page - ARM builds will be forced | (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our | automation testing will just fail with omap2plus_defconfig. 
In the light of this, it would be desirable for some configurations to avoid dependencies between switchdev drivers and the bridge, and have the switchdev mode as completely optional within the driver. Arnd Bergmann also tried to write a patch which better expressed the build time dependency for Ethernet switch drivers where the switchdev support is optional, like cpsw/am65-cpsw, and this made the drivers follow the bridge (compile as module if the bridge is a module) only if the optional switchdev support in the driver was enabled in the first place: https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/ but this still did not solve the fact that cpsw and am65-cpsw now must be built as modules when the bridge is a module - it just expressed correctly that optional dependency. But the new behavior is an apparent regression from Grygorii's perspective. So to support the use case where the Ethernet driver is built-in, NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we need a framework that can handle the possible absence of the bridge from the running system, i.e. runtime bloatware as opposed to build-time bloatware. Luckily we already have this framework, since switchdev has been using it extensively. Events from the bridge side are transmitted to the driver side using notifier chains - this was originally done so that unrelated drivers could snoop for events emitted by the bridge towards ports that are implemented by other drivers (think of a switch driver with LAG offload that listens for switchdev events on a bonding/team interface that it offloads). There are also events which are transmitted from the driver side to the bridge side, which again are modeled using notifiers. SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with notifying the bridge that a MAC address has been dynamically learned. So there is a precedent we can use for modeling the new framework. 
The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work that the bridge needs to do when a port becomes offloaded is blocking in its nature: replay VLANs, MDBs etc. The calling context is indeed blocking (we are under rtnl_mutex), but the existing switchdev notification chain that the bridge is subscribed to is only the atomic one. So we need to subscribe the bridge to the blocking switchdev notification chain too. This patch: - keeps the driver-side perception of the switchdev_bridge_port_{,un}offload unchanged - moves the implementation of switchdev_bridge_port_{,un}offload from the bridge module into the switchdev module. - makes everybody that is subscribed to the switchdev blocking notifier chain "hear" offload & unoffload events - makes the bridge driver subscribe and handle those events - moves the bridge driver's handling of those events into 2 new functions called br_switchdev_port_{,un}offload. These functions contain in fact the core of the logic that was previously in switchdev_bridge_port_{,un}offload, just that now we go through an extra indirection layer to reach them. Unlike all the other switchdev notification structures, the structure used to carry the bridge port information, struct switchdev_notifier_brport_info, does not contain a "bool handled". This is because in the current usage pattern, we always know that a switchdev bridge port offloading event will be handled by the bridge, because the switchdev_bridge_port_offload() call was initiated by a NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event couldn't have happened. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-03 23:34:08 +03:00
#include <net/switchdev.h>
#include <net/page_pool/helpers.h>
#include <net/pkt_cls.h>
#include <net/devlink.h>
#include "cpsw.h"
#include "cpsw_ale.h"
#include "cpsw_priv.h"
#include "cpsw_sl.h"
#include "cpsw_switchdev.h"
#include "cpts.h"
#include "davinci_cpdma.h"
#include <net/pkt_sched.h>
/* File-scope driver tunables with their built-in defaults.
 * NOTE(review): presumably exposed as module parameters further down in
 * the file (not visible in this part) - confirm before relying on it.
 */
static int debug_level;
static int ale_ageout = CPSW_ALE_AGEOUT_DEFAULT;
static int rx_packet_max = CPSW_MAX_PACKET_SIZE;
static int descs_pool_size = CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT;
/* Thin wrapper that only carries a pointer to the shared struct cpsw_common.
 * NOTE(review): judging by the name this is the devlink private area -
 * confirm against the devlink allocation code.
 */
struct cpsw_devlink {
struct cpsw_common *cpsw;
};
/* Driver-specific devlink parameter IDs. Numbering starts at
 * DEVLINK_PARAM_GENERIC_ID_MAX, so the two driver parameters that follow
 * the base get the next consecutive values.
 */
enum cpsw_devlink_param_id {
CPSW_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
CPSW_DL_PARAM_SWITCH_MODE,
CPSW_DL_PARAM_ALE_BYPASS,
};
/* Translate a port's private data into a zero-based slave index.
 *
 * @cpsw is not used; the two-argument signature is kept for compatibility
 * with the old driver.
 *
 * Returns -1 when @priv describes the host port, emac_port - 1 otherwise.
 */
static int cpsw_slave_index_priv(struct cpsw_common *cpsw,
				 struct cpsw_priv *priv)
{
	return (priv->emac_port == HOST_PORT_NUM) ? -1 : priv->emac_port - 1;
}
/* Report switch mode, which is simply "dual-EMAC mode is not selected". */
static bool cpsw_is_switch_en(struct cpsw_common *cpsw)
{
	const bool dual_mac = cpsw->data.dual_emac;

	return !dual_mac;
}
/* Turn host-port unknown-unicast flooding on or off for promiscuous mode.
 *
 * Does nothing in switch mode. The flood control lives on the host port
 * and is therefore shared by all interfaces: a request to disable is
 * overridden while any slave netdev still has IFF_PROMISC set.
 */
static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
{
	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
	bool any_promisc = false;
	int idx;

	if (cpsw_is_switch_en(cpsw))
		return;

	/* Look for any interface that still wants promiscuous mode */
	for (idx = 0; idx < cpsw->data.slaves; idx++) {
		struct net_device *port_ndev = cpsw->slaves[idx].ndev;

		if (port_ndev && (port_ndev->flags & IFF_PROMISC))
			any_promisc = true;
	}

	if (any_promisc && !enable) {
		enable = true;
		dev_dbg(cpsw->dev, "promiscuity not disabled as the other interface is still in promiscuity mode\n");
	}

	if (!enable) {
		/* Stop flooding unknown unicast to the host port */
		cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM,
				     ALE_P0_UNI_FLOOD, 0);
		dev_dbg(cpsw->dev, "promiscuity disabled\n");
		return;
	}

	/* Flood unknown unicast to the host port */
	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_P0_UNI_FLOOD, 1);
	dev_dbg(cpsw->dev, "promiscuity enabled\n");
}
/**
 * cpsw_set_mc - add or delete one host-port multicast ALE entry
 * @ndev: device to sync
 * @addr: multicast address to add or delete
 * @vid: vlan id; a negative value selects the port VLAN of the slave
 *	 that belongs to @ndev
 * @add: non-zero adds the address, zero removes it
 *
 * The entry is VLAN-qualified (ALE_VLAN) whenever the resulting vid is
 * non-zero.
 *
 * Return: the result of cpsw_ale_add_mcast() or cpsw_ale_del_mcast().
 */
static int cpsw_set_mc(struct net_device *ndev, const u8 *addr,
		       int vid, int add)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	struct cpsw_common *cpsw = priv->cpsw;
	int slave_no = cpsw_slave_index(cpsw, priv);
	int flags;

	if (vid < 0)
		vid = cpsw->slaves[slave_no].port_vlan;

	flags = vid ? ALE_VLAN : 0;

	if (!add)
		return cpsw_ale_del_mcast(cpsw->ale, addr, 0, flags, vid);

	return cpsw_ale_add_mcast(cpsw->ale, addr, ALE_PORT_HOST, flags,
				  vid, 0);
}
/* vlan_for_each() callback used while adding or flushing one multicast
 * address (@ctx is a struct addr_sync_ctx).
 *
 * A VLAN device counts as a user of the address when it is up and its
 * multicast list holds the address with a non-zero sync_cnt; each such
 * device bumps ctx->consumed.
 *
 * In flush mode the address is removed from VLANs that do not use it and
 * 0 is returned. Otherwise the address is added for VLANs that do use it
 * and the result of that add is returned.
 */
static int cpsw_update_vlan_mc(struct net_device *vdev, int vid, void *ctx)
{
	struct addr_sync_ctx *sync = ctx;
	struct netdev_hw_addr *entry;
	int synced = 0;

	if (!vdev || !(vdev->flags & IFF_UP))
		return 0;

	netdev_for_each_mc_addr(entry, vdev) {
		if (!ether_addr_equal(entry->addr, sync->addr))
			continue;

		synced = entry->sync_cnt;
		break;
	}

	if (synced)
		sync->consumed++;

	if (sync->flush) {
		if (!synced)
			cpsw_set_mc(sync->ndev, sync->addr, vid, 0);
		return 0;
	}

	return synced ? cpsw_set_mc(sync->ndev, sync->addr, vid, 1) : 0;
}
static int cpsw_add_mc_addr(struct net_device *ndev, const u8 *addr, int num)
{
struct addr_sync_ctx sync_ctx;
int ret;
sync_ctx.consumed = 0;
sync_ctx.addr = addr;
sync_ctx.ndev = ndev;
sync_ctx.flush = 0;
ret = vlan_for_each(ndev, cpsw_update_vlan_mc, &sync_ctx);
if (sync_ctx.consumed < num && !ret)
ret = cpsw_set_mc(ndev, addr, -1, 1);
return ret;
}
static int cpsw_del_mc_addr(struct net_device *ndev, const u8 *addr, int num)
{
struct addr_sync_ctx sync_ctx;
sync_ctx.consumed = 0;
sync_ctx.addr = addr;
sync_ctx.ndev = ndev;
sync_ctx.flush = 1;
vlan_for_each(ndev, cpsw_update_vlan_mc, &sync_ctx);
if (sync_ctx.consumed == num)
cpsw_set_mc(ndev, addr, -1, 0);
return 0;
}
/* vlan_for_each() callback used when purging one multicast address
 * (@ctx is a struct addr_sync_ctx).
 *
 * For an up VLAN device that holds the address with a non-zero sync_cnt,
 * count it in ctx->consumed and delete the address for that vid.
 * Always returns 0 so the walk continues.
 */
static int cpsw_purge_vlan_mc(struct net_device *vdev, int vid, void *ctx)
{
	struct addr_sync_ctx *sync = ctx;
	struct netdev_hw_addr *entry;
	int synced = 0;

	if (!vdev || !(vdev->flags & IFF_UP))
		return 0;

	netdev_for_each_mc_addr(entry, vdev) {
		if (!ether_addr_equal(entry->addr, sync->addr))
			continue;

		synced = entry->sync_cnt;
		break;
	}

	if (synced) {
		sync->consumed++;
		cpsw_set_mc(sync->ndev, sync->addr, vid, 0);
	}

	return 0;
}
static int cpsw_purge_all_mc(struct net_device *ndev, const u8 *addr, int num)
{
struct addr_sync_ctx sync_ctx;
sync_ctx.addr = addr;
sync_ctx.ndev = ndev;
sync_ctx.consumed = 0;
vlan_for_each(ndev, cpsw_purge_vlan_mc, &sync_ctx);
if (sync_ctx.consumed < num)
cpsw_set_mc(ndev, addr, -1, 0);
return 0;
}
static void cpsw_ndo_set_rx_mode(struct net_device *ndev)
{
struct cpsw_priv *priv = netdev_priv(ndev);
struct cpsw_common *cpsw = priv->cpsw;
if (ndev->flags & IFF_PROMISC) {
/* Enable promiscuous mode */
cpsw_set_promiscious(ndev, true);
cpsw_ale_set_allmulti(cpsw->ale, IFF_ALLMULTI, priv->emac_port);
return;
}
/* Disable promiscuous mode */
cpsw_set_promiscious(ndev, false);
/* Restore allmulti on vlans if necessary */
cpsw_ale_set_allmulti(cpsw->ale,
ndev->flags & IFF_ALLMULTI, priv->emac_port);
/* add/remove mcast address either for real netdev or for vlan */
__hw_addr_ref_sync_dev(&ndev->mc, ndev, cpsw_add_mc_addr,
cpsw_del_mc_addr);
}
/* Buffer size needed for a packet of @len bytes: the payload plus
 * CPSW_HEADROOM_NA plus an aligned struct skb_shared_info, with the sum
 * rounded up by SKB_DATA_ALIGN().
 */
static unsigned int cpsw_rxbuf_total_len(unsigned int len)
{
	unsigned int raw = len + CPSW_HEADROOM_NA +
			   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	return SKB_DATA_ALIGN(raw);
}
/* RX completion callback for one descriptor.
 *
 * @token:  the struct page that backed the completed descriptor
 * @len:    received length
 * @status: completion status; a negative value takes the teardown path,
 *          otherwise it carries the source port and the
 *          CPDMA_RX_VLAN_ENCAP flag
 *
 * The page is run through XDP (if a program is attached), then turned
 * into an skb and handed to the stack. In every path except teardown a
 * page is re-submitted to the RX channel - a freshly allocated one on
 * success, the same one when the packet has to be dropped - so the number
 * of queued RX descriptors does not shrink.
 */
static void cpsw_rx_handler(void *token, int len, int status)
{
	struct page *new_page, *page = token;
	void *pa = page_address(page);
	int headroom = CPSW_HEADROOM_NA;
	struct cpsw_meta_xdp *xmeta;
	struct cpsw_common *cpsw;
	struct net_device *ndev;
	int port, ch, pkt_size;
	struct cpsw_priv *priv;
	struct page_pool *pool;
	struct sk_buff *skb;
	struct xdp_buff xdp;
	int ret = 0;
	dma_addr_t dma;

	/* Metadata stored in the page when it was queued (see requeue:) */
	xmeta = pa + CPSW_XMETA_OFFSET;
	cpsw = ndev_to_cpsw(xmeta->ndev);
	ndev = xmeta->ndev;
	pkt_size = cpsw->rx_packet_max;
	ch = xmeta->ch;

	/* A non-zero source port selects the netdev of that slave */
	if (status >= 0) {
		port = CPDMA_RX_SOURCE_PORT(status);
		if (port)
			ndev = cpsw->slaves[--port].ndev;
	}

	priv = netdev_priv(ndev);
	pool = cpsw->page_pool[ch];

	if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
		/* In dual emac mode check for all interfaces */
		if (cpsw->usage_count && status >= 0) {
			/* The packet received is for the interface which
			 * is already down and the other interface is up
			 * and running, instead of freeing which results
			 * in reducing of the number of rx descriptor in
			 * DMA engine, requeue page back to cpdma.
			 */
			new_page = page;
			goto requeue;
		}

		/* the interface is going down, pages are purged */
		page_pool_recycle_direct(pool, page);
		return;
	}

	new_page = page_pool_dev_alloc_pages(pool);
	if (unlikely(!new_page)) {
		/* No replacement page: drop the packet and reuse this one */
		new_page = page;
		ndev->stats.rx_dropped++;
		goto requeue;
	}

	if (priv->xdp_prog) {
		int size = len;

		xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]);
		if (status & CPDMA_RX_VLAN_ENCAP) {
			/* Skip the encapsulation header for the XDP view */
			headroom += CPSW_RX_VLAN_ENCAP_HDR_SIZE;
			size -= CPSW_RX_VLAN_ENCAP_HDR_SIZE;
		}

		xdp_prepare_buff(&xdp, pa, headroom, size, false);

		ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port, &len);
		if (ret != CPSW_XDP_PASS)
			goto requeue;

		/* The program may have moved the start of the packet */
		headroom = xdp.data - xdp.data_hard_start;

		/* XDP prog can modify vlan tag, so can't use encap header */
		status &= ~CPDMA_RX_VLAN_ENCAP;
	}

	/* pass skb to netstack if no XDP prog or returned XDP_PASS */
	skb = build_skb(pa, cpsw_rxbuf_total_len(pkt_size));
	if (!skb) {
		/* Old page goes back to the pool; new_page is queued below */
		ndev->stats.rx_dropped++;
		page_pool_recycle_direct(pool, page);
		goto requeue;
	}

	skb->offload_fwd_mark = priv->offload_fwd_mark;
	skb_reserve(skb, headroom);
	skb_put(skb, len);
	skb->dev = ndev;
	if (status & CPDMA_RX_VLAN_ENCAP)
		cpsw_rx_vlan_encap(skb);
	if (priv->rx_ts_enabled)
		cpts_rx_timestamp(cpsw->cpts, skb);
	skb->protocol = eth_type_trans(skb, ndev);

	/* mark skb for recycling */
	skb_mark_for_recycle(skb);
	netif_receive_skb(skb);

	ndev->stats.rx_bytes += len;
	ndev->stats.rx_packets++;

requeue:
	/* Stamp the page with its netdev/channel and hand it back to cpdma */
	xmeta = page_address(new_page) + CPSW_XMETA_OFFSET;
	xmeta->ndev = ndev;
	xmeta->ch = ch;

	dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM_NA;
	ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
				       pkt_size, 0);
	if (ret < 0) {
		WARN_ON(ret == -ENOMEM);
		page_pool_recycle_direct(pool, new_page);
	}
}
/* Install the ALE entries for VLAN @vid on this port: the VLAN itself
 * (members: this port and the host port), the port's unicast MAC and the
 * broadcast address towards the host port.
 *
 * Unregistered multicast is forwarded to the host port only when the
 * netdev has IFF_ALLMULTI set.
 *
 * Returns 0 on success. On failure the entries added so far are removed
 * again and the error is returned.
 */
static int cpsw_add_vlan_ale_entry(struct cpsw_priv *priv,
				   unsigned short vid)
{
	struct cpsw_common *cpsw = priv->cpsw;
	const int host_mcast = ALE_PORT_HOST;
	u32 members = (1 << priv->emac_port) | ALE_PORT_HOST;
	int unreg_mcast;
	int err;

	unreg_mcast = (priv->ndev->flags & IFF_ALLMULTI) ? host_mcast : 0;

	err = cpsw_ale_add_vlan(cpsw->ale, vid, members, 0, members,
				unreg_mcast);
	if (err)
		return err;

	err = cpsw_ale_add_ucast(cpsw->ale, priv->mac_addr,
				 HOST_PORT_NUM, ALE_VLAN, vid);
	if (err)
		goto undo_vlan;

	err = cpsw_ale_add_mcast(cpsw->ale, priv->ndev->broadcast,
				 host_mcast, ALE_VLAN, vid, 0);
	if (!err)
		return 0;

	/* broadcast entry failed: roll back unicast, then the VLAN */
	cpsw_ale_del_ucast(cpsw->ale, priv->mac_addr,
			   HOST_PORT_NUM, ALE_VLAN, vid);
undo_vlan:
	cpsw_ale_del_vlan(cpsw->ale, vid, 0);
	return err;
}
/* .ndo_vlan_rx_add_vid: add VLAN @vid to the hardware filter.
 *
 * Nothing is done (0 returned) in switch mode or for the default VLAN.
 * A vid that equals the port VLAN of any slave with a netdev is refused
 * with -EINVAL. The device is runtime-resumed for the duration of the
 * ALE update. @proto is not used.
 */
static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
				    __be16 proto, u16 vid)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	struct cpsw_common *cpsw = priv->cpsw;
	int idx, err;

	if (cpsw_is_switch_en(cpsw)) {
		dev_dbg(cpsw->dev, ".ndo_vlan_rx_add_vid called in switch mode\n");
		return 0;
	}

	if (vid == cpsw->data.default_vlan)
		return 0;

	err = pm_runtime_resume_and_get(cpsw->dev);
	if (err < 0)
		return err;

	/* In dual EMAC, reserved VLAN id should not be used for
	 * creating VLAN interfaces as this can break the dual
	 * EMAC port separation
	 */
	for (idx = 0; idx < cpsw->data.slaves; idx++) {
		struct cpsw_slave *slave = &cpsw->slaves[idx];

		if (!slave->ndev || vid != slave->port_vlan)
			continue;

		err = -EINVAL;
		goto out_put;
	}

	dev_dbg(priv->dev, "Adding vlanid %d to vlan filter\n", vid);
	err = cpsw_add_vlan_ale_entry(priv, vid);

out_put:
	pm_runtime_put(cpsw->dev);
	return err;
}
/* vlan_for_each() callback: re-add one VLAN for the port passed in @arg.
 * Entries without a device or with vid 0 are skipped. The result of the
 * add is deliberately not propagated; the walk always continues.
 */
static int cpsw_restore_vlans(struct net_device *vdev, int vid, void *arg)
{
	struct cpsw_priv *priv = arg;

	if (vdev && vid)
		cpsw_ndo_vlan_rx_add_vid(priv->ndev, 0, vid);

	return 0;
}
/* Re-apply per-port configuration after a port reset: VLAN filter
 * entries first, then the MQPRIO, CBS and flower offloads.
 */
static void cpsw_restore(struct cpsw_priv *priv)
{
	struct cpsw_common *cpsw = priv->cpsw;
	struct cpsw_slave *slave = &cpsw->slaves[priv->emac_port - 1];

	/* VLANs stacked on this netdev */
	vlan_for_each(priv->ndev, cpsw_restore_vlans, priv);

	/* traffic-shaping and classifier offloads */
	cpsw_mqprio_resume(slave, priv);
	cpsw_cbs_resume(slave, priv);
	cpsw_qos_clsflower_resume(priv);
}
/* Add a supervisory (ALE_SUPER) multicast entry for 01:80:c2:00:00:00
 * towards the host port, in the ALE_MCAST_BLOCK_LEARN_FWD state.
 * NOTE(review): the variable name indicates this is the STP group
 * address - confirm.
 *
 * The address is kept as unsigned bytes: 0x80 and 0xc2 do not fit a
 * signed char, and the other callers in this file hand
 * cpsw_ale_add_mcast() a const u8 pointer.
 */
static void cpsw_init_stp_ale_entry(struct cpsw_common *cpsw)
{
	static const unsigned char stpa[] = {
		0x01, 0x80, 0xc2, 0x00, 0x00, 0x00
	};

	cpsw_ale_add_mcast(cpsw->ale, stpa,
			   ALE_PORT_HOST, ALE_SUPER, 0,
			   ALE_MCAST_BLOCK_LEARN_FWD);
}
/* Host-port setup for switch mode: normal FIFO mode, default port VLAN
 * with all ports as members, the STP entry, unknown-unicast flooding to
 * the host port enabled and learning on the host port enabled.
 */
static void cpsw_init_host_port_switch(struct cpsw_common *cpsw)
{
	const int def_vid = cpsw->data.default_vlan;

	writel(CPSW_FIFO_NORMAL_MODE, &cpsw->host_port_regs->tx_in_ctl);
	writel(def_vid, &cpsw->host_port_regs->port_vlan);

	/* The last mask (ports 1 and 2 only) leaves out the host port */
	cpsw_ale_add_vlan(cpsw->ale, def_vid, ALE_ALL_PORTS, ALE_ALL_PORTS,
			  ALE_ALL_PORTS, ALE_PORT_1 | ALE_PORT_2);

	cpsw_init_stp_ale_entry(cpsw);

	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_P0_UNI_FLOOD, 1);
	dev_dbg(cpsw->dev, "Set P0_UNI_FLOOD\n");

	/* NOLEARN cleared: the host port takes part in address learning */
	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_PORT_NOLEARN, 0);
}
/* Host-port setup for dual-MAC mode: dual-MAC FIFO mode, unknown-unicast
 * flooding to the host port off, default port VLAN with all ports as
 * members, and learning on the host port disabled.
 */
static void cpsw_init_host_port_dual_mac(struct cpsw_common *cpsw)
{
	const int def_vid = cpsw->data.default_vlan;

	writel(CPSW_FIFO_DUAL_MAC_MODE, &cpsw->host_port_regs->tx_in_ctl);

	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_P0_UNI_FLOOD, 0);
	dev_dbg(cpsw->dev, "unset P0_UNI_FLOOD\n");

	writel(def_vid, &cpsw->host_port_regs->port_vlan);

	cpsw_ale_add_vlan(cpsw->ale, def_vid, ALE_ALL_PORTS, ALE_ALL_PORTS,
			  0, 0);

	/* address learning is pointless when the ports are separate MACs */
	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_PORT_NOLEARN, 1);
}
/* Reset the controller and bring up the host port.
 *
 * Performs a soft reset, starts the ALE, programs the global control,
 * priority-map, statistics and flow-control registers, applies the
 * mode-specific (switch or dual-MAC) host-port setup and finally puts the
 * host port into the forwarding state.
 */
static void cpsw_init_host_port(struct cpsw_priv *priv)
{
	struct cpsw_common *cpsw = priv->cpsw;
	u32 ctrl;

	/* soft reset the controller and initialize ale */
	soft_reset("cpsw", &cpsw->regs->soft_reset);
	cpsw_ale_start(cpsw->ale);

	/* Program the ALE VLAN-aware control with CPSW_ALE_VLAN_AWARE.
	 * NOTE(review): the previous comment here said "vlan unaware mode";
	 * the constant name and the CPSW_VLAN_AWARE bit set below suggest
	 * the opposite - confirm against the constant's value.
	 */
	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_VLAN_AWARE,
			     CPSW_ALE_VLAN_AWARE);

	ctrl = readl(&cpsw->regs->control);
	ctrl |= CPSW_VLAN_AWARE | CPSW_RX_VLAN_ENCAP;
	writel(ctrl, &cpsw->regs->control);

	/* setup host port priority mapping */
	writel_relaxed(CPDMA_TX_PRIORITY_MAP,
		       &cpsw->host_port_regs->cpdma_tx_pri_map);
	writel_relaxed(0, &cpsw->host_port_regs->cpdma_rx_chan_map);

	/* disable priority elevation */
	writel_relaxed(0, &cpsw->regs->ptype);

	/* enable statistics collection on all ports (mask 0x7) */
	writel_relaxed(0x7, &cpsw->regs->stat_port_en);

	/* Enable internal fifo flow control */
	writel(0x7, &cpsw->regs->flow_control);

	if (cpsw_is_switch_en(cpsw))
		cpsw_init_host_port_switch(cpsw);
	else
		cpsw_init_host_port_dual_mac(cpsw);

	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM,
			     ALE_PORT_STATE, ALE_PORT_STATE_FORWARD);
}
/* Default per-port ALE setup for dual-MAC mode.
 *
 * Writes the port VLAN register (offset depends on the CPSW version),
 * creates the port VLAN with this port and the host port as members,
 * adds the broadcast and a secure unicast entry for the port MAC, and
 * makes the port drop unknown VLANs and stop learning.
 */
static void cpsw_port_add_dual_emac_def_ale_entries(struct cpsw_priv *priv,
						    struct cpsw_slave *slave)
{
	struct cpsw_common *cpsw = priv->cpsw;
	u32 members = 1 << priv->emac_port | ALE_PORT_HOST;
	u32 vlan_reg;

	if (cpsw->version == CPSW_VERSION_1)
		vlan_reg = CPSW1_PORT_VLAN;
	else
		vlan_reg = CPSW2_PORT_VLAN;

	slave_write(slave, slave->port_vlan, vlan_reg);

	cpsw_ale_add_vlan(cpsw->ale, slave->port_vlan, members,
			  members, members, 0);
	cpsw_ale_add_mcast(cpsw->ale, priv->ndev->broadcast,
			   ALE_PORT_HOST, ALE_VLAN, slave->port_vlan,
			   ALE_MCAST_FWD);
	cpsw_ale_add_ucast(cpsw->ale, priv->mac_addr,
			   HOST_PORT_NUM, ALE_VLAN | ALE_SECURE,
			   slave->port_vlan);

	cpsw_ale_control_set(cpsw->ale, priv->emac_port,
			     ALE_PORT_DROP_UNKNOWN_VLAN, 1);

	/* learning make no sense in dual_mac mode */
	cpsw_ale_control_set(cpsw->ale, priv->emac_port,
			     ALE_PORT_NOLEARN, 1);
}
/* Default per-port ALE setup for switch mode.
 *
 * Lets the port accept unknown VLANs and learn addresses, disables
 * source-address updates, adds the broadcast entry (this port and the
 * host port) and the unicast entry for the port MAC, and writes the port
 * VLAN register (offset depends on the CPSW version).
 */
static void cpsw_port_add_switch_def_ale_entries(struct cpsw_priv *priv,
						 struct cpsw_slave *slave)
{
	struct cpsw_common *cpsw = priv->cpsw;
	u32 members = 1 << priv->emac_port | ALE_PORT_HOST;
	u32 vlan_reg;

	cpsw_ale_control_set(cpsw->ale, priv->emac_port,
			     ALE_PORT_DROP_UNKNOWN_VLAN, 0);
	cpsw_ale_control_set(cpsw->ale, priv->emac_port,
			     ALE_PORT_NOLEARN, 0);

	/* disabling SA_UPDATE required to make stp work, without this setting
	 * Host MAC addresses will jump between ports.
	 * As per TRM MAC address can be defined as unicast supervisory (super)
	 * by setting both (ALE_BLOCKED | ALE_SECURE) which should prevent
	 * SA_UPDATE, but HW seems works incorrectly and setting ALE_SECURE
	 * causes STP packets to be dropped due to ingress filter
	 *	if (source address found) and (secure) and
	 *	   (receive port number != port_number))
	 *	   then discard the packet
	 */
	cpsw_ale_control_set(cpsw->ale, priv->emac_port,
			     ALE_PORT_NO_SA_UPDATE, 1);

	cpsw_ale_add_mcast(cpsw->ale, priv->ndev->broadcast,
			   members, ALE_VLAN, slave->port_vlan,
			   ALE_MCAST_FWD_2);
	cpsw_ale_add_ucast(cpsw->ale, priv->mac_addr,
			   HOST_PORT_NUM, ALE_VLAN, slave->port_vlan);

	if (cpsw->version == CPSW_VERSION_1)
		vlan_reg = CPSW1_PORT_VLAN;
	else
		vlan_reg = CPSW2_PORT_VLAN;

	slave_write(slave, slave->port_vlan, vlan_reg);
}
/* PHY link-change callback (registered through of_phy_connect()).
 *
 * Link up: build the MAC control word from the PHY speed/duplex and the
 * configured pause settings, program it if it changed, put the port into
 * the ALE forwarding state and wake the TX queues.
 * Link down: stop the TX queues, disable the port in the ALE, wait for
 * the MAC to go idle and reset its control register.
 *
 * The link status is printed whenever the control word changes, and the
 * channel resources are re-split on link-up when cpsw_need_resplit()
 * asks for it. Does nothing when the slave has no PHY attached.
 */
static void cpsw_adjust_link(struct net_device *ndev)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	struct cpsw_common *cpsw = priv->cpsw;
	struct cpsw_slave *slave;
	struct phy_device *phy;
	u32 mac_control = 0;

	slave = &cpsw->slaves[priv->emac_port - 1];
	phy = slave->phy;

	if (!phy)
		return;

	if (phy->link) {
		mac_control = CPSW_SL_CTL_GMII_EN;

		if (phy->speed == 1000)
			mac_control |= CPSW_SL_CTL_GIG;
		if (phy->duplex)
			mac_control |= CPSW_SL_CTL_FULLDUPLEX;

		/* set speed_in input in case RMII mode is used in 100Mbps */
		if (phy->speed == 100)
			mac_control |= CPSW_SL_CTL_IFCTL_A;
		/* in band mode only works in 10Mbps RGMII mode */
		else if ((phy->speed == 10) && phy_interface_is_rgmii(phy))
			mac_control |= CPSW_SL_CTL_EXT_EN; /* In Band mode */

		if (priv->rx_pause)
			mac_control |= CPSW_SL_CTL_RX_FLOW_EN;

		if (priv->tx_pause)
			mac_control |= CPSW_SL_CTL_TX_FLOW_EN;

		/* Touch the hardware only when the control word changed */
		if (mac_control != slave->mac_control)
			cpsw_sl_ctl_set(slave->mac_sl, mac_control);

		/* enable forwarding */
		cpsw_ale_control_set(cpsw->ale, priv->emac_port,
				     ALE_PORT_STATE, ALE_PORT_STATE_FORWARD);

		netif_tx_wake_all_queues(ndev);

		/* Message now ends with a newline like the other log calls */
		if (priv->shp_cfg_speed &&
		    priv->shp_cfg_speed != slave->phy->speed &&
		    !cpsw_shp_is_off(priv))
			dev_warn(priv->dev, "Speed was changed, CBS shaper speeds are changed!\n");
	} else {
		netif_tx_stop_all_queues(ndev);

		mac_control = 0;
		/* disable forwarding */
		cpsw_ale_control_set(cpsw->ale, priv->emac_port,
				     ALE_PORT_STATE, ALE_PORT_STATE_DISABLE);

		cpsw_sl_wait_for_idle(slave->mac_sl, 100);

		cpsw_sl_ctl_reset(slave->mac_sl);
	}

	if (mac_control != slave->mac_control)
		phy_print_status(phy);

	/* Remember what was programmed for the next comparison */
	slave->mac_control = mac_control;

	if (phy->link && cpsw_need_resplit(cpsw))
		cpsw_split_res(cpsw);
}
/* Bring up one slave port.
 *
 * Resets the MAC, programs the priority maps and FIFO block sizes for the
 * detected CPSW version, sets the maximum RX length and the MAC address,
 * installs the mode-specific default ALE entries and finally connects and
 * starts the PHY. If no PHY can be connected an error is logged and the
 * function returns with slave->phy left unset.
 */
static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv)
{
	struct cpsw_common *cpsw = priv->cpsw;
	/* Increase RX FIFO size to 5 for supporting fullduplex
	 * flow control mode
	 */
	const unsigned int fifo_blks =
		(CPSW_MAX_BLKS_TX << CPSW_MAX_BLKS_TX_SHIFT) | CPSW_MAX_BLKS_RX;
	struct phy_device *phydev;

	cpsw_sl_reset(slave->mac_sl, 100);
	cpsw_sl_ctl_reset(slave->mac_sl);

	/* setup priority mapping */
	cpsw_sl_reg_write(slave->mac_sl, CPSW_SL_RX_PRI_MAP,
			  RX_PRIORITY_MAPPING);

	/* TX priority map and FIFO sizes live at version-specific offsets */
	switch (cpsw->version) {
	case CPSW_VERSION_1:
		slave_write(slave, TX_PRIORITY_MAPPING, CPSW1_TX_PRI_MAP);
		slave_write(slave, fifo_blks, CPSW1_MAX_BLKS);
		break;
	case CPSW_VERSION_2:
	case CPSW_VERSION_3:
	case CPSW_VERSION_4:
		slave_write(slave, TX_PRIORITY_MAPPING, CPSW2_TX_PRI_MAP);
		slave_write(slave, fifo_blks, CPSW2_MAX_BLKS);
		break;
	}

	/* setup max packet size, and mac address */
	cpsw_sl_reg_write(slave->mac_sl, CPSW_SL_RX_MAXLEN,
			  cpsw->rx_packet_max);
	cpsw_set_slave_mac(slave, priv);

	slave->mac_control = 0;	/* no link yet */

	if (cpsw_is_switch_en(cpsw))
		cpsw_port_add_switch_def_ale_entries(priv, slave);
	else
		cpsw_port_add_dual_emac_def_ale_entries(priv, slave);

	if (!slave->data->phy_node)
		dev_err(priv->dev, "no phy found on slave %d\n",
			slave->slave_num);

	phydev = of_phy_connect(priv->ndev, slave->data->phy_node,
				cpsw_adjust_link, 0, slave->data->phy_if);
	if (!phydev) {
		dev_err(priv->dev, "phy \"%pOF\" not found on slave %d\n",
			slave->data->phy_node,
			slave->slave_num);
		return;
	}

	slave->phy = phydev;
	phy_attached_info(phydev);
	phy_start(phydev);

	/* Configure GMII_SEL register */
	phy_set_mode_ext(slave->data->ifphy, PHY_MODE_ETHERNET,
			 slave->data->phy_if);
}
static int cpsw_ndo_stop(struct net_device *ndev)
{
struct cpsw_priv *priv = netdev_priv(ndev);
struct cpsw_common *cpsw = priv->cpsw;
struct cpsw_slave *slave;
cpsw_info(priv, ifdown, "shutting down ndev\n");
slave = &cpsw->slaves[priv->emac_port - 1];
if (slave->phy)
phy_stop(slave->phy);
netif_tx_stop_all_queues(priv->ndev);
if (slave->phy) {
phy_disconnect(slave->phy);
slave->phy = NULL;
}
__hw_addr_ref_unsync_dev(&ndev->mc, ndev, cpsw_purge_all_mc);
if (cpsw->usage_count <= 1) {
napi_disable(&cpsw->napi_rx);
napi_disable(&cpsw->napi_tx);
cpts_unregister(cpsw->cpts);
cpsw_intr_disable(cpsw);
cpdma_ctlr_stop(cpsw->dma);
cpsw_ale_stop(cpsw->ale);
cpsw_destroy_xdp_rxqs(cpsw);
}
if (cpsw_need_resplit(cpsw))
cpsw_split_res(cpsw);
cpsw->usage_count--;
pm_runtime_put_sync(cpsw->dev);
return 0;
}
/* .ndo_open: bring one interface up.
 *
 * Takes a runtime-PM reference, publishes the real TX/RX queue counts,
 * initialises the host port (first user only) and this slave port, sets
 * up the shared RX resources, CPTS, NAPI and IRQs (first user only),
 * restores VLAN/QoS state, re-applies interrupt pacing and finally starts
 * the DMA controller and interrupts.
 *
 * Returns 0 on success or a negative error code. On failure the usage
 * count and the runtime-PM reference are left balanced.
 */
static int cpsw_ndo_open(struct net_device *ndev)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	struct cpsw_common *cpsw = priv->cpsw;
	int ret;

	dev_info(priv->dev, "starting ndev. mode: %s\n",
		 cpsw_is_switch_en(cpsw) ? "switch" : "dual_mac");

	ret = pm_runtime_resume_and_get(cpsw->dev);
	if (ret < 0)
		return ret;

	/* Notify the stack of the actual queue counts. */
	ret = netif_set_real_num_tx_queues(ndev, cpsw->tx_ch_num);
	if (ret) {
		dev_err(priv->dev, "cannot set real number of tx queues\n");
		goto pm_cleanup;
	}

	ret = netif_set_real_num_rx_queues(ndev, cpsw->rx_ch_num);
	if (ret) {
		dev_err(priv->dev, "cannot set real number of rx queues\n");
		goto pm_cleanup;
	}

	/* Initialize host and slave ports */
	if (!cpsw->usage_count)
		cpsw_init_host_port(priv);
	cpsw_slave_open(&cpsw->slaves[priv->emac_port - 1], priv);

	/* Shared resources are set up by the first interface only */
	if (!cpsw->usage_count) {
		/* create rxqs for both infs in dual mac as they use same pool
		 * and must be destroyed together when no users.
		 */
		ret = cpsw_create_xdp_rxqs(cpsw);
		if (ret < 0)
			goto err_cleanup;

		ret = cpsw_fill_rx_channels(priv);
		if (ret < 0)
			goto err_cleanup;

		if (cpsw->cpts) {
			if (cpts_register(cpsw->cpts))
				dev_err(priv->dev, "error registering cpts device\n");
			else
				writel(0x10, &cpsw->wr_regs->misc_en);
		}

		napi_enable(&cpsw->napi_rx);
		napi_enable(&cpsw->napi_tx);

		if (cpsw->tx_irq_disabled) {
			cpsw->tx_irq_disabled = false;
			enable_irq(cpsw->irqs_table[1]);
		}

		if (cpsw->rx_irq_disabled) {
			cpsw->rx_irq_disabled = false;
			enable_irq(cpsw->irqs_table[0]);
		}
	}

	cpsw_restore(priv);

	/* Enable Interrupt pacing if configured */
	if (cpsw->coal_intvl != 0) {
		/* zero everything but the one field that is being set */
		struct ethtool_coalesce coal = {
			.rx_coalesce_usecs = cpsw->coal_intvl,
		};

		cpsw_set_coalesce(ndev, &coal, NULL, NULL);
	}

	cpdma_ctlr_start(cpsw->dma);
	cpsw_intr_enable(cpsw);
	cpsw->usage_count++;

	return 0;

err_cleanup:
	/* cpsw_ndo_stop() unconditionally drops one usage count and one
	 * runtime-PM reference. This label is only reached while
	 * usage_count is still 0, so take the usage count first (otherwise
	 * it would go negative and the next open would skip the first-user
	 * initialisation) and return without falling into pm_cleanup
	 * (otherwise the single PM reference taken above would be dropped
	 * twice).
	 *
	 * NOTE(review): cpsw_ndo_stop() also calls napi_disable() and
	 * cpts_unregister() although neither NAPI context was enabled nor
	 * CPTS registered on this path - confirm that is safe.
	 */
	cpsw->usage_count++;
	cpsw_ndo_stop(ndev);
	return ret;

pm_cleanup:
	pm_runtime_put_sync(cpsw->dev);
	return ret;
}
/* .ndo_start_xmit: queue one skb on the TX DMA channel that matches its
 * queue mapping.
 *
 * The skb is padded to the port's minimum TX length first. After a
 * successful submit the queue is stopped when the channel has no free
 * descriptor left, and woken again right away if one was freed in the
 * meantime. A failed submit counts a drop, applies the same stop/re-check
 * sequence and returns NETDEV_TX_BUSY.
 *
 * NOTE(review): the padding-failure path returns NET_XMIT_DROP from a
 * netdev_tx_t function, unlike the other return statements - confirm
 * this is intended.
 */
static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
				       struct net_device *ndev)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	struct cpsw_common *cpsw = priv->cpsw;
	struct cpts *cpts = cpsw->cpts;
	struct netdev_queue *queue;
	struct cpdma_chan *chan;
	int err, qi;

	if (skb_put_padto(skb, READ_ONCE(priv->tx_packet_min))) {
		cpsw_err(priv, tx_err, "packet pad failed\n");
		ndev->stats.tx_dropped++;
		return NET_XMIT_DROP;
	}

	/* Flag the skb when a hardware TX timestamp will be delivered */
	if ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
	    priv->tx_ts_enabled && cpts_can_timestamp(cpts, skb))
		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;

	/* Fold the queue mapping into the range of available channels */
	qi = skb_get_queue_mapping(skb);
	if (qi >= cpsw->tx_ch_num)
		qi = qi % cpsw->tx_ch_num;

	chan = cpsw->txv[qi].ch;
	queue = netdev_get_tx_queue(ndev, qi);
	skb_tx_timestamp(skb);

	err = cpdma_chan_submit(chan, skb, skb->data, skb->len,
				priv->emac_port);
	if (unlikely(err != 0)) {
		cpsw_err(priv, tx_err, "desc submit failed\n");
		ndev->stats.tx_dropped++;
		netif_tx_stop_queue(queue);

		/* Barrier, so that stop_queue visible to other cpus */
		smp_mb__after_atomic();

		if (cpdma_check_free_tx_desc(chan))
			netif_tx_wake_queue(queue);

		return NETDEV_TX_BUSY;
	}

	/* If there is no more tx desc left free then we need to
	 * tell the kernel to stop sending us tx frames.
	 */
	if (unlikely(!cpdma_check_free_tx_desc(chan))) {
		netif_tx_stop_queue(queue);

		/* Barrier, so that stop_queue visible to other cpus */
		smp_mb__after_atomic();

		if (cpdma_check_free_tx_desc(chan))
			netif_tx_wake_queue(queue);
	}

	return NETDEV_TX_OK;
}
/* .ndo_set_mac_address: change the port's MAC address.
 *
 * @p points to a struct sockaddr holding the new address. An invalid
 * Ethernet address is rejected with -EADDRNOTAVAIL. Otherwise the secure
 * unicast ALE entry in the port VLAN is replaced, the address is stored
 * in priv and in the netdev, and the slave MAC is reprogrammed. The
 * device is runtime-resumed while the hardware is touched.
 */
static int cpsw_ndo_set_mac_address(struct net_device *ndev, void *p)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	struct cpsw_common *cpsw = priv->cpsw;
	const int ale_flags = ALE_VLAN | ALE_SECURE;
	struct sockaddr *sa = p;
	int slave_no, err;
	u16 vid;

	slave_no = cpsw_slave_index(cpsw, priv);

	if (!is_valid_ether_addr(sa->sa_data))
		return -EADDRNOTAVAIL;

	err = pm_runtime_resume_and_get(cpsw->dev);
	if (err < 0)
		return err;

	vid = cpsw->slaves[slave_no].port_vlan;

	/* swap the old unicast entry for the new one */
	cpsw_ale_del_ucast(cpsw->ale, priv->mac_addr, HOST_PORT_NUM,
			   ale_flags, vid);
	cpsw_ale_add_ucast(cpsw->ale, sa->sa_data, HOST_PORT_NUM,
			   ale_flags, vid);

	ether_addr_copy(priv->mac_addr, sa->sa_data);
	eth_hw_addr_set(ndev, priv->mac_addr);
	cpsw_set_slave_mac(&cpsw->slaves[slave_no], priv);

	pm_runtime_put(cpsw->dev);

	return 0;
}
/* .ndo_vlan_rx_kill_vid: remove @vid from the ALE VLAN filter.
 *
 * Nothing is done (and 0 is returned) in switch mode or when @vid is the
 * default VLAN. A VLAN that is the port VLAN of an existing port is
 * refused with -EINVAL. Failures of the individual ALE deletions are only
 * logged; the function still returns 0 in that case.
 */
static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev,
				     __be16 proto, u16 vid)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	struct cpsw_common *cpsw = priv->cpsw;
	int port;
	int err;

	if (cpsw_is_switch_en(cpsw)) {
		dev_dbg(cpsw->dev, "ndo del vlan is called in switch mode\n");
		return 0;
	}

	if (vid == cpsw->data.default_vlan)
		return 0;

	err = pm_runtime_resume_and_get(cpsw->dev);
	if (err < 0)
		return err;

	/* Port VLANs of created ports must stay in the filter */
	for (port = 0; port < cpsw->data.slaves; port++) {
		if (!cpsw->slaves[port].ndev)
			continue;
		if (vid == cpsw->slaves[port].port_vlan) {
			pm_runtime_put(cpsw->dev);
			return -EINVAL;
		}
	}

	dev_dbg(priv->dev, "removing vlanid %d from vlan filter\n", vid);

	err = cpsw_ale_del_vlan(cpsw->ale, vid, 0);
	if (err)
		dev_err(priv->dev, "cpsw_ale_del_vlan() failed: ret %d\n", err);

	err = cpsw_ale_del_ucast(cpsw->ale, priv->mac_addr,
				 HOST_PORT_NUM, ALE_VLAN, vid);
	if (err)
		dev_err(priv->dev, "cpsw_ale_del_ucast() failed: ret %d\n",
			err);

	err = cpsw_ale_del_mcast(cpsw->ale, priv->ndev->broadcast,
				 0, ALE_VLAN, vid);
	if (err)
		dev_err(priv->dev, "cpsw_ale_del_mcast failed. ret %d\n",
			err);

	cpsw_ale_flush_multicast(cpsw->ale, ALE_PORT_HOST, vid);

	pm_runtime_put(cpsw->dev);

	return 0;
}
/* .ndo_get_phys_port_name: write "p<emac_port>" into @name.
 *
 * Returns -EINVAL when the formatted name does not fit in @len bytes,
 * 0 otherwise.
 */
static int cpsw_ndo_get_phys_port_name(struct net_device *ndev, char *name,
				       size_t len)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	int written = snprintf(name, len, "p%d", priv->emac_port);

	return written >= len ? -EINVAL : 0;
}
#ifdef CONFIG_NET_POLL_CONTROLLER
/* .ndo_poll_controller: with the CPSW interrupts disabled, call the RX
 * and TX interrupt handlers directly, then enable the interrupts again.
 */
static void cpsw_ndo_poll_controller(struct net_device *ndev)
{
	struct cpsw_common *common = ndev_to_cpsw(ndev);

	cpsw_intr_disable(common);
	/* irqs_table[0] is handed to the RX handler, [1] to the TX handler */
	cpsw_rx_interrupt(common->irqs_table[0], common);
	cpsw_tx_interrupt(common->irqs_table[1], common);
	cpsw_intr_enable(common);
}
#endif
/* .ndo_xdp_xmit: transmit up to @n XDP frames on this port.
 *
 * Stops at the first frame that is shorter than the port's minimum TX
 * length or that cpsw_xdp_tx_frame() refuses. Returns the number of
 * frames handed over before that point, or -EINVAL when @flags carries
 * bits outside XDP_XMIT_FLAGS_MASK.
 */
static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
			     struct xdp_frame **frames, u32 flags)
{
	struct cpsw_priv *priv = netdev_priv(ndev);
	int sent;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	/* The loop index doubles as the count of frames sent so far */
	for (sent = 0; sent < n; sent++) {
		struct xdp_frame *frame = frames[sent];

		if (frame->len < READ_ONCE(priv->tx_packet_min))
			break;

		if (cpsw_xdp_tx_frame(priv, frame, NULL, priv->emac_port))
			break;
	}

	return sent;
}
/* .ndo_get_port_parent_id: report the switch base MAC as the parent ID
 * of the port. Always returns 0.
 */
static int cpsw_get_port_parent_id(struct net_device *ndev,
				   struct netdev_phys_item_id *ppid)
{
	struct cpsw_common *common = ndev_to_cpsw(ndev);

	ppid->id_len = sizeof(common->base_mac);
	memcpy(&ppid->id, &common->base_mac, ppid->id_len);

	return 0;
}
static const struct net_device_ops cpsw_netdev_ops = {
.ndo_open = cpsw_ndo_open,
.ndo_stop = cpsw_ndo_stop,
.ndo_start_xmit = cpsw_ndo_start_xmit,
.ndo_set_mac_address = cpsw_ndo_set_mac_address,
.ndo_eth_ioctl = cpsw_ndo_ioctl,
.ndo_validate_addr = eth_validate_addr,
.ndo_tx_timeout = cpsw_ndo_tx_timeout,
.ndo_set_rx_mode = cpsw_ndo_set_rx_mode,
.ndo_set_tx_maxrate = cpsw_ndo_set_tx_maxrate,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = cpsw_ndo_poll_controller,
#endif
.ndo_vlan_rx_add_vid = cpsw_ndo_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = cpsw_ndo_vlan_rx_kill_vid,
.ndo_setup_tc = cpsw_ndo_setup_tc,
.ndo_get_phys_port_name = cpsw_ndo_get_phys_port_name,
.ndo_bpf = cpsw_ndo_bpf,
.ndo_xdp_xmit = cpsw_ndo_xdp_xmit,
.ndo_get_port_parent_id = cpsw_get_port_parent_id,
};
/* ethtool .get_drvinfo: fill in the fixed driver name and version and use
 * the platform device name as bus info.
 */
static void cpsw_get_drvinfo(struct net_device *ndev,
			     struct ethtool_drvinfo *info)
{
	struct cpsw_common *common = ndev_to_cpsw(ndev);
	struct platform_device *plat_dev = to_platform_device(common->dev);

	strscpy(info->driver, "cpsw-switch", sizeof(info->driver));
	strscpy(info->version, "2.0", sizeof(info->version));
	strscpy(info->bus_info, plat_dev->name, sizeof(info->bus_info));
}
/* ethtool .set_pauseparam: store the requested RX/TX pause settings and
 * pass them to the port's PHY.
 *
 * Returns -EINVAL when the port has no PHY or phy_validate_pause()
 * rejects the request, 0 otherwise.
 */
static int cpsw_set_pauseparam(struct net_device *ndev,
			       struct ethtool_pauseparam *pause)
{
	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
	struct cpsw_priv *priv = netdev_priv(ndev);
	int slave_no = cpsw_slave_index(cpsw, priv);

	if (!cpsw->slaves[slave_no].phy ||
	    !phy_validate_pause(cpsw->slaves[slave_no].phy, pause))
		return -EINVAL;

	priv->rx_pause = !!pause->rx_pause;
	priv->tx_pause = !!pause->tx_pause;

	phy_set_asym_pause(cpsw->slaves[slave_no].phy,
			   priv->rx_pause, priv->tx_pause);

	return 0;
}
/* ethtool .set_channels: delegate to the common helper, passing this
 * driver's RX handler.
 */
static int cpsw_set_channels(struct net_device *ndev,
			     struct ethtool_channels *chs)
{
	int ret;

	ret = cpsw_set_channels_common(ndev, chs, cpsw_rx_handler);

	return ret;
}
static const struct ethtool_ops cpsw_ethtool_ops = {
.supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS,
.get_drvinfo = cpsw_get_drvinfo,
.get_msglevel = cpsw_get_msglevel,
.set_msglevel = cpsw_set_msglevel,
.get_link = ethtool_op_get_link,
.get_ts_info = cpsw_get_ts_info,
.get_coalesce = cpsw_get_coalesce,
.set_coalesce = cpsw_set_coalesce,
.get_sset_count = cpsw_get_sset_count,
.get_strings = cpsw_get_strings,
.get_ethtool_stats = cpsw_get_ethtool_stats,
.get_pauseparam = cpsw_get_pauseparam,
.set_pauseparam = cpsw_set_pauseparam,
.get_wol = cpsw_get_wol,
.set_wol = cpsw_set_wol,
.get_regs_len = cpsw_get_regs_len,
.get_regs = cpsw_get_regs,
.begin = cpsw_ethtool_op_begin,
.complete = cpsw_ethtool_op_complete,
.get_channels = cpsw_get_channels,
.set_channels = cpsw_set_channels,
.get_link_ksettings = cpsw_get_link_ksettings,
.set_link_ksettings = cpsw_set_link_ksettings,
.get_eee = cpsw_get_eee,
.set_eee = cpsw_set_eee,
.nway_reset = cpsw_nway_reset,
.get_ringparam = cpsw_get_ringparam,
.set_ringparam = cpsw_set_ringparam,
};
/* Parse the "ethernet-ports" device-tree node into cpsw->data.
 *
 * The node must have exactly CPSW_SLAVE_PORTS_NUM children. For every
 * available child the "reg" port id (1..CPSW_SLAVE_PORTS_NUM), the
 * interface PHY, the PHY node (fixed-link or "phy-handle"), the PHY mode,
 * the MAC address and the reserved dual-EMAC VLAN are recorded in
 * data->slave_data[reg - 1]. Unavailable children are only marked
 * disabled.
 *
 * Returns 0 on success, -EINVAL/-ENOENT/-ENOMEM/-ENODEV or the error of
 * the failing helper otherwise. PHY node references taken for ports
 * parsed before a failure are not dropped here; cpsw_remove_dt() puts the
 * references stored in slave_data[].phy_node.
 */
static int cpsw_probe_dt(struct cpsw_common *cpsw)
{
	struct device_node *node = cpsw->dev->of_node, *tmp_node, *port_np;
	struct cpsw_platform_data *data = &cpsw->data;
	struct device *dev = cpsw->dev;
	int ret;
	u32 prop;

	if (!node)
		return -EINVAL;

	tmp_node = of_get_child_by_name(node, "ethernet-ports");
	if (!tmp_node)
		return -ENOENT;

	data->slaves = of_get_child_count(tmp_node);
	if (data->slaves != CPSW_SLAVE_PORTS_NUM) {
		of_node_put(tmp_node);
		return -ENOENT;
	}

	data->active_slave = 0;
	data->channels = CPSW_MAX_QUEUES;
	data->dual_emac = true;
	data->bd_ram_size = CPSW_BD_RAM_SIZE;
	data->mac_control = 0;

	data->slave_data = devm_kcalloc(dev, CPSW_SLAVE_PORTS_NUM,
					sizeof(struct cpsw_slave_data),
					GFP_KERNEL);
	if (!data->slave_data) {
		of_node_put(tmp_node);
		return -ENOMEM;
	}

	/* Populate all the child nodes here...
	 */
	ret = devm_of_platform_populate(dev);
	/* We do not want to force this, as in some cases may not have child */
	if (ret)
		dev_warn(dev, "Doesn't have any child node\n");

	for_each_child_of_node(tmp_node, port_np) {
		struct cpsw_slave_data *slave_data;
		u32 port_id;

		ret = of_property_read_u32(port_np, "reg", &port_id);
		if (ret < 0) {
			dev_err(dev, "%pOF error reading port_id %d\n",
				port_np, ret);
			goto err_node_put;
		}

		if (!port_id || port_id > CPSW_SLAVE_PORTS_NUM) {
			dev_err(dev, "%pOF has invalid port_id %u\n",
				port_np, port_id);
			ret = -EINVAL;
			goto err_node_put;
		}

		/* Port ids are 1-based, slave_data[] is 0-based */
		slave_data = &data->slave_data[port_id - 1];

		slave_data->disabled = !of_device_is_available(port_np);
		if (slave_data->disabled)
			continue;

		slave_data->slave_node = port_np;
		slave_data->ifphy = devm_of_phy_get(dev, port_np, NULL);
		if (IS_ERR(slave_data->ifphy)) {
			ret = PTR_ERR(slave_data->ifphy);
			dev_err(dev, "%pOF: Error retrieving port phy: %d\n",
				port_np, ret);
			goto err_node_put;
		}

		if (of_phy_is_fixed_link(port_np)) {
			ret = of_phy_register_fixed_link(port_np);
			if (ret) {
				dev_err_probe(dev, ret, "%pOF failed to register fixed-link phy\n",
					      port_np);
				goto err_node_put;
			}
			/* For a fixed link the port node itself is the PHY node */
			slave_data->phy_node = of_node_get(port_np);
		} else {
			slave_data->phy_node =
				of_parse_phandle(port_np, "phy-handle", 0);
		}

		if (!slave_data->phy_node) {
			dev_err(dev, "%pOF no phy found\n", port_np);
			ret = -ENODEV;
			goto err_node_put;
		}

		ret = of_get_phy_mode(port_np, &slave_data->phy_if);
		if (ret) {
			dev_err(dev, "%pOF read phy-mode err %d\n",
				port_np, ret);
			goto err_node_put;
		}

		/* Prefer the MAC address from DT, else ask ti_cm_get_macid() */
		ret = of_get_mac_address(port_np, slave_data->mac_addr);
		if (ret) {
			ret = ti_cm_get_macid(dev, port_id - 1,
					      slave_data->mac_addr);
			if (ret)
				goto err_node_put;
		}

		/* Without "ti,dual-emac-pvid" the port id is used as VLAN */
		if (of_property_read_u32(port_np, "ti,dual-emac-pvid",
					 &prop)) {
			dev_err(dev, "%pOF Missing dual_emac_res_vlan in DT.\n",
				port_np);
			slave_data->dual_emac_res_vlan = port_id;
			dev_err(dev, "%pOF Using %d as Reserved VLAN\n",
				port_np, slave_data->dual_emac_res_vlan);
		} else {
			slave_data->dual_emac_res_vlan = prop;
		}
	}

	of_node_put(tmp_node);
	return 0;

err_node_put:
	/* Leaving the child iterator early: drop the current child's ref */
	of_node_put(port_np);
	of_node_put(tmp_node);
	return ret;
}
/* Undo the PHY node bookkeeping recorded in cpsw->data.slave_data[]: for
 * every slave with a PHY node, deregister its fixed link (if it is one)
 * and drop the node reference.
 */
static void cpsw_remove_dt(struct cpsw_common *cpsw)
{
	struct cpsw_platform_data *data = &cpsw->data;
	int slave;

	for (slave = 0; slave < cpsw->data.slaves; slave++) {
		struct device_node *phy_np = data->slave_data[slave].phy_node;

		if (!phy_np)
			continue;

		if (of_phy_is_fixed_link(phy_np))
			of_phy_deregister_fixed_link(phy_np);

		of_node_put(phy_np);
	}
}
/* Allocate and initialise one net_device per enabled slave port.
 *
 * Each netdev gets its private data, MAC address (the one found in
 * slave_data, or a random one when that is not a valid Ethernet address),
 * feature flags, XDP features and the netdev/ethtool ops. The shared RX
 * and TX NAPI contexts are attached to the first netdev created.
 *
 * The netdevs are device-managed allocations; they are not registered
 * here. Returns 0 on success or -ENOMEM if an allocation fails.
 */
static int cpsw_create_ports(struct cpsw_common *cpsw)
{
	struct cpsw_platform_data *data = &cpsw->data;
	struct net_device *ndev, *napi_ndev = NULL;
	struct device *dev = cpsw->dev;
	struct cpsw_priv *priv;
	int i;

	for (i = 0; i < cpsw->data.slaves; i++) {
		struct cpsw_slave_data *slave_data = &data->slave_data[i];

		if (slave_data->disabled)
			continue;

		ndev = devm_alloc_etherdev_mqs(dev, sizeof(struct cpsw_priv),
					       CPSW_MAX_QUEUES,
					       CPSW_MAX_QUEUES);
		if (!ndev) {
			dev_err(dev, "error allocating net_device\n");
			return -ENOMEM;
		}

		priv = netdev_priv(ndev);
		priv->cpsw = cpsw;
		priv->ndev = ndev;
		priv->dev = dev;
		priv->msg_enable = netif_msg_init(debug_level, CPSW_DEBUG);
		priv->emac_port = i + 1;
		priv->tx_packet_min = CPSW_MIN_PACKET_SIZE;

		/* Log the address that will actually be used: it lives in
		 * slave_data->mac_addr, priv->mac_addr is only filled in
		 * below.
		 */
		if (is_valid_ether_addr(slave_data->mac_addr)) {
			dev_info(cpsw->dev, "Detected MACID = %pM\n",
				 slave_data->mac_addr);
		} else {
			eth_random_addr(slave_data->mac_addr);
			dev_info(cpsw->dev, "Random MACID = %pM\n",
				 slave_data->mac_addr);
		}

		eth_hw_addr_set(ndev, slave_data->mac_addr);
		ether_addr_copy(priv->mac_addr, slave_data->mac_addr);

		cpsw->slaves[i].ndev = ndev;

		ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER |
				  NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_NETNS_LOCAL | NETIF_F_HW_TC;

		ndev->xdp_features = NETDEV_XDP_ACT_BASIC |
				     NETDEV_XDP_ACT_REDIRECT |
				     NETDEV_XDP_ACT_NDO_XMIT;

		ndev->netdev_ops = &cpsw_netdev_ops;
		ndev->ethtool_ops = &cpsw_ethtool_ops;
		SET_NETDEV_DEV(ndev, dev);

		if (!napi_ndev) {
			/* CPSW Host port CPDMA interface is shared between
			 * ports and there is only one TX and one RX IRQs
			 * available for all possible TX and RX channels
			 * accordingly.
			 */
			netif_napi_add(ndev, &cpsw->napi_rx,
				       cpsw->quirk_irq ? cpsw_rx_poll : cpsw_rx_mq_poll);
			netif_napi_add_tx(ndev, &cpsw->napi_tx,
					  cpsw->quirk_irq ?
					  cpsw_tx_poll : cpsw_tx_mq_poll);
		}

		napi_ndev = ndev;
	}

	return 0;
}
/* Unregister the net_device of every slave port that has one. */
static void cpsw_unregister_ports(struct cpsw_common *cpsw)
{
	int port;

	for (port = 0; port < cpsw->data.slaves; port++) {
		if (cpsw->slaves[port].ndev)
			unregister_netdev(cpsw->slaves[port].ndev);
	}
}
/* Register the net_device of every slave port that has one.
 *
 * On the first registration failure the failing port's ndev pointer is
 * cleared, the ports registered so far are unregistered again and the
 * error is returned. Ports after the failing one were never registered
 * and are therefore left alone: unregister_netdev() must only be called
 * on devices whose registration succeeded.
 *
 * Returns 0 on success or the register_netdev() error.
 */
static int cpsw_register_ports(struct cpsw_common *cpsw)
{
	int ret = 0, i;

	for (i = 0; i < cpsw->data.slaves; i++) {
		if (!cpsw->slaves[i].ndev)
			continue;

		/* register the network device */
		ret = register_netdev(cpsw->slaves[i].ndev);
		if (ret) {
			dev_err(cpsw->dev,
				"cpsw: err registering net device%d\n", i);
			cpsw->slaves[i].ndev = NULL;
			break;
		}
	}

	if (!ret)
		return 0;

	/* Unwind only the ports below the failing index */
	while (--i >= 0) {
		if (cpsw->slaves[i].ndev)
			unregister_netdev(cpsw->slaves[i].ndev);
	}

	return ret;
}
/* Return true when @ndev uses this driver's netdev ops and the switch is
 * not configured for dual-EMAC operation, false otherwise.
 */
bool cpsw_port_dev_check(const struct net_device *ndev)
{
	struct cpsw_common *common;

	if (ndev->netdev_ops != &cpsw_netdev_ops)
		return false;

	common = ndev_to_cpsw(ndev);

	return !common->data.dual_emac;
}
/* Recompute offload_fwd_mark for all ports: it is set to 1 only when ALE
 * bypass is off and both ALE_PORT_1 and ALE_PORT_2 are bridge members,
 * and to 0 otherwise.
 *
 * Slaves without a net_device are skipped: cpsw_create_ports() leaves
 * slaves[i].ndev NULL for disabled ports, so there is no private data to
 * update for them.
 */
static void cpsw_port_offload_fwd_mark_update(struct cpsw_common *cpsw)
{
	int set_val = 0;
	int i;

	if (!cpsw->ale_bypass &&
	    (cpsw->br_members == (ALE_PORT_1 | ALE_PORT_2)))
		set_val = 1;

	dev_dbg(cpsw->dev, "set offload_fwd_mark %d\n", set_val);

	for (i = 0; i < cpsw->data.slaves; i++) {
		struct net_device *sl_ndev = cpsw->slaves[i].ndev;
		struct cpsw_priv *priv;

		if (!sl_ndev)
			continue;

		priv = netdev_priv(sl_ndev);
		priv->offload_fwd_mark = set_val;
	}
}
static int cpsw_netdevice_port_link(struct net_device *ndev,
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging between swp0, swp1 and swp2. The bandwidth of the CPU is oftentimes high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
struct net_device *br_ndev,
struct netlink_ext_ack *extack)
{
struct cpsw_priv *priv = netdev_priv(ndev);
struct cpsw_common *cpsw = priv->cpsw;
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging between swp0, swp1 and swp2. The bandwidth of the CPU is oftentimes high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
int err;
if (!cpsw->br_members) {
cpsw->hw_bridge_dev = br_ndev;
} else {
/* This is adding the port to a second bridge, this is
* unsupported
*/
if (cpsw->hw_bridge_dev != br_ndev)
return -EOPNOTSUPP;
}
net: bridge: move the switchdev object replay helpers to "push" mode Starting with commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"), DSA has introduced some bridge helpers that replay switchdev events (FDB/MDB/VLAN additions and deletions) that can be lost by the switchdev drivers in a variety of circumstances: - an IP multicast group was host-joined on the bridge itself before any switchdev port joined the bridge, leading to the host MDB entries missing in the hardware database. - during the bridge creation process, the MAC address of the bridge was added to the FDB as an entry pointing towards the bridge device itself, but with no switchdev ports being part of the bridge yet, this local FDB entry would remain unknown to the switchdev hardware database. - a VLAN/FDB/MDB was added to a bridge port that is a LAG interface, before any switchdev port joined that LAG, leading to the hardware database missing those entries. - a switchdev port left a LAG that is a bridge port, while the LAG remained part of the bridge, and all FDB/MDB/VLAN entries remained installed in the hardware database of the switchdev port. Also, since commit 0d2cfbd41c4a ("net: bridge: ignore switchdev events for LAG ports which didn't request replay"), DSA introduced a method, based on a const void *ctx, to ensure that two switchdev ports under the same LAG that is a bridge port do not see the same MDB/VLAN entry being replayed twice by the bridge, once for every bridge port that joins the LAG. With so many ordering corner cases being possible, it seems unreasonable to expect a switchdev driver writer to get it right from the first try. Therefore, now that DSA has experimented with the bridge replay helpers for a little bit, we can move the code to the bridge driver where it is more readily available to all switchdev drivers. 
To convert the switchdev object replay helpers from "pull mode" (where the driver asks for them) to a "push mode" (where the bridge offers them automatically), the biggest problem is that the bridge needs to be aware when a switchdev port joins and leaves, even when the switchdev is only indirectly a bridge port (for example when the bridge port is a LAG upper of the switchdev). Luckily, we already have a hook for that, in the form of the newly introduced switchdev_bridge_port_offload() and switchdev_bridge_port_unoffload() calls. These offer a natural place for hooking the object addition and deletion replays. Extend the above 2 functions with: - pointers to the switchdev atomic notifier (for FDB replays) and the blocking notifier (for MDB and VLAN replays). - the "const void *ctx" argument required for drivers to be able to disambiguate between which port is targeted, when multiple ports are lowers of the same LAG that is a bridge port. Most of the drivers pass NULL to this argument, except the ones that support LAG offload and have the proper context check already in place in the switchdev blocking notifier handler. Also unexport the replay helpers, since nobody except the bridge calls them directly now. Note that: (a) we abuse the terminology slightly, because FDB entries are not "switchdev objects", but we count them as objects nonetheless. With no direct way to prove it, I think they are not modeled as switchdev objects because those can only be installed by the bridge to the hardware (as opposed to FDB entries which can be propagated in the other direction too). This is merely an abuse of terms, FDB entries are replayed too, despite not being objects. (b) the bridge does not attempt to sync port attributes to newly joined ports, just the countable stuff (the objects). The reason for this is simple: no universal and symmetric way to sync and unsync them is known. For example, VLAN filtering: what to do on unsync, disable or leave it enabled? 
Similarly, STP state, ageing timer, etc etc. What a switchdev port does when it becomes standalone again is not really up to the bridge's competence, and the driver should deal with it. On the other hand, replaying deletions of switchdev objects can be seen as a matter of cleanup and therefore be treated by the bridge, hence this patch. We make the replay helpers opt-in for drivers, because they might not bring immediate benefits for them: - nbp_vlan_init() is called _after_ netdev_master_upper_dev_link(), so br_vlan_replay() should not do anything for the new drivers on which we call it. The existing drivers where there was even a slight possibility for there to exist a VLAN on a bridge port before they join it are already guarded against this: mlxsw and prestera deny joining LAG interfaces that are members of a bridge. - br_fdb_replay() should now notify of local FDB entries, but I patched all drivers except DSA to ignore these new entries in commit 2c4eca3ef716 ("net: bridge: switchdev: include local flag in FDB notifications"). Driver authors can lift this restriction as they wish, and when they do, they can also opt into the FDB replay functionality. - br_mdb_replay() should fix a real issue which is described in commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"). However most drivers do not offload the SWITCHDEV_OBJ_ID_HOST_MDB to see this issue: only cpsw and am65_cpsw offload this switchdev object, and I don't completely understand the way in which they offload this switchdev object anyway. So I'll leave it up to these drivers' respective maintainers to opt into br_mdb_replay(). 
So most of the drivers pass NULL notifier blocks for the replay helpers, except: - dpaa2-switch which was already acked/regression-tested with the helpers enabled (and there isn't much of a downside in having them) - ocelot which already had replay logic in "pull" mode - DSA which already had replay logic in "pull" mode An important observation is that the drivers which don't currently request bridge event replays don't even have the switchdev_bridge_port_{offload,unoffload} calls placed in proper places right now. This was done to avoid unnecessary rework for drivers which might never even add support for this. For driver writers who wish to add replay support, this can be used as a tentative placement guide: https://patchwork.kernel.org/project/netdevbpf/patch/20210720134655.892334-11-vladimir.oltean@nxp.com/ Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:03 +03:00
err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL,
net: bridge: switchdev: allow the TX data plane forwarding to be offloaded Allow switchdevs to forward frames from the CPU in accordance with the bridge configuration in the same way as is done between bridge ports. This means that the bridge will only send a single skb towards one of the ports under the switchdev's control, and expects the driver to deliver the packet to all eligible ports in its domain. Primarily this improves the performance of multicast flows with multiple subscribers, as it allows the hardware to perform the frame replication. The basic flow between the driver and the bridge is as follows: - When joining a bridge port, the switchdev driver calls switchdev_bridge_port_offload() with tx_fwd_offload = true. - The bridge sends offloadable skbs to one of the ports under the switchdev's control using skb->offload_fwd_mark = true. - The switchdev driver checks the skb->offload_fwd_mark field and lets its FDB lookup select the destination port mask for this packet. v1->v2: - convert br_input_skb_cb::fwd_hwdoms to a plain unsigned long - introduce a static key "br_switchdev_fwd_offload_used" to minimize the impact of the newly introduced feature on all the setups which don't have hardware that can make use of it - introduce a check for nbp->flags & BR_FWD_OFFLOAD to optimize cache line access - reorder nbp_switchdev_frame_mark_accel() and br_handle_vlan() in __br_forward() - do not strip VLAN on egress if forwarding offload on VLAN-aware bridge is being used - propagate errors from .ndo_dfwd_add_station() if not EOPNOTSUPP v2->v3: - replace the solution based on .ndo_dfwd_add_station with a solution based on switchdev_bridge_port_offload - rename BR_FWD_OFFLOAD to BR_TX_FWD_OFFLOAD v3->v4: rebase v4->v5: - make sure the static key is decremented on bridge port unoffload - more function and variable renaming and comments for them: br_switchdev_fwd_offload_used to br_switchdev_tx_fwd_offload br_switchdev_accels_skb to 
br_switchdev_frame_uses_tx_fwd_offload nbp_switchdev_frame_mark_tx_fwd to nbp_switchdev_frame_mark_tx_fwd_to_hwdom nbp_switchdev_frame_mark_accel to nbp_switchdev_frame_mark_tx_fwd_offload fwd_accel to tx_fwd_offload Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-22 18:55:38 +03:00
false, extack);
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
if (err)
return err;
cpsw->br_members |= BIT(priv->emac_port);
cpsw_port_offload_fwd_mark_update(cpsw);
return NOTIFY_DONE;
}
static void cpsw_netdevice_port_unlink(struct net_device *ndev)
{
struct cpsw_priv *priv = netdev_priv(ndev);
struct cpsw_common *cpsw = priv->cpsw;
net: bridge: move the switchdev object replay helpers to "push" mode Starting with commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"), DSA has introduced some bridge helpers that replay switchdev events (FDB/MDB/VLAN additions and deletions) that can be lost by the switchdev drivers in a variety of circumstances: - an IP multicast group was host-joined on the bridge itself before any switchdev port joined the bridge, leading to the host MDB entries missing in the hardware database. - during the bridge creation process, the MAC address of the bridge was added to the FDB as an entry pointing towards the bridge device itself, but with no switchdev ports being part of the bridge yet, this local FDB entry would remain unknown to the switchdev hardware database. - a VLAN/FDB/MDB was added to a bridge port that is a LAG interface, before any switchdev port joined that LAG, leading to the hardware database missing those entries. - a switchdev port left a LAG that is a bridge port, while the LAG remained part of the bridge, and all FDB/MDB/VLAN entries remained installed in the hardware database of the switchdev port. Also, since commit 0d2cfbd41c4a ("net: bridge: ignore switchdev events for LAG ports which didn't request replay"), DSA introduced a method, based on a const void *ctx, to ensure that two switchdev ports under the same LAG that is a bridge port do not see the same MDB/VLAN entry being replayed twice by the bridge, once for every bridge port that joins the LAG. With so many ordering corner cases being possible, it seems unreasonable to expect a switchdev driver writer to get it right from the first try. Therefore, now that DSA has experimented with the bridge replay helpers for a little bit, we can move the code to the bridge driver where it is more readily available to all switchdev drivers. 
To convert the switchdev object replay helpers from "pull mode" (where the driver asks for them) to a "push mode" (where the bridge offers them automatically), the biggest problem is that the bridge needs to be aware when a switchdev port joins and leaves, even when the switchdev is only indirectly a bridge port (for example when the bridge port is a LAG upper of the switchdev). Luckily, we already have a hook for that, in the form of the newly introduced switchdev_bridge_port_offload() and switchdev_bridge_port_unoffload() calls. These offer a natural place for hooking the object addition and deletion replays. Extend the above 2 functions with: - pointers to the switchdev atomic notifier (for FDB replays) and the blocking notifier (for MDB and VLAN replays). - the "const void *ctx" argument required for drivers to be able to disambiguate between which port is targeted, when multiple ports are lowers of the same LAG that is a bridge port. Most of the drivers pass NULL to this argument, except the ones that support LAG offload and have the proper context check already in place in the switchdev blocking notifier handler. Also unexport the replay helpers, since nobody except the bridge calls them directly now. Note that: (a) we abuse the terminology slightly, because FDB entries are not "switchdev objects", but we count them as objects nonetheless. With no direct way to prove it, I think they are not modeled as switchdev objects because those can only be installed by the bridge to the hardware (as opposed to FDB entries which can be propagated in the other direction too). This is merely an abuse of terms, FDB entries are replayed too, despite not being objects. (b) the bridge does not attempt to sync port attributes to newly joined ports, just the countable stuff (the objects). The reason for this is simple: no universal and symmetric way to sync and unsync them is known. For example, VLAN filtering: what to do on unsync, disable or leave it enabled? 
Similarly, STP state, ageing timer, etc etc. What a switchdev port does when it becomes standalone again is not really up to the bridge's competence, and the driver should deal with it. On the other hand, replaying deletions of switchdev objects can be seen a matter of cleanup and therefore be treated by the bridge, hence this patch. We make the replay helpers opt-in for drivers, because they might not bring immediate benefits for them: - nbp_vlan_init() is called _after_ netdev_master_upper_dev_link(), so br_vlan_replay() should not do anything for the new drivers on which we call it. The existing drivers where there was even a slight possibility for there to exist a VLAN on a bridge port before they join it are already guarded against this: mlxsw and prestera deny joining LAG interfaces that are members of a bridge. - br_fdb_replay() should now notify of local FDB entries, but I patched all drivers except DSA to ignore these new entries in commit 2c4eca3ef716 ("net: bridge: switchdev: include local flag in FDB notifications"). Driver authors can lift this restriction as they wish, and when they do, they can also opt into the FDB replay functionality. - br_mdb_replay() should fix a real issue which is described in commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"). However most drivers do not offload the SWITCHDEV_OBJ_ID_HOST_MDB to see this issue: only cpsw and am65_cpsw offload this switchdev object, and I don't completely understand the way in which they offload this switchdev object anyway. So I'll leave it up to these drivers' respective maintainers to opt into br_mdb_replay(). 
So most of the drivers pass NULL notifier blocks for the replay helpers, except: - dpaa2-switch which was already acked/regression-tested with the helpers enabled (and there isn't much of a downside in having them) - ocelot which already had replay logic in "pull" mode - DSA which already had replay logic in "pull" mode An important observation is that the drivers which don't currently request bridge event replays don't even have the switchdev_bridge_port_{offload,unoffload} calls placed in proper places right now. This was done to avoid unnecessary rework for drivers which might never even add support for this. For driver writers who wish to add replay support, this can be used as a tentative placement guide: https://patchwork.kernel.org/project/netdevbpf/patch/20210720134655.892334-11-vladimir.oltean@nxp.com/ Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:03 +03:00
switchdev_bridge_port_unoffload(ndev, NULL, NULL, NULL);
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
cpsw->br_members &= ~BIT(priv->emac_port);
cpsw_port_offload_fwd_mark_update(cpsw);
if (!cpsw->br_members)
cpsw->hw_bridge_dev = NULL;
}
/* netdev notifier */
static int cpsw_netdevice_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
struct netdev_notifier_changeupper_info *info;
int ret = NOTIFY_DONE;
if (!cpsw_port_dev_check(ndev))
return NOTIFY_DONE;
switch (event) {
case NETDEV_CHANGEUPPER:
info = ptr;
if (netif_is_bridge_master(info->upper_dev)) {
if (info->linking)
ret = cpsw_netdevice_port_link(ndev,
net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging beteen swp0, swp1 and swp2. The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. 
- If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! 
| | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. | | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. 
From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan <vkochan@marvell.com> Cc: Taras Chornyi <tchornyi@marvell.com> Cc: Ioana Ciornei <ioana.ciornei@nxp.com> Cc: Lars Povlsen <lars.povlsen@microchip.com> Cc: Steen Hegelund <Steen.Hegelund@microchip.com> Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
info->upper_dev,
extack);
else
cpsw_netdevice_port_unlink(ndev);
}
break;
default:
return NOTIFY_DONE;
}
return notifier_from_errno(ret);
}
/* Notifier block that routes netdevice events to cpsw_netdevice_event();
 * it is registered and unregistered by cpsw_register_notifiers() and
 * cpsw_unregister_notifiers().
 */
static struct notifier_block cpsw_netdevice_nb __read_mostly = {
.notifier_call = cpsw_netdevice_event,
};
/* Register the netdevice notifier first, then the switchdev notifiers.
 * When the switchdev registration fails, the netdevice notifier is rolled
 * back before the error is propagated.
 *
 * Returns 0 on success or the error reported by the failing registration.
 */
static int cpsw_register_notifiers(struct cpsw_common *cpsw)
{
	int err;

	err = register_netdevice_notifier(&cpsw_netdevice_nb);
	if (err) {
		dev_err(cpsw->dev, "can't register netdevice notifier\n");
		return err;
	}

	err = cpsw_switchdev_register_notifiers(cpsw);
	if (!err)
		return 0;

	/* undo the first registration so nothing is left half set up */
	unregister_netdevice_notifier(&cpsw_netdevice_nb);
	return err;
}
/* Tear down the notifiers in the reverse order of
 * cpsw_register_notifiers(): switchdev notifiers first, then the
 * netdevice notifier.
 */
static void cpsw_unregister_notifiers(struct cpsw_common *cpsw)
{
	cpsw_switchdev_unregister_notifiers(cpsw);
	unregister_netdevice_notifier(&cpsw_netdevice_nb);
}
/* devlink operations table handed to devlink_alloc(); no callbacks are
 * set here.
 */
static const struct devlink_ops cpsw_devlink_ops = {
};
/* devlink "switch_mode" getter.
 *
 * Reports true when the driver is not in dual-EMAC mode.  Any parameter id
 * other than CPSW_DL_PARAM_SWITCH_MODE yields -EOPNOTSUPP.
 */
static int cpsw_dl_switch_mode_get(struct devlink *dl, u32 id,
				   struct devlink_param_gset_ctx *ctx)
{
	struct cpsw_devlink *dl_priv = devlink_priv(dl);
	struct cpsw_common *cpsw = dl_priv->cpsw;

	dev_dbg(cpsw->dev, "%s id:%u\n", __func__, id);

	if (id == CPSW_DL_PARAM_SWITCH_MODE) {
		ctx->val.vbool = !cpsw->data.dual_emac;
		return 0;
	}

	return -EOPNOTSUPP;
}
/* devlink "switch_mode" setter.
 *
 * Switches the driver between switch mode (ctx->val.vbool == true) and
 * dual-EMAC mode.  Nothing is done when the requested mode is already
 * active, and leaving switch mode is refused with -EINVAL while any port is
 * still a bridge member.
 *
 * When no port netdev is running only the software state (dual_emac flag and
 * per-slave port_vlan) is updated.  Otherwise the ALE is put in bypass,
 * cleared, the host port is re-initialised for the new mode, the default ALE
 * entries are re-added and bypass is switched off again.
 *
 * Returns 0 on success, -EOPNOTSUPP for an unknown parameter id, or -EINVAL
 * as described above.
 */
static int cpsw_dl_switch_mode_set(struct devlink *dl, u32 id,
				   struct devlink_param_gset_ctx *ctx)
{
	struct cpsw_devlink *dl_priv = devlink_priv(dl);
	struct cpsw_common *cpsw = dl_priv->cpsw;
	int vlan = cpsw->data.default_vlan;
	bool switch_en = ctx->val.vbool;
	bool if_running = false;
	int i;

	dev_dbg(cpsw->dev, "%s id:%u\n", __func__, id);

	if (id != CPSW_DL_PARAM_SWITCH_MODE)
		return -EOPNOTSUPP;

	/* requested mode is already the current one */
	if (switch_en == !cpsw->data.dual_emac)
		return 0;

	if (cpsw->br_members && !switch_en) {
		dev_err(cpsw->dev, "Remove ports from BR before disabling switch mode\n");
		return -EINVAL;
	}

	rtnl_lock();

	/* one running netdev is enough to require the hardware path */
	for (i = 0; i < cpsw->data.slaves; i++) {
		struct net_device *sl_ndev = cpsw->slaves[i].ndev;

		if (sl_ndev && netif_running(sl_ndev)) {
			if_running = true;
			break;
		}
	}

	if (!if_running) {
		/* all ndevs are down */
		cpsw->data.dual_emac = !switch_en;

		for (i = 0; i < cpsw->data.slaves; i++) {
			struct cpsw_slave *slave = &cpsw->slaves[i];

			if (!slave->ndev)
				continue;

			vlan = switch_en ? cpsw->data.default_vlan :
					   slave->data->dual_emac_res_vlan;
			slave->port_vlan = vlan;
		}

		goto exit;
	}

	if (switch_en) {
		dev_info(cpsw->dev, "Enable switch mode\n");

		/* enable bypass - no forwarding; all traffic goes to Host */
		cpsw_ale_control_set(cpsw->ale, 0, ALE_BYPASS, 1);

		/* clean up ALE table */
		cpsw_ale_control_set(cpsw->ale, 0, ALE_CLEAR, 1);
		cpsw_ale_control_get(cpsw->ale, 0, ALE_AGEOUT);

		cpsw_init_host_port_switch(cpsw);

		for (i = 0; i < cpsw->data.slaves; i++) {
			struct cpsw_slave *slave = &cpsw->slaves[i];
			struct net_device *sl_ndev = slave->ndev;
			struct cpsw_priv *priv;

			if (!sl_ndev)
				continue;

			priv = netdev_priv(sl_ndev);
			slave->port_vlan = vlan;
			WRITE_ONCE(priv->tx_packet_min,
				   CPSW_MIN_PACKET_SIZE_VLAN);

			/* only running ports get their ALE entries here */
			if (netif_running(sl_ndev))
				cpsw_port_add_switch_def_ale_entries(priv,
								     slave);
		}

		cpsw_ale_control_set(cpsw->ale, 0, ALE_BYPASS, 0);
		cpsw->data.dual_emac = false;
	} else {
		dev_info(cpsw->dev, "Disable switch mode\n");

		/* enable bypass - no forwarding; all traffic goes to Host */
		cpsw_ale_control_set(cpsw->ale, 0, ALE_BYPASS, 1);

		/* clean up ALE table */
		cpsw_ale_control_set(cpsw->ale, 0, ALE_CLEAR, 1);
		cpsw_ale_control_get(cpsw->ale, 0, ALE_AGEOUT);

		cpsw_init_host_port_dual_mac(cpsw);

		for (i = 0; i < cpsw->data.slaves; i++) {
			struct cpsw_slave *slave = &cpsw->slaves[i];
			struct net_device *sl_ndev = slave->ndev;
			struct cpsw_priv *priv;

			if (!sl_ndev)
				continue;

			priv = netdev_priv(sl_ndev);
			slave->port_vlan = slave->data->dual_emac_res_vlan;
			WRITE_ONCE(priv->tx_packet_min, CPSW_MIN_PACKET_SIZE);
			cpsw_port_add_dual_emac_def_ale_entries(priv, slave);
		}

		cpsw_ale_control_set(cpsw->ale, 0, ALE_BYPASS, 0);
		cpsw->data.dual_emac = true;
	}

exit:
	rtnl_unlock();

	return 0;
}
/* devlink getter for the ALE control parameters.
 *
 * Only CPSW_DL_PARAM_ALE_BYPASS is handled: the value read back through
 * cpsw_ale_control_get() for ALE_BYPASS is stored in ctx->val.vbool.  Any
 * other id yields -EOPNOTSUPP.
 */
static int cpsw_dl_ale_ctrl_get(struct devlink *dl, u32 id,
				struct devlink_param_gset_ctx *ctx)
{
	struct cpsw_devlink *dl_priv = devlink_priv(dl);
	struct cpsw_common *cpsw = dl_priv->cpsw;

	dev_dbg(cpsw->dev, "%s id:%u\n", __func__, id);

	if (id != CPSW_DL_PARAM_ALE_BYPASS)
		return -EOPNOTSUPP;

	ctx->val.vbool = cpsw_ale_control_get(cpsw->ale, 0, ALE_BYPASS);

	return 0;
}
/* devlink setter for the ALE control parameters.
 *
 * Only CPSW_DL_PARAM_ALE_BYPASS is handled: the requested value is written
 * with cpsw_ale_control_set().  When that write succeeds the value is cached
 * in cpsw->ale_bypass and the ports' offload forwarding mark is refreshed.
 *
 * Returns 0 on success, -EOPNOTSUPP for any other parameter id, or the error
 * reported by cpsw_ale_control_set() so that a failed write is not presented
 * to devlink as a successful one.
 */
static int cpsw_dl_ale_ctrl_set(struct devlink *dl, u32 id,
				struct devlink_param_gset_ctx *ctx)
{
	struct cpsw_devlink *dl_priv = devlink_priv(dl);
	struct cpsw_common *cpsw = dl_priv->cpsw;
	int ret = -EOPNOTSUPP;

	dev_dbg(cpsw->dev, "%s id:%u\n", __func__, id);

	switch (id) {
	case CPSW_DL_PARAM_ALE_BYPASS:
		ret = cpsw_ale_control_set(cpsw->ale, 0, ALE_BYPASS,
					   ctx->val.vbool);
		if (!ret) {
			cpsw->ale_bypass = ctx->val.vbool;
			cpsw_port_offload_fwd_mark_update(cpsw);
		}
		break;
	default:
		return -EOPNOTSUPP;
	}

	/* propagate a failed ALE write instead of masking it with 0 */
	return ret;
}
/* Driver-specific devlink parameters, both boolean and both limited to the
 * runtime configuration mode:
 *  - "switch_mode": served by cpsw_dl_switch_mode_get()/_set()
 *  - "ale_bypass":  served by cpsw_dl_ale_ctrl_get()/_set()
 * No validate callback is supplied for either of them.
 */
static const struct devlink_param cpsw_devlink_params[] = {
DEVLINK_PARAM_DRIVER(CPSW_DL_PARAM_SWITCH_MODE,
"switch_mode", DEVLINK_PARAM_TYPE_BOOL,
BIT(DEVLINK_PARAM_CMODE_RUNTIME),
cpsw_dl_switch_mode_get, cpsw_dl_switch_mode_set,
NULL),
DEVLINK_PARAM_DRIVER(CPSW_DL_PARAM_ALE_BYPASS,
"ale_bypass", DEVLINK_PARAM_TYPE_BOOL,
BIT(DEVLINK_PARAM_CMODE_RUNTIME),
cpsw_dl_ale_ctrl_get, cpsw_dl_ale_ctrl_set, NULL),
};
/* Allocate the devlink instance, attach the driver context to its private
 * area, register the driver parameters and finally register the instance.
 *
 * Returns 0 on success, -ENOMEM when the allocation fails, or the error from
 * devlink_params_register() (the instance is freed again in that case).
 */
static int cpsw_register_devlink(struct cpsw_common *cpsw)
{
	struct device *dev = cpsw->dev;
	struct cpsw_devlink *dl_priv;
	int err;

	cpsw->devlink = devlink_alloc(&cpsw_devlink_ops, sizeof(*dl_priv), dev);
	if (!cpsw->devlink)
		return -ENOMEM;

	/* let the param callbacks find their way back to cpsw */
	dl_priv = devlink_priv(cpsw->devlink);
	dl_priv->cpsw = cpsw;

	err = devlink_params_register(cpsw->devlink, cpsw_devlink_params,
				      ARRAY_SIZE(cpsw_devlink_params));
	if (err) {
		dev_err(dev, "DL params reg fail ret:%d\n", err);
		devlink_free(cpsw->devlink);
		return err;
	}

	devlink_register(cpsw->devlink);

	return 0;
}
/* Undo cpsw_register_devlink(): unregister the devlink instance, drop the
 * driver parameters and release the instance.
 */
static void cpsw_unregister_devlink(struct cpsw_common *cpsw)
{
	struct devlink *dl = cpsw->devlink;

	devlink_unregister(dl);
	devlink_params_unregister(dl, cpsw_devlink_params,
				  ARRAY_SIZE(cpsw_devlink_params));
	devlink_free(dl);
}
/* Device-tree "compatible" strings this driver binds to; used as the
 * platform driver's of_match_table.
 */
static const struct of_device_id cpsw_of_mtable[] = {
{ .compatible = "ti,cpsw-switch"},
{ .compatible = "ti,am335x-cpsw-switch"},
{ .compatible = "ti,am4372-cpsw-switch"},
{ .compatible = "ti,dra7-cpsw-switch"},
{ /* sentinel */ },
};
MODULE_DEVICE_TABLE(of, cpsw_of_mtable);
/* SoC revisions for which cpsw_probe() sets cpsw->quirk_irq (looked up with
 * soc_device_match()).
 */
static const struct soc_device_attribute cpsw_soc_devices[] = {
{ .family = "AM33xx", .revision = "ES1.0"},
{ /* sentinel */ }
};
/* Platform driver probe.
 *
 * Allocates the driver context, acquires the "mode" GPIOs, the "fck" clock,
 * the register window and the "rx"/"tx"/"misc" interrupts, powers the device
 * up through runtime PM, parses the device tree, initialises the common CPSW
 * state and the first TX/RX DMA channels, creates the port netdevs, requests
 * the interrupts and finally registers notifiers, devlink and the ports.
 *
 * Everything set up after runtime PM is enabled is unwound through the
 * labels at the end of the function in reverse order of acquisition; this
 * includes the devlink instance when the subsequent port registration fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int cpsw_probe(struct platform_device *pdev)
{
	const struct soc_device_attribute *soc;
	struct device *dev = &pdev->dev;
	struct cpsw_common *cpsw;
	struct resource *ss_res;
	struct gpio_descs *mode;
	void __iomem *ss_regs;
	int ret = 0, ch;
	struct clk *clk;
	int irq;

	cpsw = devm_kzalloc(dev, sizeof(*cpsw), GFP_KERNEL);
	if (!cpsw)
		return -ENOMEM;

	cpsw_slave_index = cpsw_slave_index_priv;

	cpsw->dev = dev;

	cpsw->slaves = devm_kcalloc(dev,
				    CPSW_SLAVE_PORTS_NUM,
				    sizeof(struct cpsw_slave),
				    GFP_KERNEL);
	if (!cpsw->slaves)
		return -ENOMEM;

	mode = devm_gpiod_get_array_optional(dev, "mode", GPIOD_OUT_LOW);
	if (IS_ERR(mode)) {
		ret = PTR_ERR(mode);
		dev_err(dev, "gpio request failed, ret %d\n", ret);
		return ret;
	}

	clk = devm_clk_get(dev, "fck");
	if (IS_ERR(clk)) {
		ret = PTR_ERR(clk);
		dev_err(dev, "fck is not found %d\n", ret);
		return ret;
	}
	cpsw->bus_freq_mhz = clk_get_rate(clk) / 1000000;

	ss_regs = devm_platform_get_and_ioremap_resource(pdev, 0, &ss_res);
	if (IS_ERR(ss_regs))
		return PTR_ERR(ss_regs);
	cpsw->regs = ss_regs;

	irq = platform_get_irq_byname(pdev, "rx");
	if (irq < 0)
		return irq;
	cpsw->irqs_table[0] = irq;

	irq = platform_get_irq_byname(pdev, "tx");
	if (irq < 0)
		return irq;
	cpsw->irqs_table[1] = irq;

	/* Same check as for "rx"/"tx": only a negative value is an error, so
	 * probe can never return 0 here with the device half initialised.
	 */
	irq = platform_get_irq_byname(pdev, "misc");
	if (irq < 0)
		return irq;
	cpsw->misc_irq = irq;

	platform_set_drvdata(pdev, cpsw);
	/* This may be required here for child devices. */
	pm_runtime_enable(dev);

	/* Need to enable clocks with runtime PM api to access module
	 * registers
	 */
	ret = pm_runtime_resume_and_get(dev);
	if (ret < 0) {
		pm_runtime_disable(dev);
		return ret;
	}

	ret = cpsw_probe_dt(cpsw);
	if (ret)
		goto clean_dt_ret;

	soc = soc_device_match(cpsw_soc_devices);
	if (soc)
		cpsw->quirk_irq = true;

	cpsw->rx_packet_max = rx_packet_max;
	cpsw->descs_pool_size = descs_pool_size;
	eth_random_addr(cpsw->base_mac);

	ret = cpsw_init_common(cpsw, ss_regs, ale_ageout,
			       (u32 __force)ss_res->start + CPSW2_BD_OFFSET,
			       descs_pool_size);
	if (ret)
		goto clean_dt_ret;

	cpsw->wr_regs = cpsw->version == CPSW_VERSION_1 ?
			ss_regs + CPSW1_WR_OFFSET :
			ss_regs + CPSW2_WR_OFFSET;

	/* the TX channel number depends on the IRQ quirk */
	ch = cpsw->quirk_irq ? 0 : 7;
	cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, ch, cpsw_tx_handler, 0);
	if (IS_ERR(cpsw->txv[0].ch)) {
		dev_err(dev, "error initializing tx dma channel\n");
		ret = PTR_ERR(cpsw->txv[0].ch);
		goto clean_cpts;
	}

	cpsw->rxv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
	if (IS_ERR(cpsw->rxv[0].ch)) {
		dev_err(dev, "error initializing rx dma channel\n");
		ret = PTR_ERR(cpsw->rxv[0].ch);
		goto clean_cpts;
	}
	cpsw_split_res(cpsw);

	/* setup netdevs */
	ret = cpsw_create_ports(cpsw);
	if (ret)
		goto clean_unregister_netdev;

	/* Grab RX and TX IRQs. Note that we also have RX_THRESHOLD and
	 * MISC IRQs which are always kept disabled with this driver so
	 * we will not request them.
	 *
	 * If anyone wants to implement support for those, make sure to
	 * first request and append them to irqs_table array.
	 */
	ret = devm_request_irq(dev, cpsw->irqs_table[0], cpsw_rx_interrupt,
			       0, dev_name(dev), cpsw);
	if (ret < 0) {
		dev_err(dev, "error attaching irq (%d)\n", ret);
		goto clean_unregister_netdev;
	}

	ret = devm_request_irq(dev, cpsw->irqs_table[1], cpsw_tx_interrupt,
			       0, dev_name(dev), cpsw);
	if (ret < 0) {
		dev_err(dev, "error attaching irq (%d)\n", ret);
		goto clean_unregister_netdev;
	}

	/* the misc IRQ is only requested when a CPTS instance exists */
	if (!cpsw->cpts)
		goto skip_cpts;

	ret = devm_request_irq(dev, cpsw->misc_irq, cpsw_misc_interrupt,
			       0, dev_name(&pdev->dev), cpsw);
	if (ret < 0) {
		dev_err(dev, "error attaching misc irq (%d)\n", ret);
		goto clean_unregister_netdev;
	}

	/* Enable misc CPTS evnt_pend IRQ */
	cpts_set_irqpoll(cpsw->cpts, false);

skip_cpts:
	ret = cpsw_register_notifiers(cpsw);
	if (ret)
		goto clean_unregister_netdev;

	ret = cpsw_register_devlink(cpsw);
	if (ret)
		goto clean_unregister_notifiers;

	/* devlink is live from here on, so a failure must tear it down too */
	ret = cpsw_register_ports(cpsw);
	if (ret)
		goto clean_unregister_devlink;

	dev_notice(dev, "initialized (regs %pa, pool size %d) hw_ver:%08X %d.%d (%d)\n",
		   &ss_res->start, descs_pool_size,
		   cpsw->version, CPSW_MAJOR_VERSION(cpsw->version),
		   CPSW_MINOR_VERSION(cpsw->version),
		   CPSW_RTL_VERSION(cpsw->version));

	pm_runtime_put(dev);

	return 0;

clean_unregister_devlink:
	cpsw_unregister_devlink(cpsw);
clean_unregister_notifiers:
	cpsw_unregister_notifiers(cpsw);
clean_unregister_netdev:
	cpsw_unregister_ports(cpsw);
clean_cpts:
	cpts_release(cpsw->cpts);
	cpdma_ctlr_destroy(cpsw->dma);
clean_dt_ret:
	cpsw_remove_dt(cpsw);
	pm_runtime_put_sync(dev);
	pm_runtime_disable(dev);
	return ret;
}
/* Platform driver remove.
 *
 * Resumes the device through runtime PM, then unregisters the notifiers,
 * devlink and ports, releases CPTS, the DMA controller and the device-tree
 * state, and finally drops and disables runtime PM.
 */
static void cpsw_remove(struct platform_device *pdev)
{
	struct cpsw_common *cpsw = platform_get_drvdata(pdev);
	struct device *dev = &pdev->dev;
	int err;

	err = pm_runtime_resume_and_get(dev);
	if (err < 0) {
		/* Note, if this error path is taken, we're leaking some
		 * resources.
		 */
		dev_err(dev, "Failed to resume device (%pe)\n",
			ERR_PTR(err));
		return;
	}

	cpsw_unregister_notifiers(cpsw);
	cpsw_unregister_devlink(cpsw);
	cpsw_unregister_ports(cpsw);

	cpts_release(cpsw->cpts);
	cpdma_ctlr_destroy(cpsw->dma);
	cpsw_remove_dt(cpsw);

	pm_runtime_put_sync(dev);
	pm_runtime_disable(dev);
}
/* System suspend callback: under the RTNL lock, stop every port netdev that
 * is currently running, then switch the pins to their sleep state.
 * Always returns 0.
 */
static int __maybe_unused cpsw_suspend(struct device *dev)
{
	struct cpsw_common *cpsw = dev_get_drvdata(dev);
	int port;

	rtnl_lock();

	for (port = 0; port < cpsw->data.slaves; port++) {
		struct net_device *ndev = cpsw->slaves[port].ndev;

		if (ndev && netif_running(ndev))
			cpsw_ndo_stop(ndev);
	}

	rtnl_unlock();

	/* Select sleep pin state */
	pinctrl_pm_select_sleep_state(dev);

	return 0;
}
/* System resume callback: restore the default pin state, then, under the
 * RTNL lock, re-open every port netdev that is marked as running.
 * Always returns 0.
 */
static int __maybe_unused cpsw_resume(struct device *dev)
{
	struct cpsw_common *cpsw = dev_get_drvdata(dev);
	int port;

	/* Select default pin state */
	pinctrl_pm_select_default_state(dev);

	/* shut up ASSERT_RTNL() warning in netif_set_real_num_tx/rx_queues */
	rtnl_lock();

	for (port = 0; port < cpsw->data.slaves; port++) {
		struct net_device *ndev = cpsw->slaves[port].ndev;

		if (ndev && netif_running(ndev))
			cpsw_ndo_open(ndev);
	}

	rtnl_unlock();

	return 0;
}
static SIMPLE_DEV_PM_OPS(cpsw_pm_ops, cpsw_suspend, cpsw_resume);
/* Platform driver description: binds by the cpsw_of_mtable compatibles,
 * uses cpsw_pm_ops for suspend/resume and cpsw_probe()/cpsw_remove() as the
 * bind/unbind entry points.
 */
static struct platform_driver cpsw_driver = {
.driver = {
.name = "cpsw-switch",
.pm = &cpsw_pm_ops,
.of_match_table = cpsw_of_mtable,
},
.probe = cpsw_probe,
.remove_new = cpsw_remove,
};
module_platform_driver(cpsw_driver);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TI CPSW switchdev Ethernet driver");