From 035dd64de9485752dd07fc28c3318b69d6722771 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 9 Feb 2022 11:23:29 +0200 Subject: [PATCH 1/7] dpaa2-eth: rearrange variable declaration in __dpaa2_eth_tx In the next patches we'll be moving things arroung in the mentioned function and also add some new variable declarations. Before all this, cleanup the variable declaration order. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index e985ae008a97..218b1516b86b 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1102,16 +1102,16 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, struct net_device *net_dev) { struct dpaa2_eth_priv *priv = netdev_priv(net_dev); - struct dpaa2_fd fd; - struct rtnl_link_stats64 *percpu_stats; struct dpaa2_eth_drv_stats *percpu_extras; + struct rtnl_link_stats64 *percpu_stats; + unsigned int needed_headroom; struct dpaa2_eth_fq *fq; struct netdev_queue *nq; + struct dpaa2_fd fd; u16 queue_mapping; - unsigned int needed_headroom; - u32 fd_len; u8 prio = 0; int err, i; + u32 fd_len; void *swa; percpu_stats = this_cpu_ptr(priv->percpu_stats); From 8378a7910d1401885114647a63ea099667754707 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 9 Feb 2022 11:23:30 +0200 Subject: [PATCH 2/7] dpaa2-eth: allocate a fragment already aligned Instead of allocating memory and then manually aligning it to the desired value use napi_alloc_frag_align() directly to streamline the process. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 218b1516b86b..6ccbec21300f 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -888,14 +888,13 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv, sgt_buf_size = priv->tx_data_offset + sizeof(struct dpaa2_sg_entry); if (sgt_cache->count == 0) - sgt_buf = kzalloc(sgt_buf_size + DPAA2_ETH_TX_BUF_ALIGN, - GFP_ATOMIC); + sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN); else sgt_buf = sgt_cache->buf[--sgt_cache->count]; if (unlikely(!sgt_buf)) return -ENOMEM; + memset(sgt_buf, 0, sgt_buf_size); - sgt_buf = PTR_ALIGN(sgt_buf, DPAA2_ETH_TX_BUF_ALIGN); sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset); addr = dma_map_single(dev, skb->data, skb->len, DMA_BIDIRECTIONAL); @@ -935,7 +934,7 @@ sgt_map_failed: dma_unmap_single(dev, addr, skb->len, DMA_BIDIRECTIONAL); data_map_failed: if (sgt_cache->count >= DPAA2_ETH_SGT_CACHE_SIZE) - kfree(sgt_buf); + skb_free_frag(sgt_buf); else sgt_cache->buf[sgt_cache->count++] = sgt_buf; @@ -1088,7 +1087,7 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv, skb_free_frag(buffer_start); } else { if (sgt_cache->count >= DPAA2_ETH_SGT_CACHE_SIZE) - kfree(buffer_start); + skb_free_frag(buffer_start); else sgt_cache->buf[sgt_cache->count++] = buffer_start; } @@ -1523,7 +1522,7 @@ static void dpaa2_eth_sgt_cache_drain(struct dpaa2_eth_priv *priv) count = sgt_cache->count; for (i = 0; i < count; i++) - kfree(sgt_cache->buf[i]); + skb_free_frag(sgt_cache->buf[i]); sgt_cache->count = 0; } } From ae3b08177529b8ad68f4e755a3970d1faac8df21 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 9 Feb 2022 11:23:31 +0200 Subject: [PATCH 3/7] dpaa2-eth: extract the S/G table buffer cache interaction into functions The dpaa2-eth driver uses in certain circumstances a buffer cache for the S/G tables needed in case of a S/G FD. At the moment, the interraction with the cache is open-coded and couldn't be reused easily. Add two new functions - dpaa2_eth_sgt_get and dpaa2_eth_sgt_recycle - which help with code reusability. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- .../net/ethernet/freescale/dpaa2/dpaa2-eth.c | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 6ccbec21300f..006ab355a21d 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -760,6 +760,38 @@ static void dpaa2_eth_enable_tx_tstamp(struct dpaa2_eth_priv *priv, } } +static void *dpaa2_eth_sgt_get(struct dpaa2_eth_priv *priv) +{ + struct dpaa2_eth_sgt_cache *sgt_cache; + void *sgt_buf = NULL; + int sgt_buf_size; + + sgt_cache = this_cpu_ptr(priv->sgt_cache); + sgt_buf_size = priv->tx_data_offset + sizeof(struct dpaa2_sg_entry); + + if (sgt_cache->count == 0) + sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN); + else + sgt_buf = sgt_cache->buf[--sgt_cache->count]; + if (!sgt_buf) + return NULL; + + memset(sgt_buf, 0, sgt_buf_size); + + return sgt_buf; +} + +static void dpaa2_eth_sgt_recycle(struct dpaa2_eth_priv *priv, void *sgt_buf) +{ + struct dpaa2_eth_sgt_cache *sgt_cache; + + sgt_cache = this_cpu_ptr(priv->sgt_cache); + if (sgt_cache->count >= DPAA2_ETH_SGT_CACHE_SIZE) + skb_free_frag(sgt_buf); + else + sgt_cache->buf[sgt_cache->count++] = sgt_buf; +} + /* Create a frame descriptor based on a fragmented skb */ static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv, struct sk_buff *skb, @@ -810,7 +842,6 @@ static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv, err = -ENOMEM; goto sgt_buf_alloc_failed; } - memset(sgt_buf, 0, sgt_buf_size); sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset); @@ -875,7 +906,6 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv, void **swa_addr) { struct device *dev = priv->net_dev->dev.parent; - struct dpaa2_eth_sgt_cache *sgt_cache; struct dpaa2_sg_entry *sgt; struct dpaa2_eth_swa *swa; dma_addr_t addr, sgt_addr; @@ -884,17 +914,10 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv, int err; /* Prepare the HW SGT structure */ - sgt_cache = this_cpu_ptr(priv->sgt_cache); sgt_buf_size = priv->tx_data_offset + sizeof(struct dpaa2_sg_entry); - - if (sgt_cache->count == 0) - sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN); - else - sgt_buf = sgt_cache->buf[--sgt_cache->count]; + sgt_buf = dpaa2_eth_sgt_get(priv); if (unlikely(!sgt_buf)) return -ENOMEM; - memset(sgt_buf, 0, sgt_buf_size); - sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset); addr = dma_map_single(dev, skb->data, skb->len, DMA_BIDIRECTIONAL); @@ -933,10 +956,7 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv, sgt_map_failed: dma_unmap_single(dev, addr, skb->len, DMA_BIDIRECTIONAL); data_map_failed: - if (sgt_cache->count >= DPAA2_ETH_SGT_CACHE_SIZE) - skb_free_frag(sgt_buf); - else - sgt_cache->buf[sgt_cache->count++] = sgt_buf; + dpaa2_eth_sgt_recycle(priv, sgt_buf); return err; } @@ -1004,8 +1024,6 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv, struct dpaa2_eth_swa *swa; u8 fd_format = dpaa2_fd_get_format(fd); u32 fd_len = dpaa2_fd_get_len(fd); - - struct dpaa2_eth_sgt_cache *sgt_cache; struct dpaa2_sg_entry *sgt; fd_addr = dpaa2_fd_get_addr(fd); @@ -1082,15 +1100,10 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv, /* Free SGT buffer allocated on tx */ if (fd_format != dpaa2_fd_single) { - sgt_cache = this_cpu_ptr(priv->sgt_cache); - if (swa->type == DPAA2_ETH_SWA_SG) { + if (swa->type == DPAA2_ETH_SWA_SG) skb_free_frag(buffer_start); - } else { - if (sgt_cache->count >= DPAA2_ETH_SGT_CACHE_SIZE) - skb_free_frag(buffer_start); - else - sgt_cache->buf[sgt_cache->count++] = buffer_start; - } + else + dpaa2_eth_sgt_recycle(priv, buffer_start); } /* Move on with skb release */ From a4218aef7c86689339a808a89d98611c26f91201 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 9 Feb 2022 11:23:32 +0200 Subject: [PATCH 4/7] dpaa2-eth: use the S/G table cache also for the normal S/G path Instead of allocating memory for an S/G table each time a nonlinear skb is processed, and then freeing it on the Tx confirmation path, use the S/G table cache in order to reuse the memory. For this to work we have to change the size of the cached buffers so that it can hold the maximum number of scatterlist entries. Other than that, each allocate/free call is replaced by a call to the dpaa2_eth_sgt_get/dpaa2_eth_sgt_recycle functions, introduced in the previous patch. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 15 ++++++--------- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 2 ++ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 006ab355a21d..73e242fad000 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -767,7 +767,8 @@ static void *dpaa2_eth_sgt_get(struct dpaa2_eth_priv *priv) int sgt_buf_size; sgt_cache = this_cpu_ptr(priv->sgt_cache); - sgt_buf_size = priv->tx_data_offset + sizeof(struct dpaa2_sg_entry); + sgt_buf_size = priv->tx_data_offset + + DPAA2_ETH_SG_ENTRIES_MAX * sizeof(struct dpaa2_sg_entry); if (sgt_cache->count == 0) sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN); @@ -837,7 +838,7 @@ static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv, /* Prepare the HW SGT structure */ sgt_buf_size = priv->tx_data_offset + sizeof(struct dpaa2_sg_entry) * num_dma_bufs; - sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN); + sgt_buf = dpaa2_eth_sgt_get(priv); if (unlikely(!sgt_buf)) { err = -ENOMEM; goto sgt_buf_alloc_failed; @@ -886,7 +887,7 @@ static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv, return 0; dma_map_single_failed: - skb_free_frag(sgt_buf); + dpaa2_eth_sgt_recycle(priv, sgt_buf); sgt_buf_alloc_failed: dma_unmap_sg(dev, scl, num_sg, DMA_BIDIRECTIONAL); dma_map_sg_failed: @@ -1099,12 +1100,8 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv, } /* Free SGT buffer allocated on tx */ - if (fd_format != dpaa2_fd_single) { - if (swa->type == DPAA2_ETH_SWA_SG) - skb_free_frag(buffer_start); - else - dpaa2_eth_sgt_recycle(priv, buffer_start); - } + if (fd_format != dpaa2_fd_single) + dpaa2_eth_sgt_recycle(priv, buffer_start); /* Move on with skb release */ napi_consume_skb(skb, in_napi); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index e54e70ebdd05..7f9c6f4dea53 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -493,6 +493,8 @@ struct dpaa2_eth_trap_data { struct dpaa2_eth_priv *priv; }; +#define DPAA2_ETH_SG_ENTRIES_MAX (PAGE_SIZE / sizeof(struct scatterlist)) + #define DPAA2_ETH_DEFAULT_COPYBREAK 512 /* Driver private data */ From a4ca448e8bfef0def568a6283a69d9cf018fd2c8 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 9 Feb 2022 11:23:33 +0200 Subject: [PATCH 5/7] dpaa2-eth: work with an array of FDs Up until now, the __dpaa2_eth_tx function used a single FD on the stack to construct the structure to be enqueued. Since we are now preparing the ground work to add support for TSO done in software at the driver level, the same function needs to work with an array of FDs and enqueue as many as the build_*_fd functions create. Make the necessary adjustments in order to do this. These include: keeping an array of FDs in a percpu structure, cleaning up the necessary FDs before populating it and then, retrying the enqueue process up till all the generated FDs were enqueued or until we reach the maximum number retries. This patch does not change the fact that only a single FD will result from a __dpaa2_eth_tx call but rather just creates the necessary changes for the next patch. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- .../net/ethernet/freescale/dpaa2/dpaa2-eth.c | 56 +++++++++++++------ .../net/ethernet/freescale/dpaa2/dpaa2-eth.h | 7 +++ 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 73e242fad000..d9871b9c1aad 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -878,6 +878,7 @@ static int dpaa2_eth_build_sg_fd(struct dpaa2_eth_priv *priv, err = -ENOMEM; goto dma_map_single_failed; } + memset(fd, 0, sizeof(struct dpaa2_fd)); dpaa2_fd_set_offset(fd, priv->tx_data_offset); dpaa2_fd_set_format(fd, dpaa2_fd_sg); dpaa2_fd_set_addr(fd, addr); @@ -946,6 +947,7 @@ static int dpaa2_eth_build_sg_fd_single_buf(struct dpaa2_eth_priv *priv, goto sgt_map_failed; } + memset(fd, 0, sizeof(struct dpaa2_fd)); dpaa2_fd_set_offset(fd, priv->tx_data_offset); dpaa2_fd_set_format(fd, dpaa2_fd_sg); dpaa2_fd_set_addr(fd, sgt_addr); @@ -998,6 +1000,7 @@ static int dpaa2_eth_build_single_fd(struct dpaa2_eth_priv *priv, if (unlikely(dma_mapping_error(dev, addr))) return -ENOMEM; + memset(fd, 0, sizeof(struct dpaa2_fd)); dpaa2_fd_set_addr(fd, addr); dpaa2_fd_set_offset(fd, (u16)(skb->data - buffer_start)); dpaa2_fd_set_len(fd, skb->len); @@ -1111,12 +1114,14 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, struct net_device *net_dev) { struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + int total_enqueued = 0, retries = 0, enqueued; struct dpaa2_eth_drv_stats *percpu_extras; struct rtnl_link_stats64 *percpu_stats; unsigned int needed_headroom; + int num_fds = 1, max_retries; struct dpaa2_eth_fq *fq; struct netdev_queue *nq; - struct dpaa2_fd fd; + struct dpaa2_fd *fd; u16 queue_mapping; u8 prio = 0; int err, i; @@ -1125,6 +1130,7 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, percpu_stats = this_cpu_ptr(priv->percpu_stats); percpu_extras = this_cpu_ptr(priv->percpu_extras); + fd = (this_cpu_ptr(priv->fd))->array; needed_headroom = dpaa2_eth_needed_headroom(skb); @@ -1139,20 +1145,22 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, } /* Setup the FD fields */ - memset(&fd, 0, sizeof(fd)); if (skb_is_nonlinear(skb)) { - err = dpaa2_eth_build_sg_fd(priv, skb, &fd, &swa); + err = dpaa2_eth_build_sg_fd(priv, skb, fd, &swa); percpu_extras->tx_sg_frames++; percpu_extras->tx_sg_bytes += skb->len; + fd_len = dpaa2_fd_get_len(fd); } else if (skb_headroom(skb) < needed_headroom) { - err = dpaa2_eth_build_sg_fd_single_buf(priv, skb, &fd, &swa); + err = dpaa2_eth_build_sg_fd_single_buf(priv, skb, fd, &swa); percpu_extras->tx_sg_frames++; percpu_extras->tx_sg_bytes += skb->len; percpu_extras->tx_converted_sg_frames++; percpu_extras->tx_converted_sg_bytes += skb->len; + fd_len = dpaa2_fd_get_len(fd); } else { - err = dpaa2_eth_build_single_fd(priv, skb, &fd, &swa); + err = dpaa2_eth_build_single_fd(priv, skb, fd, &swa); + fd_len = dpaa2_fd_get_len(fd); } if (unlikely(err)) { @@ -1161,10 +1169,11 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, } if (skb->cb[0]) - dpaa2_eth_enable_tx_tstamp(priv, &fd, swa, skb); + dpaa2_eth_enable_tx_tstamp(priv, fd, swa, skb); /* Tracing point */ - trace_dpaa2_tx_fd(net_dev, &fd); + for (i = 0; i < num_fds; i++) + trace_dpaa2_tx_fd(net_dev, &fd[i]); /* TxConf FQ selection relies on queue id from the stack. * In case of a forwarded frame from another DPNI interface, we choose @@ -1184,27 +1193,32 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, queue_mapping %= dpaa2_eth_queue_count(priv); } fq = &priv->fq[queue_mapping]; - - fd_len = dpaa2_fd_get_len(&fd); nq = netdev_get_tx_queue(net_dev, queue_mapping); netdev_tx_sent_queue(nq, fd_len); /* Everything that happens after this enqueues might race with * the Tx confirmation callback for this frame */ - for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, &fd, prio, 1, NULL); - if (err != -EBUSY) - break; + max_retries = num_fds * DPAA2_ETH_ENQUEUE_RETRIES; + while (total_enqueued < num_fds && retries < max_retries) { + err = priv->enqueue(priv, fq, &fd[total_enqueued], + prio, num_fds - total_enqueued, &enqueued); + if (err == -EBUSY) { + retries++; + continue; + } + + total_enqueued += enqueued; } - percpu_extras->tx_portal_busy += i; + percpu_extras->tx_portal_busy += retries; + if (unlikely(err < 0)) { percpu_stats->tx_errors++; /* Clean up everything, including freeing the skb */ - dpaa2_eth_free_tx_fd(priv, fq, &fd, false); + dpaa2_eth_free_tx_fd(priv, fq, fd, false); netdev_tx_completed_queue(nq, 1, fd_len); } else { - percpu_stats->tx_packets++; + percpu_stats->tx_packets += total_enqueued; percpu_stats->tx_bytes += fd_len; } @@ -4406,6 +4420,13 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev) goto err_alloc_sgt_cache; } + priv->fd = alloc_percpu(*priv->fd); + if (!priv->fd) { + dev_err(dev, "alloc_percpu(fds) failed\n"); + err = -ENOMEM; + goto err_alloc_fds; + } + err = dpaa2_eth_netdev_init(net_dev); if (err) goto err_netdev_init; @@ -4493,6 +4514,8 @@ err_poll_thread: err_alloc_rings: err_csum: err_netdev_init: + free_percpu(priv->fd); +err_alloc_fds: free_percpu(priv->sgt_cache); err_alloc_sgt_cache: free_percpu(priv->percpu_extras); @@ -4548,6 +4571,7 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev) fsl_mc_free_irqs(ls_dev); dpaa2_eth_free_rings(priv); + free_percpu(priv->fd); free_percpu(priv->sgt_cache); free_percpu(priv->percpu_stats); free_percpu(priv->percpu_extras); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 7f9c6f4dea53..64e4aaebdcb2 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -497,6 +497,11 @@ struct dpaa2_eth_trap_data { #define DPAA2_ETH_DEFAULT_COPYBREAK 512 +#define DPAA2_ETH_ENQUEUE_MAX_FDS 200 +struct dpaa2_eth_fds { + struct dpaa2_fd array[DPAA2_ETH_ENQUEUE_MAX_FDS]; +}; + /* Driver private data */ struct dpaa2_eth_priv { struct net_device *net_dev; @@ -579,6 +584,8 @@ struct dpaa2_eth_priv { struct devlink_port devlink_port; u32 rx_copybreak; + + struct dpaa2_eth_fds __percpu *fd; }; struct dpaa2_eth_devlink_priv { From 3dc709e0cd47c602a8d1a6747f1a91e9737eeed3 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 9 Feb 2022 11:23:34 +0200 Subject: [PATCH 6/7] dpaa2-eth: add support for software TSO This patch adds support for driver level TSO in the enetc driver using the TSO API. There is not much to say about this specific implementation. We are using the usual tso_build_hdr(), tso_build_data() to create each data segment, we create an array of S/G FDs where the first S/G entry is referencing the header data and the remaining ones the data portion. For the S/G Table buffer we use the same cache of buffers used on the other non-GSO cases - dpaa2_eth_sgt_get() and dpaa2_eth_sgt_recycle(). We cannot keep a DMA coherent buffer for all the TSO headers because the DPAA2 architecture does not work in a ring based fashion so we just allocate a buffer each time. Even with these limitations we get the following improvement in TCP termination on the LX2160A SoC, on a single A72 core running at 2.2GHz. before: 6.38Gbit/s after: 8.48Gbit/s Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- .../net/ethernet/freescale/dpaa2/dpaa2-eth.c | 210 ++++++++++++++++-- .../net/ethernet/freescale/dpaa2/dpaa2-eth.h | 9 + .../ethernet/freescale/dpaa2/dpaa2-ethtool.c | 2 + 3 files changed, 205 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index d9871b9c1aad..88534aa29af2 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "dpaa2-eth.h" @@ -1029,6 +1030,8 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv, u8 fd_format = dpaa2_fd_get_format(fd); u32 fd_len = dpaa2_fd_get_len(fd); struct dpaa2_sg_entry *sgt; + int should_free_skb = 1; + int i; fd_addr = dpaa2_fd_get_addr(fd); buffer_start = dpaa2_iova_to_virt(priv->iommu_domain, fd_addr); @@ -1060,6 +1063,28 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv, /* Unmap the SGT buffer */ dma_unmap_single(dev, fd_addr, swa->sg.sgt_size, DMA_BIDIRECTIONAL); + } else if (swa->type == DPAA2_ETH_SWA_SW_TSO) { + skb = swa->tso.skb; + + sgt = (struct dpaa2_sg_entry *)(buffer_start + + priv->tx_data_offset); + + /* Unmap and free the header */ + dma_unmap_single(dev, dpaa2_sg_get_addr(sgt), TSO_HEADER_SIZE, + DMA_TO_DEVICE); + kfree(dpaa2_iova_to_virt(priv->iommu_domain, dpaa2_sg_get_addr(sgt))); + + /* Unmap the other SG entries for the data */ + for (i = 1; i < swa->tso.num_sg; i++) + dma_unmap_single(dev, dpaa2_sg_get_addr(&sgt[i]), + dpaa2_sg_get_len(&sgt[i]), DMA_TO_DEVICE); + + /* Unmap the SGT buffer */ + dma_unmap_single(dev, fd_addr, swa->sg.sgt_size, + DMA_BIDIRECTIONAL); + + if (!swa->tso.is_last_fd) + should_free_skb = 0; } else { skb = swa->single.skb; @@ -1088,26 +1113,172 @@ static void dpaa2_eth_free_tx_fd(struct dpaa2_eth_priv *priv, } /* Get the timestamp value */ - if (skb->cb[0] == TX_TSTAMP) { - struct skb_shared_hwtstamps shhwtstamps; - __le64 *ts = dpaa2_get_ts(buffer_start, true); - u64 ns; + if (swa->type != DPAA2_ETH_SWA_SW_TSO) { + if (skb->cb[0] == TX_TSTAMP) { + struct skb_shared_hwtstamps shhwtstamps; + __le64 *ts = dpaa2_get_ts(buffer_start, true); + u64 ns; - memset(&shhwtstamps, 0, sizeof(shhwtstamps)); + memset(&shhwtstamps, 0, sizeof(shhwtstamps)); - ns = DPAA2_PTP_CLK_PERIOD_NS * le64_to_cpup(ts); - shhwtstamps.hwtstamp = ns_to_ktime(ns); - skb_tstamp_tx(skb, &shhwtstamps); - } else if (skb->cb[0] == TX_TSTAMP_ONESTEP_SYNC) { - mutex_unlock(&priv->onestep_tstamp_lock); + ns = DPAA2_PTP_CLK_PERIOD_NS * le64_to_cpup(ts); + shhwtstamps.hwtstamp = ns_to_ktime(ns); + skb_tstamp_tx(skb, &shhwtstamps); + } else if (skb->cb[0] == TX_TSTAMP_ONESTEP_SYNC) { + mutex_unlock(&priv->onestep_tstamp_lock); + } } /* Free SGT buffer allocated on tx */ if (fd_format != dpaa2_fd_single) dpaa2_eth_sgt_recycle(priv, buffer_start); - /* Move on with skb release */ - napi_consume_skb(skb, in_napi); + /* Move on with skb release. If we are just confirming multiple FDs + * from the same TSO skb then only the last one will need to free the + * skb. + */ + if (should_free_skb) + napi_consume_skb(skb, in_napi); +} + +static int dpaa2_eth_build_gso_fd(struct dpaa2_eth_priv *priv, + struct sk_buff *skb, struct dpaa2_fd *fd, + int *num_fds, u32 *total_fds_len) +{ + struct device *dev = priv->net_dev->dev.parent; + int hdr_len, total_len, data_left, fd_len; + int num_sge, err, i, sgt_buf_size; + struct dpaa2_fd *fd_start = fd; + struct dpaa2_sg_entry *sgt; + struct dpaa2_eth_swa *swa; + dma_addr_t sgt_addr, addr; + dma_addr_t tso_hdr_dma; + unsigned int index = 0; + struct tso_t tso; + char *tso_hdr; + void *sgt_buf; + + /* Initialize the TSO handler, and prepare the first payload */ + hdr_len = tso_start(skb, &tso); + *total_fds_len = 0; + + total_len = skb->len - hdr_len; + while (total_len > 0) { + /* Prepare the HW SGT structure for this frame */ + sgt_buf = dpaa2_eth_sgt_get(priv); + if (unlikely(!sgt_buf)) { + netdev_err(priv->net_dev, "dpaa2_eth_sgt_get() failed\n"); + err = -ENOMEM; + goto err_sgt_get; + } + sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset); + + /* Determine the data length of this frame */ + data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len); + total_len -= data_left; + fd_len = data_left + hdr_len; + + /* Prepare packet headers: MAC + IP + TCP */ + tso_hdr = kmalloc(TSO_HEADER_SIZE, GFP_ATOMIC); + if (!tso_hdr) { + err = -ENOMEM; + goto err_alloc_tso_hdr; + } + + tso_build_hdr(skb, tso_hdr, &tso, data_left, total_len == 0); + tso_hdr_dma = dma_map_single(dev, tso_hdr, TSO_HEADER_SIZE, DMA_TO_DEVICE); + if (dma_mapping_error(dev, tso_hdr_dma)) { + netdev_err(priv->net_dev, "dma_map_single(tso_hdr) failed\n"); + err = -ENOMEM; + goto err_map_tso_hdr; + } + + /* Setup the SG entry for the header */ + dpaa2_sg_set_addr(sgt, tso_hdr_dma); + dpaa2_sg_set_len(sgt, hdr_len); + dpaa2_sg_set_final(sgt, data_left > 0 ? false : true); + + /* Compose the SG entries for each fragment of data */ + num_sge = 1; + while (data_left > 0) { + int size; + + /* Move to the next SG entry */ + sgt++; + size = min_t(int, tso.size, data_left); + + addr = dma_map_single(dev, tso.data, size, DMA_TO_DEVICE); + if (dma_mapping_error(dev, addr)) { + netdev_err(priv->net_dev, "dma_map_single(tso.data) failed\n"); + err = -ENOMEM; + goto err_map_data; + } + dpaa2_sg_set_addr(sgt, addr); + dpaa2_sg_set_len(sgt, size); + dpaa2_sg_set_final(sgt, size == data_left ? true : false); + + num_sge++; + + /* Build the data for the __next__ fragment */ + data_left -= size; + tso_build_data(skb, &tso, size); + } + + /* Store the skb backpointer in the SGT buffer */ + sgt_buf_size = priv->tx_data_offset + num_sge * sizeof(struct dpaa2_sg_entry); + swa = (struct dpaa2_eth_swa *)sgt_buf; + swa->type = DPAA2_ETH_SWA_SW_TSO; + swa->tso.skb = skb; + swa->tso.num_sg = num_sge; + swa->tso.sgt_size = sgt_buf_size; + swa->tso.is_last_fd = total_len == 0 ? 1 : 0; + + /* Separately map the SGT buffer */ + sgt_addr = dma_map_single(dev, sgt_buf, sgt_buf_size, DMA_BIDIRECTIONAL); + if (unlikely(dma_mapping_error(dev, sgt_addr))) { + netdev_err(priv->net_dev, "dma_map_single(sgt_buf) failed\n"); + err = -ENOMEM; + goto err_map_sgt; + } + + /* Setup the frame descriptor */ + memset(fd, 0, sizeof(struct dpaa2_fd)); + dpaa2_fd_set_offset(fd, priv->tx_data_offset); + dpaa2_fd_set_format(fd, dpaa2_fd_sg); + dpaa2_fd_set_addr(fd, sgt_addr); + dpaa2_fd_set_len(fd, fd_len); + dpaa2_fd_set_ctrl(fd, FD_CTRL_PTA); + + *total_fds_len += fd_len; + /* Advance to the next frame descriptor */ + fd++; + index++; + } + + *num_fds = index; + + return 0; + +err_map_sgt: +err_map_data: + /* Unmap all the data S/G entries for the current FD */ + sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset); + for (i = 1; i < num_sge; i++) + dma_unmap_single(dev, dpaa2_sg_get_addr(&sgt[i]), + dpaa2_sg_get_len(&sgt[i]), DMA_TO_DEVICE); + + /* Unmap the header entry */ + dma_unmap_single(dev, tso_hdr_dma, TSO_HEADER_SIZE, DMA_TO_DEVICE); +err_map_tso_hdr: + kfree(tso_hdr); +err_alloc_tso_hdr: + dpaa2_eth_sgt_recycle(priv, sgt_buf); +err_sgt_get: + /* Free all the other FDs that were already fully created */ + for (i = 0; i < index; i++) + dpaa2_eth_free_tx_fd(priv, NULL, &fd_start[i], false); + + return err; } static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, @@ -1123,10 +1294,10 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, struct netdev_queue *nq; struct dpaa2_fd *fd; u16 queue_mapping; + void *swa = NULL; u8 prio = 0; int err, i; u32 fd_len; - void *swa; percpu_stats = this_cpu_ptr(priv->percpu_stats); percpu_extras = this_cpu_ptr(priv->percpu_extras); @@ -1146,7 +1317,13 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, /* Setup the FD fields */ - if (skb_is_nonlinear(skb)) { + if (skb_is_gso(skb)) { + err = dpaa2_eth_build_gso_fd(priv, skb, fd, &num_fds, &fd_len); + percpu_extras->tx_sg_frames += num_fds; + percpu_extras->tx_sg_bytes += fd_len; + percpu_extras->tx_tso_frames += num_fds; + percpu_extras->tx_tso_bytes += fd_len; + } else if (skb_is_nonlinear(skb)) { err = dpaa2_eth_build_sg_fd(priv, skb, fd, &swa); percpu_extras->tx_sg_frames++; percpu_extras->tx_sg_bytes += skb->len; @@ -1168,7 +1345,7 @@ static netdev_tx_t __dpaa2_eth_tx(struct sk_buff *skb, goto err_build_fd; } - if (skb->cb[0]) + if (swa && skb->cb[0]) dpaa2_eth_enable_tx_tstamp(priv, fd, swa, skb); /* Tracing point */ @@ -4138,7 +4315,8 @@ static int dpaa2_eth_netdev_init(struct net_device *net_dev) net_dev->features = NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | - NETIF_F_LLTX | NETIF_F_HW_TC; + NETIF_F_LLTX | NETIF_F_HW_TC | NETIF_F_TSO; + net_dev->gso_max_segs = DPAA2_ETH_ENQUEUE_MAX_FDS; net_dev->hw_features = net_dev->features; if (priv->dpni_attrs.vlan_filter_entries) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 64e4aaebdcb2..b79831cd1a94 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -122,6 +122,7 @@ enum dpaa2_eth_swa_type { DPAA2_ETH_SWA_SINGLE, DPAA2_ETH_SWA_SG, DPAA2_ETH_SWA_XDP, + DPAA2_ETH_SWA_SW_TSO, }; /* Must keep this struct smaller than DPAA2_ETH_SWA_SIZE */ @@ -142,6 +143,12 @@ struct dpaa2_eth_swa { int dma_size; struct xdp_frame *xdpf; } xdp; + struct { + struct sk_buff *skb; + int num_sg; + int sgt_size; + int is_last_fd; + } tso; }; }; @@ -354,6 +361,8 @@ struct dpaa2_eth_drv_stats { __u64 tx_conf_bytes; __u64 tx_sg_frames; __u64 tx_sg_bytes; + __u64 tx_tso_frames; + __u64 tx_tso_bytes; __u64 rx_sg_frames; __u64 rx_sg_bytes; /* Linear skbs sent as a S/G FD due to insufficient headroom */ diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c index 3fdbf87dccb1..eea7d7a07c00 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c @@ -44,6 +44,8 @@ static char dpaa2_ethtool_extras[][ETH_GSTRING_LEN] = { "[drv] tx conf bytes", "[drv] tx sg frames", "[drv] tx sg bytes", + "[drv] tx tso frames", + "[drv] tx tso bytes", "[drv] rx sg frames", "[drv] rx sg bytes", "[drv] tx converted sg frames", From 86ec882f59a070e07d1e74c5b03340180ad90a1e Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 9 Feb 2022 11:23:35 +0200 Subject: [PATCH 7/7] soc: fsl: dpio: read the consumer index from the cache inhibited area Once we added support in the dpaa2-eth for driver level software TSO we observed the following situation: if the EQCR CI (consumer index) is read from the cache-enabled area we sometimes end up with a computed value of available enqueue entries bigger than the size of the ring. This eventually will lead to the multiple enqueue of the same FD which will determine the same FD to end up on the Tx confirmation path and the same skb being freed twice. Just read the consumer index from the cache inhibited area so that we avoid this situation. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/soc/fsl/dpio/qbman-portal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/soc/fsl/dpio/qbman-portal.c b/drivers/soc/fsl/dpio/qbman-portal.c index 058b78fac5e3..0a3fb6c115f4 100644 --- a/drivers/soc/fsl/dpio/qbman-portal.c +++ b/drivers/soc/fsl/dpio/qbman-portal.c @@ -743,8 +743,8 @@ int qbman_swp_enqueue_multiple_mem_back(struct qbman_swp *s, full_mask = s->eqcr.pi_ci_mask; if (!s->eqcr.available) { eqcr_ci = s->eqcr.ci; - p = s->addr_cena + QBMAN_CENA_SWP_EQCR_CI_MEMBACK; - s->eqcr.ci = *p & full_mask; + s->eqcr.ci = qbman_read_register(s, QBMAN_CINH_SWP_EQCR_CI); + s->eqcr.ci &= full_mask; s->eqcr.available = qm_cyc_diff(s->eqcr.pi_ring_size, eqcr_ci, s->eqcr.ci); if (!s->eqcr.available) { @@ -887,8 +887,8 @@ int qbman_swp_enqueue_multiple_desc_mem_back(struct qbman_swp *s, full_mask = s->eqcr.pi_ci_mask; if (!s->eqcr.available) { eqcr_ci = s->eqcr.ci; - p = s->addr_cena + QBMAN_CENA_SWP_EQCR_CI_MEMBACK; - s->eqcr.ci = *p & full_mask; + s->eqcr.ci = qbman_read_register(s, QBMAN_CINH_SWP_EQCR_CI); + s->eqcr.ci &= full_mask; s->eqcr.available = qm_cyc_diff(s->eqcr.pi_ring_size, eqcr_ci, s->eqcr.ci); if (!s->eqcr.available)