bb32347876
To correlate the hardware RX timestamp with something, add tracking of two software timestamps, both using clock source CLOCK_TAI (see the description in man clock_gettime(2)).

XDP metadata is extended with xdp_timestamp for capturing when XDP received the packet. It is populated with the BPF helper bpf_ktime_get_tai_ns(). I could not find a BPF helper for getting CLOCK_REALTIME, which would have been preferred. In userspace, when AF_XDP sees the packet, another software timestamp is recorded via clock_gettime(), also using clock source CLOCK_TAI.

Example output shortly after loading the igc driver:

  poll: 1 (0) skip=1 fail=0 redir=2
  xsk_ring_cons__peek: 1
  0x12557a8: rx_desc[1]->addr=100000000009000 addr=9100 comp_addr=9000
  rx_hash: 0x82A96531 with RSS type:0x1
  rx_timestamp: 1681740540304898909 (sec:1681740540.3049)
  XDP RX-time: 1681740577304958316 (sec:1681740577.3050) delta sec:37.0001 (37000059.407 usec)
  AF_XDP time: 1681740577305051315 (sec:1681740577.3051) delta sec:0.0001 (92.999 usec)
  0x12557a8: complete idx=9 addr=9000

The first observation is the 37 sec difference between the RX HW and XDP timestamps, which indicates that the hardware is likely using clock source CLOCK_REALTIME, because (as of this writing) CLOCK_TAI is initialised with a 37 sec offset.

The 93 usec (microsec) difference between the XDP and AF_XDP userspace timestamps is the userspace wakeup time. On this hardware it was caused by CPU idle sleep states, which can be reduced by tuning /dev/cpu_dma_latency.

View the current requested/allowed latency bound via:

  hexdump --format '"%d\n"' /dev/cpu_dma_latency

More explanation of the output, and how it can be used to identify clock drift of the HW clock, can be found here[1]:

[1] https://github.com/xdp-project/xdp-project/blob/master/areas/hints/xdp_hints_kfuncs02_driver_igc.org

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Song Yoong Siang <yoong.siang.song@intel.com>
Link: https://lore.kernel.org/bpf/168182466298.616355.2544377890818617459.stgit@firesoul
489 lines
11 KiB
C
489 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/* Reference program for verifying XDP metadata on real HW. Functional test
|
|
* only, doesn't test the performance.
|
|
*
|
|
* RX:
|
|
* - UDP 9091 packets are diverted into AF_XDP
|
|
* - Metadata verified:
|
|
* - rx_timestamp
|
|
* - rx_hash
|
|
*
|
|
* TX:
|
|
* - TBD
|
|
*/
|
|
|
|
#include <test_progs.h>
|
|
#include <network_helpers.h>
|
|
#include "xdp_hw_metadata.skel.h"
|
|
#include "xsk.h"
|
|
|
|
#include <error.h>
|
|
#include <linux/errqueue.h>
|
|
#include <linux/if_link.h>
|
|
#include <linux/net_tstamp.h>
|
|
#include <linux/udp.h>
|
|
#include <linux/sockios.h>
|
|
#include <sys/mman.h>
|
|
#include <net/if.h>
|
|
#include <poll.h>
|
|
#include <time.h>
|
|
|
|
#include "xdp_metadata.h"
|
|
|
|
/* UMEM geometry: UMEM_NUM frames, each one default-sized XSK frame. */
#define UMEM_NUM 16
#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
#define UMEM_SIZE (UMEM_NUM * UMEM_FRAME_SIZE)
/* Flags passed both when attaching and when detaching the XDP program. */
#define XDP_FLAGS (XDP_FLAGS_REPLACE | XDP_FLAGS_DRV_MODE)
|
|
|
|
struct xsk {
|
|
void *umem_area;
|
|
struct xsk_umem *umem;
|
|
struct xsk_ring_prod fill;
|
|
struct xsk_ring_cons comp;
|
|
struct xsk_ring_prod tx;
|
|
struct xsk_ring_cons rx;
|
|
struct xsk_socket *socket;
|
|
};
|
|
|
|
/* Program-wide state: filled in by main(), consumed by verify_metadata()
 * and cleanup().
 */
struct xdp_hw_metadata *bpf_obj;	/* BPF skeleton, NULL until opened */
struct xsk *rx_xsk;			/* array with one entry per RX queue */
const char *ifname;			/* device name from the command line */
int ifindex;				/* interface index of ifname */
int rxq;				/* number of entries in rx_xsk */
|
|
|
|
/* Intentionally empty stub; the symbol exists for network_helpers.c. */
void test__fail(void)
{
}
|
|
|
|
static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
|
|
{
|
|
int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
|
|
const struct xsk_socket_config socket_config = {
|
|
.rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
|
|
.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
|
|
.bind_flags = XDP_COPY,
|
|
};
|
|
const struct xsk_umem_config umem_config = {
|
|
.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
|
|
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
|
|
.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
|
|
.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
|
|
};
|
|
__u32 idx;
|
|
u64 addr;
|
|
int ret;
|
|
int i;
|
|
|
|
xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
|
|
if (xsk->umem_area == MAP_FAILED)
|
|
return -ENOMEM;
|
|
|
|
ret = xsk_umem__create(&xsk->umem,
|
|
xsk->umem_area, UMEM_SIZE,
|
|
&xsk->fill,
|
|
&xsk->comp,
|
|
&umem_config);
|
|
if (ret)
|
|
return ret;
|
|
|
|
ret = xsk_socket__create(&xsk->socket, ifindex, queue_id,
|
|
xsk->umem,
|
|
&xsk->rx,
|
|
&xsk->tx,
|
|
&socket_config);
|
|
if (ret)
|
|
return ret;
|
|
|
|
/* First half of umem is for TX. This way address matches 1-to-1
|
|
* to the completion queue index.
|
|
*/
|
|
|
|
for (i = 0; i < UMEM_NUM / 2; i++) {
|
|
addr = i * UMEM_FRAME_SIZE;
|
|
printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr);
|
|
}
|
|
|
|
/* Second half of umem is for RX. */
|
|
|
|
ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx);
|
|
for (i = 0; i < UMEM_NUM / 2; i++) {
|
|
addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
|
|
printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
|
|
*xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
|
|
}
|
|
xsk_ring_prod__submit(&xsk->fill, ret);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void close_xsk(struct xsk *xsk)
|
|
{
|
|
if (xsk->umem)
|
|
xsk_umem__delete(xsk->umem);
|
|
if (xsk->socket)
|
|
xsk_socket__delete(xsk->socket);
|
|
munmap(xsk->umem_area, UMEM_SIZE);
|
|
}
|
|
|
|
static void refill_rx(struct xsk *xsk, __u64 addr)
|
|
{
|
|
__u32 idx;
|
|
|
|
if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) {
|
|
printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
|
|
*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
|
|
xsk_ring_prod__submit(&xsk->fill, 1);
|
|
}
|
|
}
|
|
|
|
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */

/* Read clock @clock_id and return its value in nanoseconds.
 *
 * See man clock_gettime(2) for the available clock ids. A failing
 * clock_gettime() terminates the program with exit status 1, the same
 * status every other fatal error in this file uses.
 */
static __u64 gettime(clockid_t clock_id)
{
	struct timespec t;

	if (clock_gettime(clock_id, &t) != 0)
		error(1, errno, "Error with clock_gettime()");

	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
}
|
|
|
|
/* Print the XDP metadata stored directly in front of packet @data.
 *
 * Reports the RX hash (or its error code) and the hardware RX timestamp.
 * When a hardware timestamp is present, also prints the timestamp taken by
 * the XDP program and a fresh userspace reading of @clock_id, together with
 * the deltas HW -> XDP and XDP -> userspace.
 */
static void verify_xdp_metadata(void *data, clockid_t clock_id)
{
	/* One struct xdp_meta immediately precedes the packet data. */
	struct xdp_meta *meta = (struct xdp_meta *)data - 1;
	__u64 user_ns, xdp_ns;
	__s64 hw_to_xdp, xdp_to_user;

	if (meta->rx_hash_err >= 0)
		printf("rx_hash: 0x%X with RSS type:0x%X\n",
		       meta->rx_hash, meta->rx_hash_type);
	else
		printf("No rx_hash err=%d\n", meta->rx_hash_err);

	printf("rx_timestamp: %llu (sec:%0.4f)\n", meta->rx_timestamp,
	       (double)meta->rx_timestamp / NANOSEC_PER_SEC);

	/* Without a HW timestamp there is nothing to correlate against. */
	if (!meta->rx_timestamp)
		return;

	user_ns = gettime(clock_id);
	xdp_ns = meta->xdp_timestamp;
	hw_to_xdp = xdp_ns - meta->rx_timestamp;
	xdp_to_user = user_ns - xdp_ns;

	printf("XDP RX-time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n",
	       xdp_ns, (double)xdp_ns / NANOSEC_PER_SEC,
	       (double)hw_to_xdp / NANOSEC_PER_SEC,
	       (double)hw_to_xdp / 1000);

	printf("AF_XDP time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n",
	       user_ns, (double)user_ns / NANOSEC_PER_SEC,
	       (double)xdp_to_user / NANOSEC_PER_SEC,
	       (double)xdp_to_user / 1000);
}
|
|
|
|
static void verify_skb_metadata(int fd)
|
|
{
|
|
char cmsg_buf[1024];
|
|
char packet_buf[128];
|
|
|
|
struct scm_timestamping *ts;
|
|
struct iovec packet_iov;
|
|
struct cmsghdr *cmsg;
|
|
struct msghdr hdr;
|
|
|
|
memset(&hdr, 0, sizeof(hdr));
|
|
hdr.msg_iov = &packet_iov;
|
|
hdr.msg_iovlen = 1;
|
|
packet_iov.iov_base = packet_buf;
|
|
packet_iov.iov_len = sizeof(packet_buf);
|
|
|
|
hdr.msg_control = cmsg_buf;
|
|
hdr.msg_controllen = sizeof(cmsg_buf);
|
|
|
|
if (recvmsg(fd, &hdr, 0) < 0)
|
|
error(1, errno, "recvmsg");
|
|
|
|
for (cmsg = CMSG_FIRSTHDR(&hdr); cmsg != NULL;
|
|
cmsg = CMSG_NXTHDR(&hdr, cmsg)) {
|
|
|
|
if (cmsg->cmsg_level != SOL_SOCKET)
|
|
continue;
|
|
|
|
switch (cmsg->cmsg_type) {
|
|
case SCM_TIMESTAMPING:
|
|
ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
|
|
if (ts->ts[2].tv_sec || ts->ts[2].tv_nsec) {
|
|
printf("found skb hwtstamp = %lu.%lu\n",
|
|
ts->ts[2].tv_sec, ts->ts[2].tv_nsec);
|
|
return;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
printf("skb hwtstamp is not found!\n");
|
|
}
|
|
|
|
/* Main receive loop: poll all @rxq AF_XDP sockets in @rx_xsk plus the
 * regular UDP socket @server_fd with a one second timeout.
 *
 * Every wakeup prints the poll result and the BPF program's packet
 * counters. Data on @server_fd is checked via verify_skb_metadata(); a
 * packet on an AF_XDP socket is checked via verify_xdp_metadata() and its
 * frame is then returned to the fill ring.
 *
 * The loop ends only when poll() fails (e.g. interrupted by a signal);
 * the function then returns 0.
 */
static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
{
	struct pollfd fds[rxq + 1];
	struct pollfd *srv = &fds[rxq];
	const struct xdp_desc *rx_desc;
	__u64 comp_addr, addr;
	struct xsk *xsk;
	__u32 idx;
	int ret, q;

	/* Slots 0..rxq-1 watch the AF_XDP sockets, the last one the server. */
	for (q = 0; q < rxq; q++) {
		fds[q] = (struct pollfd){
			.fd = xsk_socket__fd(rx_xsk[q].socket),
			.events = POLLIN,
		};
	}
	*srv = (struct pollfd){ .fd = server_fd, .events = POLLIN };

	for (;;) {
		errno = 0;
		ret = poll(fds, rxq + 1, 1000);
		printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n",
		       ret, errno, bpf_obj->bss->pkts_skip,
		       bpf_obj->bss->pkts_fail, bpf_obj->bss->pkts_redir);
		if (ret < 0)
			break;
		if (ret == 0)
			continue;

		if (srv->revents)
			verify_skb_metadata(server_fd);

		for (q = 0; q < rxq; q++) {
			if (!fds[q].revents)
				continue;

			xsk = &rx_xsk[q];

			ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx);
			printf("xsk_ring_cons__peek: %d\n", ret);
			if (ret != 1)
				continue;

			rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx);
			comp_addr = xsk_umem__extract_addr(rx_desc->addr);
			addr = xsk_umem__add_offset_to_addr(rx_desc->addr);
			printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n",
			       xsk, idx, rx_desc->addr, addr, comp_addr);
			verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr),
					    clock_id);
			xsk_ring_cons__release(&xsk->rx, 1);
			/* The frame's base address goes back onto the fill ring. */
			refill_rx(xsk, comp_addr);
		}
	}

	return 0;
}
|
|
|
|
/* Request/response buffer for the ETHTOOL_GCHANNELS ioctl issued by
 * rxq_num(). Declared locally; presumably mirrors the kernel's
 * struct ethtool_channels from <linux/ethtool.h> - verify if the layout
 * is ever changed.
 */
struct ethtool_channels {
	__u32 cmd;		/* ethtool command, ETHTOOL_GCHANNELS here */
	__u32 max_rx;
	__u32 max_tx;
	__u32 max_other;
	__u32 max_combined;
	__u32 rx_count;		/* read by rxq_num() */
	__u32 tx_count;
	__u32 other_count;
	__u32 combined_count;	/* read by rxq_num() */
};

#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */
|
|
|
|
static int rxq_num(const char *ifname)
|
|
{
|
|
struct ethtool_channels ch = {
|
|
.cmd = ETHTOOL_GCHANNELS,
|
|
};
|
|
|
|
struct ifreq ifr = {
|
|
.ifr_data = (void *)&ch,
|
|
};
|
|
strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1);
|
|
int fd, ret;
|
|
|
|
fd = socket(AF_UNIX, SOCK_DGRAM, 0);
|
|
if (fd < 0)
|
|
error(1, errno, "socket");
|
|
|
|
ret = ioctl(fd, SIOCETHTOOL, &ifr);
|
|
if (ret < 0)
|
|
error(1, errno, "ioctl(SIOCETHTOOL)");
|
|
|
|
close(fd);
|
|
|
|
return ch.rx_count + ch.combined_count;
|
|
}
|
|
|
|
static void hwtstamp_ioctl(int op, const char *ifname, struct hwtstamp_config *cfg)
|
|
{
|
|
struct ifreq ifr = {
|
|
.ifr_data = (void *)cfg,
|
|
};
|
|
strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1);
|
|
int fd, ret;
|
|
|
|
fd = socket(AF_UNIX, SOCK_DGRAM, 0);
|
|
if (fd < 0)
|
|
error(1, errno, "socket");
|
|
|
|
ret = ioctl(fd, op, &ifr);
|
|
if (ret < 0)
|
|
error(1, errno, "ioctl(%d)", op);
|
|
|
|
close(fd);
|
|
}
|
|
|
|
/* Timestamping configuration found on the device before it was changed,
 * and the device it belongs to; consumed by hwtstamp_restore() at exit.
 */
static struct hwtstamp_config saved_hwtstamp_cfg;
static const char *saved_hwtstamp_ifname;

/* atexit() handler: put back the configuration saved by hwtstamp_enable(). */
static void hwtstamp_restore(void)
{
	hwtstamp_ioctl(SIOCSHWTSTAMP, saved_hwtstamp_ifname, &saved_hwtstamp_cfg);
}

/* Enable hardware RX timestamping of all packets on @ifname.
 *
 * The current configuration is read and saved first, and a handler that
 * restores it is registered with atexit() before the new configuration is
 * applied. Failing to save the interface name or to register the handler
 * is fatal, so the device is never reconfigured without a way back.
 */
static void hwtstamp_enable(const char *ifname)
{
	struct hwtstamp_config cfg = {
		.rx_filter = HWTSTAMP_FILTER_ALL,
	};

	hwtstamp_ioctl(SIOCGHWTSTAMP, ifname, &saved_hwtstamp_cfg);

	/* The exit handler dereferences this copy; it must not be NULL. */
	saved_hwtstamp_ifname = strdup(ifname);
	if (!saved_hwtstamp_ifname)
		error(1, ENOMEM, "strdup");

	if (atexit(hwtstamp_restore))
		error(1, 0, "atexit(hwtstamp_restore)");

	hwtstamp_ioctl(SIOCSHWTSTAMP, ifname, &cfg);
}
|
|
|
|
static void cleanup(void)
|
|
{
|
|
LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
|
|
int ret;
|
|
int i;
|
|
|
|
if (bpf_obj) {
|
|
opts.old_prog_fd = bpf_program__fd(bpf_obj->progs.rx);
|
|
if (opts.old_prog_fd >= 0) {
|
|
printf("detaching bpf program....\n");
|
|
ret = bpf_xdp_detach(ifindex, XDP_FLAGS, &opts);
|
|
if (ret)
|
|
printf("failed to detach XDP program: %d\n", ret);
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < rxq; i++)
|
|
close_xsk(&rx_xsk[i]);
|
|
|
|
if (bpf_obj)
|
|
xdp_hw_metadata__destroy(bpf_obj);
|
|
}
|
|
|
|
/* Deliberately empty signal handler: merely having a handler installed
 * makes the signal interrupt poll(), which is all that is needed.
 */
static void handle_signal(int sig)
{
}
|
|
|
|
/* Set the SO_TIMESTAMPING socket option of @fd to the flag mask @val.
 * A setsockopt() failure is fatal.
 */
static void timestamping_enable(int fd, int val)
{
	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)) < 0)
		error(1, errno, "setsockopt(SO_TIMESTAMPING)");
}
|
|
|
|
int main(int argc, char *argv[])
|
|
{
|
|
clockid_t clock_id = CLOCK_TAI;
|
|
int server_fd = -1;
|
|
int ret;
|
|
int i;
|
|
|
|
struct bpf_program *prog;
|
|
|
|
if (argc != 2) {
|
|
fprintf(stderr, "pass device name\n");
|
|
return -1;
|
|
}
|
|
|
|
ifname = argv[1];
|
|
ifindex = if_nametoindex(ifname);
|
|
rxq = rxq_num(ifname);
|
|
|
|
printf("rxq: %d\n", rxq);
|
|
|
|
hwtstamp_enable(ifname);
|
|
|
|
rx_xsk = malloc(sizeof(struct xsk) * rxq);
|
|
if (!rx_xsk)
|
|
error(1, ENOMEM, "malloc");
|
|
|
|
for (i = 0; i < rxq; i++) {
|
|
printf("open_xsk(%s, %p, %d)\n", ifname, &rx_xsk[i], i);
|
|
ret = open_xsk(ifindex, &rx_xsk[i], i);
|
|
if (ret)
|
|
error(1, -ret, "open_xsk");
|
|
|
|
printf("xsk_socket__fd() -> %d\n", xsk_socket__fd(rx_xsk[i].socket));
|
|
}
|
|
|
|
printf("open bpf program...\n");
|
|
bpf_obj = xdp_hw_metadata__open();
|
|
if (libbpf_get_error(bpf_obj))
|
|
error(1, libbpf_get_error(bpf_obj), "xdp_hw_metadata__open");
|
|
|
|
prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx");
|
|
bpf_program__set_ifindex(prog, ifindex);
|
|
bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY);
|
|
|
|
printf("load bpf program...\n");
|
|
ret = xdp_hw_metadata__load(bpf_obj);
|
|
if (ret)
|
|
error(1, -ret, "xdp_hw_metadata__load");
|
|
|
|
printf("prepare skb endpoint...\n");
|
|
server_fd = start_server(AF_INET6, SOCK_DGRAM, NULL, 9092, 1000);
|
|
if (server_fd < 0)
|
|
error(1, errno, "start_server");
|
|
timestamping_enable(server_fd,
|
|
SOF_TIMESTAMPING_SOFTWARE |
|
|
SOF_TIMESTAMPING_RAW_HARDWARE);
|
|
|
|
printf("prepare xsk map...\n");
|
|
for (i = 0; i < rxq; i++) {
|
|
int sock_fd = xsk_socket__fd(rx_xsk[i].socket);
|
|
__u32 queue_id = i;
|
|
|
|
printf("map[%d] = %d\n", queue_id, sock_fd);
|
|
ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0);
|
|
if (ret)
|
|
error(1, -ret, "bpf_map_update_elem");
|
|
}
|
|
|
|
printf("attach bpf program...\n");
|
|
ret = bpf_xdp_attach(ifindex,
|
|
bpf_program__fd(bpf_obj->progs.rx),
|
|
XDP_FLAGS, NULL);
|
|
if (ret)
|
|
error(1, -ret, "bpf_xdp_attach");
|
|
|
|
signal(SIGINT, handle_signal);
|
|
ret = verify_metadata(rx_xsk, rxq, server_fd, clock_id);
|
|
close(server_fd);
|
|
cleanup();
|
|
if (ret)
|
|
error(1, -ret, "verify_metadata");
|
|
}
|