libbpf: add support for using AF_XDP sockets
authorMagnus Karlsson <magnus.karlsson@intel.com>
Thu, 21 Feb 2019 09:21:26 +0000 (10:21 +0100)
committerDaniel Borkmann <daniel@iogearbox.net>
Mon, 25 Feb 2019 22:21:42 +0000 (23:21 +0100)
This commit adds AF_XDP support to libbpf. The main reason for this is
to facilitate writing applications that use AF_XDP by offering
higher-level APIs that hide many of the details of the AF_XDP
uapi. This is in the same vein as libbpf facilitates XDP adoption by
offering easy-to-use higher level interfaces of XDP
functionality. Hopefully this will facilitate adoption of AF_XDP, make
applications using it simpler and smaller, and finally also make it
possible for applications to benefit from optimizations in the AF_XDP
user space access code. Previously, people just copied and pasted the
code from the sample application into their application, which is not
desirable.

The interface is composed of two parts:

* Low-level access interface to the four rings and the packet
* High-level control plane interface for creating and setting
  up umems and af_xdp sockets as well as a simple XDP program.

Tested-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
tools/include/uapi/linux/ethtool.h [new file with mode: 0644]
tools/include/uapi/linux/if_xdp.h [new file with mode: 0644]
tools/lib/bpf/Build
tools/lib/bpf/Makefile
tools/lib/bpf/README.rst
tools/lib/bpf/libbpf.map
tools/lib/bpf/xsk.c [new file with mode: 0644]
tools/lib/bpf/xsk.h [new file with mode: 0644]

diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h
new file mode 100644 (file)
index 0000000..c86c3e9
--- /dev/null
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * ethtool.h: Defines for Linux ethtool.
+ *
+ * Copyright (C) 1998 David S. Miller (davem@redhat.com)
+ * Copyright 2001 Jeff Garzik <jgarzik@pobox.com>
+ * Portions Copyright 2001 Sun Microsystems (thockin@sun.com)
+ * Portions Copyright 2002 Intel (eli.kupermann@intel.com,
+ *                                christopher.leech@intel.com,
+ *                                scott.feldman@intel.com)
+ * Portions Copyright (C) Sun Microsystems 2008
+ */
+
+#ifndef _UAPI_LINUX_ETHTOOL_H
+#define _UAPI_LINUX_ETHTOOL_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#define ETHTOOL_GCHANNELS       0x0000003c /* Get no of channels */
+
+/**
+ * struct ethtool_channels - configuring number of network channel
+ * @cmd: ETHTOOL_{G,S}CHANNELS
+ * @max_rx: Read only. Maximum number of receive channel the driver support.
+ * @max_tx: Read only. Maximum number of transmit channel the driver support.
+ * @max_other: Read only. Maximum number of other channel the driver support.
+ * @max_combined: Read only. Maximum number of combined channel the driver
+ *     support. Set of queues RX, TX or other.
+ * @rx_count: Valid values are in the range 1 to the max_rx.
+ * @tx_count: Valid values are in the range 1 to the max_tx.
+ * @other_count: Valid values are in the range 1 to the max_other.
+ * @combined_count: Valid values are in the range 1 to the max_combined.
+ *
+ * This can be used to configure RX, TX and other channels.
+ */
+
+struct ethtool_channels {
+       __u32   cmd;
+       __u32   max_rx;
+       __u32   max_tx;
+       __u32   max_other;
+       __u32   max_combined;
+       __u32   rx_count;
+       __u32   tx_count;
+       __u32   other_count;
+       __u32   combined_count;
+};
+
+#endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
new file mode 100644 (file)
index 0000000..caed8b1
--- /dev/null
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * if_xdp: XDP socket user-space interface
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *           Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef _LINUX_IF_XDP_H
+#define _LINUX_IF_XDP_H
+
+#include <linux/types.h>
+
+/* Options for the sxdp_flags field */
+#define XDP_SHARED_UMEM        (1 << 0)
+#define XDP_COPY       (1 << 1) /* Force copy-mode */
+#define XDP_ZEROCOPY   (1 << 2) /* Force zero-copy mode */
+
+struct sockaddr_xdp {
+       __u16 sxdp_family;
+       __u16 sxdp_flags;
+       __u32 sxdp_ifindex;
+       __u32 sxdp_queue_id;
+       __u32 sxdp_shared_umem_fd;
+};
+
+struct xdp_ring_offset {
+       __u64 producer;
+       __u64 consumer;
+       __u64 desc;
+};
+
+struct xdp_mmap_offsets {
+       struct xdp_ring_offset rx;
+       struct xdp_ring_offset tx;
+       struct xdp_ring_offset fr; /* Fill */
+       struct xdp_ring_offset cr; /* Completion */
+};
+
+/* XDP socket options */
+#define XDP_MMAP_OFFSETS               1
+#define XDP_RX_RING                    2
+#define XDP_TX_RING                    3
+#define XDP_UMEM_REG                   4
+#define XDP_UMEM_FILL_RING             5
+#define XDP_UMEM_COMPLETION_RING       6
+#define XDP_STATISTICS                 7
+
+struct xdp_umem_reg {
+       __u64 addr; /* Start of packet data area */
+       __u64 len; /* Length of packet data area */
+       __u32 chunk_size;
+       __u32 headroom;
+};
+
+struct xdp_statistics {
+       __u64 rx_dropped; /* Dropped for reasons other than invalid desc */
+       __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
+       __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
+};
+
+/* Pgoff for mmaping the rings */
+#define XDP_PGOFF_RX_RING                        0
+#define XDP_PGOFF_TX_RING               0x80000000
+#define XDP_UMEM_PGOFF_FILL_RING       0x100000000ULL
+#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
+
+/* Rx/Tx descriptor */
+struct xdp_desc {
+       __u64 addr;
+       __u32 len;
+       __u32 options;
+};
+
+/* UMEM descriptor is __u64 */
+
+#endif /* _LINUX_IF_XDP_H */
index bfd9bfc..ee9d536 100644 (file)
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o
+libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o
index 8479162..761691b 100644 (file)
@@ -164,6 +164,9 @@ $(BPF_IN): force elfdep bpfdep
        @(test -f ../../include/uapi/linux/if_link.h -a -f ../../../include/uapi/linux/if_link.h && ( \
        (diff -B ../../include/uapi/linux/if_link.h ../../../include/uapi/linux/if_link.h >/dev/null) || \
        echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_link.h' differs from latest version at 'include/uapi/linux/if_link.h'" >&2 )) || true
+       @(test -f ../../include/uapi/linux/if_xdp.h -a -f ../../../include/uapi/linux/if_xdp.h && ( \
+       (diff -B ../../include/uapi/linux/if_xdp.h ../../../include/uapi/linux/if_xdp.h >/dev/null) || \
+       echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true
        $(Q)$(MAKE) $(build)=libbpf
 
 $(OUTPUT)libbpf.so: $(BPF_IN)
@@ -174,7 +177,7 @@ $(OUTPUT)libbpf.a: $(BPF_IN)
        $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
 
 $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a
-       $(QUIET_LINK)$(CXX) $^ -lelf -o $@
+       $(QUIET_LINK)$(CXX) $(INCLUDES) $^ -lelf -o $@
 
 check: check_abi
 
index 607aae4..5788479 100644 (file)
@@ -9,7 +9,7 @@ described here. It's recommended to follow these conventions whenever a
 new function or type is added to keep libbpf API clean and consistent.
 
 All types and functions provided by libbpf API should have one of the
-following prefixes: ``bpf_``, ``btf_``, ``libbpf_``.
+following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``.
 
 System call wrappers
 --------------------
@@ -62,6 +62,19 @@ Auxiliary functions and types that don't fit well in any of categories
 described above should have ``libbpf_`` prefix, e.g.
 ``libbpf_get_error`` or ``libbpf_prog_type_by_name``.
 
+AF_XDP functions
+-------------------
+
+AF_XDP functions should have an ``xsk_`` prefix, e.g.
+``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists
+of both low-level ring access functions and high-level configuration
+functions. These can be mixed and matched. Note that these functions
+are not reentrant for performance reasons.
+
+Please take a look at Documentation/networking/af_xdp.rst in the Linux
+kernel source tree on how to use XDP sockets and for some common
+mistakes in case you do not get any traffic up to user space.
+
 libbpf ABI
 ==========
 
index 99dfa71..778a267 100644 (file)
@@ -147,4 +147,10 @@ LIBBPF_0.0.2 {
                btf_ext__new;
                btf_ext__reloc_func_info;
                btf_ext__reloc_line_info;
+               xsk_umem__create;
+               xsk_socket__create;
+               xsk_umem__delete;
+               xsk_socket__delete;
+               xsk_umem__fd;
+               xsk_socket__fd;
 } LIBBPF_0.0.1;
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
new file mode 100644 (file)
index 0000000..f98ac82
--- /dev/null
@@ -0,0 +1,723 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2019 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <asm/barrier.h>
+#include <linux/compiler.h>
+#include <linux/ethtool.h>
+#include <linux/filter.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_xdp.h>
+#include <linux/sockios.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "bpf.h"
+#include "libbpf.h"
+#include "libbpf_util.h"
+#include "xsk.h"
+
+#ifndef SOL_XDP
+ #define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+ #define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+ #define PF_XDP AF_XDP
+#endif
+
+/* Library-private state for one umem created by xsk_umem__create(). */
+struct xsk_umem {
+       /* Caller-provided fill ring descriptor, set up in xsk_umem__create(). */
+       struct xsk_ring_prod *fill;
+       /* Caller-provided completion ring descriptor, set up in xsk_umem__create(). */
+       struct xsk_ring_cons *comp;
+       /* Start of the packet buffer area registered via XDP_UMEM_REG. */
+       char *umem_area;
+       /* Effective configuration: the user's values or the library defaults. */
+       struct xsk_umem_config config;
+       /* AF_XDP socket the umem is registered on; reused by the first xsk_socket. */
+       int fd;
+       /* Number of xsk_socket objects attached; xsk_umem__delete() refuses if non-zero. */
+       int refcount;
+};
+
+/* Library-private state for one AF_XDP socket created by xsk_socket__create(). */
+struct xsk_socket {
+       /* Caller-provided RX ring descriptor, filled in by xsk_socket__create(). */
+       struct xsk_ring_cons *rx;
+       /* Caller-provided TX ring descriptor, filled in by xsk_socket__create(). */
+       struct xsk_ring_prod *tx;
+       /* Zeroed at creation; not otherwise used in this file. */
+       __u64 outstanding_tx;
+       /* The umem this socket is bound to. */
+       struct xsk_umem *umem;
+       /* Effective configuration: the user's values or the library defaults. */
+       struct xsk_socket_config config;
+       /* AF_XDP socket fd; equals umem->fd for the first socket of a umem. */
+       int fd;
+       /* NOTE(review): never read or written in this file -- confirm it is needed. */
+       int xsks_map;
+       /* Interface index resolved from ifname with if_nametoindex(). */
+       int ifindex;
+       /* Fd of the XDP program attached to the interface. */
+       int prog_fd;
+       /* Fd of the "qidconf_map" BPF map used by the built-in XDP program. */
+       int qidconf_map_fd;
+       /* Fd of the "xsks_map" BPF map used by the built-in XDP program. */
+       int xsks_map_fd;
+       /* Queue of the interface that this socket is bound to. */
+       __u32 queue_id;
+       /* Name of the interface, as passed to xsk_socket__create(). */
+       char ifname[IFNAMSIZ];
+};
+
+/* NOTE(review): this struct is not referenced anywhere else in this file.
+ * The name suggests per-interface state for a netlink query of the attached
+ * XDP program -- confirm whether it is still needed.
+ */
+struct xsk_nl_info {
+       bool xdp_prog_attached;
+       int ifindex;
+       int fd;
+};
+
+/* The ring page offsets are 64 bits wide, so on 32-bit systems mmap2 is
+ * needed. glibc does not provide it, hence the raw system call, which is
+ * handed the offset shifted down by the page shift.
+ */
+static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags,
+                            int fd, __u64 offset)
+{
+#ifndef __NR_mmap2
+       return mmap(addr, length, prot, flags, fd, offset);
+#else
+       unsigned int shift = __builtin_ffs(getpagesize()) - 1;
+       off_t pgoff = (off_t)(offset >> shift);
+       long mapped;
+
+       mapped = syscall(__NR_mmap2, addr, length, prot, flags, fd, pgoff);
+       return (void *)mapped;
+#endif
+}
+
+/* Return the file descriptor backing @umem, or -EINVAL if @umem is NULL. */
+int xsk_umem__fd(const struct xsk_umem *umem)
+{
+       if (!umem)
+               return -EINVAL;
+
+       return umem->fd;
+}
+
+/* Return the file descriptor of @xsk, or -EINVAL if @xsk is NULL. */
+int xsk_socket__fd(const struct xsk_socket *xsk)
+{
+       if (!xsk)
+               return -EINVAL;
+
+       return xsk->fd;
+}
+
+/* True when @buffer starts on a page boundary. */
+static bool xsk_page_aligned(void *buffer)
+{
+       unsigned long page_mask = getpagesize() - 1;
+
+       return ((unsigned long)buffer & page_mask) == 0;
+}
+
+/* Fill @cfg from @usr_cfg, or with the library defaults when @usr_cfg is
+ * NULL.
+ */
+static void xsk_set_umem_config(struct xsk_umem_config *cfg,
+                               const struct xsk_umem_config *usr_cfg)
+{
+       if (usr_cfg) {
+               cfg->fill_size = usr_cfg->fill_size;
+               cfg->comp_size = usr_cfg->comp_size;
+               cfg->frame_size = usr_cfg->frame_size;
+               cfg->frame_headroom = usr_cfg->frame_headroom;
+               return;
+       }
+
+       /* No user configuration: fall back to the defaults from xsk.h. */
+       cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+       cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+       cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+       cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+}
+
+/* Fill @cfg from @usr_cfg, or with default ring sizes and all flags cleared
+ * when @usr_cfg is NULL.
+ */
+static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
+                                     const struct xsk_socket_config *usr_cfg)
+{
+       if (usr_cfg) {
+               cfg->rx_size = usr_cfg->rx_size;
+               cfg->tx_size = usr_cfg->tx_size;
+               cfg->libbpf_flags = usr_cfg->libbpf_flags;
+               cfg->xdp_flags = usr_cfg->xdp_flags;
+               cfg->bind_flags = usr_cfg->bind_flags;
+               return;
+       }
+
+       /* No user configuration: default ring sizes, no flags. */
+       cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+       cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+       cfg->libbpf_flags = 0;
+       cfg->xdp_flags = 0;
+       cfg->bind_flags = 0;
+}
+
+/* Create a umem: open an AF_XDP socket, register @umem_area on it and map
+ * the fill and completion rings.
+ *
+ * @umem_ptr:   out parameter, receives the new umem on success
+ * @umem_area:  start of the packet buffer memory, must be page aligned
+ * @size:       length of @umem_area in bytes, must be non-zero
+ * @fill:       caller-owned descriptor, initialised for the fill ring
+ * @comp:       caller-owned descriptor, initialised for the completion ring
+ * @usr_config: ring and frame sizes, or NULL for the library defaults
+ *
+ * Returns 0 on success, -EFAULT for a NULL argument, -EINVAL for an empty or
+ * unaligned area, -ENOMEM, or the negated errno of the failing system call.
+ */
+int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
+                    struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
+                    const struct xsk_umem_config *usr_config)
+{
+       struct xdp_mmap_offsets off;
+       struct xdp_umem_reg mr;
+       struct xsk_umem *umem;
+       socklen_t optlen;
+       void *fill_map;
+       void *comp_map;
+       int err;
+
+       if (!umem_area || !umem_ptr || !fill || !comp)
+               return -EFAULT;
+       /* Either problem alone makes the area unusable, hence "or". */
+       if (!size || !xsk_page_aligned(umem_area))
+               return -EINVAL;
+
+       umem = calloc(1, sizeof(*umem));
+       if (!umem)
+               return -ENOMEM;
+
+       umem->fd = socket(AF_XDP, SOCK_RAW, 0);
+       if (umem->fd < 0) {
+               err = -errno;
+               goto out_umem_alloc;
+       }
+
+       umem->umem_area = umem_area;
+       xsk_set_umem_config(&umem->config, usr_config);
+
+       mr.addr = (uintptr_t)umem_area;
+       mr.len = size;
+       mr.chunk_size = umem->config.frame_size;
+       mr.headroom = umem->config.frame_headroom;
+
+       err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
+       if (err) {
+               err = -errno;
+               goto out_socket;
+       }
+       err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
+                        &umem->config.fill_size,
+                        sizeof(umem->config.fill_size));
+       if (err) {
+               err = -errno;
+               goto out_socket;
+       }
+       err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
+                        &umem->config.comp_size,
+                        sizeof(umem->config.comp_size));
+       if (err) {
+               err = -errno;
+               goto out_socket;
+       }
+
+       /* Ask the kernel where producer, consumer and descriptors live
+        * inside each ring mapping.
+        */
+       optlen = sizeof(off);
+       err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+       if (err) {
+               err = -errno;
+               goto out_socket;
+       }
+
+       fill_map = xsk_mmap(NULL, off.fr.desc +
+                           umem->config.fill_size * sizeof(__u64),
+                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+                           umem->fd, XDP_UMEM_PGOFF_FILL_RING);
+       if (fill_map == MAP_FAILED) {
+               err = -errno;
+               goto out_socket;
+       }
+
+       umem->fill = fill;
+       fill->mask = umem->config.fill_size - 1;
+       fill->size = umem->config.fill_size;
+       fill->producer = fill_map + off.fr.producer;
+       fill->consumer = fill_map + off.fr.consumer;
+       fill->ring = fill_map + off.fr.desc;
+       /* A producer ring starts out with every slot free. */
+       fill->cached_cons = umem->config.fill_size;
+
+       comp_map = xsk_mmap(NULL,
+                           off.cr.desc + umem->config.comp_size * sizeof(__u64),
+                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+                           umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
+       if (comp_map == MAP_FAILED) {
+               err = -errno;
+               goto out_mmap;
+       }
+
+       umem->comp = comp;
+       comp->mask = umem->config.comp_size - 1;
+       comp->size = umem->config.comp_size;
+       comp->producer = comp_map + off.cr.producer;
+       comp->consumer = comp_map + off.cr.consumer;
+       comp->ring = comp_map + off.cr.desc;
+
+       *umem_ptr = umem;
+       return 0;
+
+out_mmap:
+       /* Unmap the mapping itself, not the caller's ring descriptor. */
+       munmap(fill_map,
+              off.fr.desc + umem->config.fill_size * sizeof(__u64));
+out_socket:
+       close(umem->fd);
+out_umem_alloc:
+       free(umem);
+       return err;
+}
+
+/* Load the built-in XDP program below and attach it to xsk->ifindex using
+ * xsk->config.xdp_flags. The program references xsk->qidconf_map_fd and
+ * xsk->xsks_map_fd, so both maps must exist before this is called.
+ *
+ * Returns 0 on success and stores the program fd in xsk->prog_fd. Otherwise
+ * returns the negative result of bpf_load_program() (after printing the
+ * verifier log) or the error from bpf_set_link_xdp_fd().
+ */
+static int xsk_load_xdp_prog(struct xsk_socket *xsk)
+{
+       char bpf_log_buf[BPF_LOG_BUF_SIZE];
+       int err, prog_fd;
+
+       /* This is the C-program:
+        * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
+        * {
+        *     int *qidconf, index = ctx->rx_queue_index;
+        *
+        *     // A set entry here means that the corresponding queue_id
+        *     // has an active AF_XDP socket bound to it.
+        *     qidconf = bpf_map_lookup_elem(&qidconf_map, &index);
+        *     if (!qidconf)
+        *         return XDP_ABORTED;
+        *
+        *     if (*qidconf)
+        *         return bpf_redirect_map(&xsks_map, index, 0);
+        *
+        *     return XDP_PASS;
+        * }
+        */
+       struct bpf_insn prog[] = {
+               /* r1 = *(u32 *)(r1 + 16) */
+               BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 16),
+               /* *(u32 *)(r10 - 4) = r1 */
+               BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -4),
+               BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+               BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+               BPF_LD_MAP_FD(BPF_REG_1, xsk->qidconf_map_fd),
+               BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+               BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+               /* r0 = 0, corresponding to the XDP_ABORTED return above */
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               /* if r1 == 0 goto +8 */
+               BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8),
+               /* r0 = 2, corresponding to the XDP_PASS return above */
+               BPF_MOV32_IMM(BPF_REG_0, 2),
+               /* r1 = *(u32 *)(r1 + 0) */
+               BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+               /* if r1 == 0 goto +5 */
+               BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+               /* r1 = xsks_map (this load occupies two instruction slots) */
+               BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd),
+               /* r2 = *(u32 *)(r10 - 4) */
+               BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
+               BPF_MOV32_IMM(BPF_REG_3, 0),
+               BPF_EMIT_CALL(BPF_FUNC_redirect_map),
+               /* The jumps are to this instruction */
+               BPF_EXIT_INSN(),
+       };
+       size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+
+       prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt,
+                                  "LGPL-2.1 or BSD-2-Clause", 0, bpf_log_buf,
+                                  BPF_LOG_BUF_SIZE);
+       if (prog_fd < 0) {
+               pr_warning("BPF log buffer:\n%s", bpf_log_buf);
+               return prog_fd;
+       }
+
+       err = bpf_set_link_xdp_fd(xsk->ifindex, prog_fd, xsk->config.xdp_flags);
+       if (err) {
+               /* Attaching failed: drop the program again. */
+               close(prog_fd);
+               return err;
+       }
+
+       xsk->prog_fd = prog_fd;
+       return 0;
+}
+
+/* Query the number of combined channels of xsk->ifname with the
+ * ETHTOOL_GCHANNELS ioctl.
+ *
+ * Returns the channel count, 1 when the driver does not support the query
+ * or reports no combined channels, or a negative errno on failure.
+ */
+static int xsk_get_max_queues(struct xsk_socket *xsk)
+{
+       /* Zero-initialise both structs so no stack garbage reaches the
+        * kernel and max_combined is well defined if the driver leaves
+        * it untouched.
+        */
+       struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
+       struct ifreq ifr = {};
+       int fd, err, ret;
+
+       fd = socket(AF_INET, SOCK_DGRAM, 0);
+       if (fd < 0)
+               return -errno;
+
+       ifr.ifr_data = (void *)&channels;
+       /* strncpy() does not terminate on truncation, so do it by hand. */
+       strncpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ - 1);
+       ifr.ifr_name[IFNAMSIZ - 1] = '\0';
+       err = ioctl(fd, SIOCETHTOOL, &ifr);
+       if (err && errno != EOPNOTSUPP) {
+               ret = -errno;
+               goto out;
+       }
+
+       /* errno is only meaningful when the ioctl failed, so test err
+        * rather than a possibly stale errno value.
+        */
+       if (err || channels.max_combined == 0)
+               /* If the device says it has no channels, then all traffic
+                * is sent to a single stream, so max queues = 1.
+                */
+               ret = 1;
+       else
+               ret = channels.max_combined;
+
+out:
+       close(fd);
+       return ret;
+}
+
+/* Create the "qidconf_map" array and the "xsks_map" XSKMAP, each with one
+ * entry per queue reported by xsk_get_max_queues(), and store their fds in
+ * @xsk. Returns 0 on success or the negative value of the failing call.
+ */
+static int xsk_create_bpf_maps(struct xsk_socket *xsk)
+{
+       int nr_queues = xsk_get_max_queues(xsk);
+       int map_fd;
+
+       if (nr_queues < 0)
+               return nr_queues;
+
+       map_fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "qidconf_map",
+                                    sizeof(int), sizeof(int), nr_queues, 0);
+       if (map_fd < 0)
+               return map_fd;
+       xsk->qidconf_map_fd = map_fd;
+
+       map_fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
+                                    sizeof(int), sizeof(int), nr_queues, 0);
+       if (map_fd >= 0) {
+               xsk->xsks_map_fd = map_fd;
+               return 0;
+       }
+
+       /* Second map failed: release the first one again. */
+       close(xsk->qidconf_map_fd);
+       return map_fd;
+}
+
+/* Close the fds of both BPF maps stored in @xsk. */
+static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
+{
+       const int map_fds[] = { xsk->qidconf_map_fd, xsk->xsks_map_fd };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(map_fds) / sizeof(map_fds[0]); i++)
+               close(map_fds[i]);
+}
+
+/* Find the "qidconf_map" and "xsks_map" maps used by the program behind
+ * xsk->prog_fd and write @qidconf_value and @xsks_value into their entries
+ * for xsk->queue_id.
+ *
+ * Every map fd opened here is private to this function and closed before
+ * returning; the fds stored in @xsk are left alone. If only one of the two
+ * maps could be updated, its entry is written back to 0.
+ *
+ * Returns 0 on success, -ENOENT if the program lacks either map, -ENOMEM,
+ * or the error of the failing BPF call.
+ *
+ * NOTE(review): for "xsks_map" the value is a socket fd, so writing 0 is
+ * not obviously the same as clearing the entry -- confirm whether a delete
+ * is what the rollback and xsk_socket__delete() really want.
+ */
+static int xsk_update_bpf_maps(struct xsk_socket *xsk, int qidconf_value,
+                              int xsks_value)
+{
+       struct bpf_prog_info prog_info = {};
+       __u32 prog_len = sizeof(prog_info);
+       int qidconf_fd = -1, xsks_fd = -1;
+       int reset_value = 0;
+       __u32 *map_ids;
+       __u32 num_maps;
+       unsigned int i;
+       int err;
+
+       /* First call only learns how many maps the program uses. */
+       err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len);
+       if (err)
+               return err;
+
+       num_maps = prog_info.nr_map_ids;
+       /* calloc(0, ...) may return NULL; do not report that as -ENOMEM. */
+       if (!num_maps)
+               return -ENOENT;
+
+       map_ids = calloc(num_maps, sizeof(*map_ids));
+       if (!map_ids)
+               return -ENOMEM;
+
+       /* Second call fetches the map ids into the array. */
+       memset(&prog_info, 0, prog_len);
+       prog_info.nr_map_ids = num_maps;
+       prog_info.map_ids = (__u64)(unsigned long)map_ids;
+
+       err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len);
+       if (err)
+               goto out_map_ids;
+
+       for (i = 0; i < prog_info.nr_map_ids; i++) {
+               struct bpf_map_info map_info = {};
+               __u32 map_len = sizeof(map_info);
+               int fd;
+
+               fd = bpf_map_get_fd_by_id(map_ids[i]);
+               if (fd < 0) {
+                       err = -errno;
+                       goto out_maps;
+               }
+
+               err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
+               if (err) {
+                       close(fd);
+                       goto out_maps;
+               }
+
+               if (qidconf_fd < 0 && !strcmp(map_info.name, "qidconf_map")) {
+                       err = bpf_map_update_elem(fd, &xsk->queue_id,
+                                                 &qidconf_value, 0);
+                       if (err) {
+                               close(fd);
+                               goto out_maps;
+                       }
+                       qidconf_fd = fd;
+               } else if (xsks_fd < 0 &&
+                          !strcmp(map_info.name, "xsks_map")) {
+                       err = bpf_map_update_elem(fd, &xsk->queue_id,
+                                                 &xsks_value, 0);
+                       if (err) {
+                               close(fd);
+                               goto out_maps;
+                       }
+                       xsks_fd = fd;
+               } else {
+                       /* Not one of ours: do not leak the fd. */
+                       close(fd);
+               }
+
+               if (qidconf_fd >= 0 && xsks_fd >= 0)
+                       break;
+       }
+
+       if (qidconf_fd < 0 || xsks_fd < 0) {
+               err = -ENOENT;
+               goto out_maps;
+       }
+
+       err = 0;
+       goto out_close;
+
+out_maps:
+       /* Roll back whichever entry was already written. */
+       if (qidconf_fd >= 0)
+               (void)bpf_map_update_elem(qidconf_fd, &xsk->queue_id,
+                                         &reset_value, 0);
+       if (xsks_fd >= 0)
+               (void)bpf_map_update_elem(xsks_fd, &xsk->queue_id,
+                                         &reset_value, 0);
+out_close:
+       if (qidconf_fd >= 0)
+               close(qidconf_fd);
+       if (xsks_fd >= 0)
+               close(xsks_fd);
+out_map_ids:
+       free(map_ids);
+       return err;
+}
+
+/* Make sure an XDP program is attached to the interface and register this
+ * socket in the program's maps.
+ *
+ * If no program is attached yet, the two maps and the built-in program are
+ * created and attached. Otherwise an fd to the existing program is obtained
+ * by id. In both cases xsk->prog_fd stays open on success.
+ *
+ * Returns 0 on success or a negative error.
+ *
+ * NOTE(review): if the map update fails after the built-in program was
+ * attached, the program stays attached to the interface; only the fds are
+ * released here.
+ */
+static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
+{
+       bool maps_created = false;
+       __u32 prog_id = 0;
+       int err;
+
+       err = bpf_get_link_xdp_id(xsk->ifindex, &prog_id,
+                                 xsk->config.xdp_flags);
+       if (err)
+               return err;
+
+       if (!prog_id) {
+               err = xsk_create_bpf_maps(xsk);
+               if (err)
+                       return err;
+               maps_created = true;
+
+               err = xsk_load_xdp_prog(xsk);
+               if (err)
+                       goto out_maps;
+       } else {
+               xsk->prog_fd = bpf_prog_get_fd_by_id(prog_id);
+               if (xsk->prog_fd < 0)
+                       return -errno;
+       }
+
+       err = xsk_update_bpf_maps(xsk, true, xsk->fd);
+       if (err)
+               goto out_load;
+
+       return 0;
+
+out_load:
+       /* prog_fd is valid on both paths that get here, so always close. */
+       close(xsk->prog_fd);
+out_maps:
+       if (maps_created)
+               xsk_delete_bpf_maps(xsk);
+       return err;
+}
+
+/* Create an AF_XDP socket on queue @queue_id of interface @ifname, backed
+ * by @umem, and map its RX and TX rings.
+ *
+ * @xsk_ptr:    out parameter, receives the new socket on success
+ * @ifname:     name of the network interface
+ * @queue_id:   queue of the interface to bind to
+ * @umem:       umem from xsk_umem__create(); must not already have a socket
+ * @rx:         caller-owned descriptor, initialised for the RX ring
+ * @tx:         caller-owned descriptor, initialised for the TX ring
+ * @usr_config: ring sizes and flags, or NULL for the library defaults
+ *
+ * Unless XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD is set in libbpf_flags, an XDP
+ * program is also set up via xsk_setup_xdp_prog().
+ *
+ * Returns 0 on success, -EFAULT for a NULL argument, -EBUSY if @umem is
+ * already in use, -ENOMEM, or the negative error of the failing call.
+ */
+int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
+                      __u32 queue_id, struct xsk_umem *umem,
+                      struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
+                      const struct xsk_socket_config *usr_config)
+{
+       void *rx_map = NULL, *tx_map = NULL;
+       struct sockaddr_xdp sxdp = {};
+       struct xdp_mmap_offsets off;
+       struct xsk_socket *xsk;
+       socklen_t optlen;
+       int err;
+
+       if (!umem || !xsk_ptr || !rx || !tx || !ifname)
+               return -EFAULT;
+
+       if (umem->refcount) {
+               pr_warning("Error: shared umems not supported by libbpf.\n");
+               return -EBUSY;
+       }
+
+       xsk = calloc(1, sizeof(*xsk));
+       if (!xsk)
+               return -ENOMEM;
+
+       /* The first socket of a umem reuses the umem's own fd. */
+       if (umem->refcount++ > 0) {
+               xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
+               if (xsk->fd < 0) {
+                       err = -errno;
+                       goto out_xsk_alloc;
+               }
+       } else {
+               xsk->fd = umem->fd;
+       }
+
+       xsk->outstanding_tx = 0;
+       xsk->queue_id = queue_id;
+       xsk->umem = umem;
+       xsk->ifindex = if_nametoindex(ifname);
+       if (!xsk->ifindex) {
+               err = -errno;
+               goto out_socket;
+       }
+       /* strncpy() does not terminate on truncation, so do it by hand. */
+       strncpy(xsk->ifname, ifname, IFNAMSIZ - 1);
+       xsk->ifname[IFNAMSIZ - 1] = '\0';
+
+       xsk_set_xdp_socket_config(&xsk->config, usr_config);
+
+       if (rx) {
+               err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
+                                &xsk->config.rx_size,
+                                sizeof(xsk->config.rx_size));
+               if (err) {
+                       err = -errno;
+                       goto out_socket;
+               }
+       }
+       if (tx) {
+               err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
+                                &xsk->config.tx_size,
+                                sizeof(xsk->config.tx_size));
+               if (err) {
+                       err = -errno;
+                       goto out_socket;
+               }
+       }
+
+       optlen = sizeof(off);
+       err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+       if (err) {
+               err = -errno;
+               goto out_socket;
+       }
+
+       if (rx) {
+               rx_map = xsk_mmap(NULL, off.rx.desc +
+                                 xsk->config.rx_size * sizeof(struct xdp_desc),
+                                 PROT_READ | PROT_WRITE,
+                                 MAP_SHARED | MAP_POPULATE,
+                                 xsk->fd, XDP_PGOFF_RX_RING);
+               if (rx_map == MAP_FAILED) {
+                       err = -errno;
+                       goto out_socket;
+               }
+
+               rx->mask = xsk->config.rx_size - 1;
+               rx->size = xsk->config.rx_size;
+               rx->producer = rx_map + off.rx.producer;
+               rx->consumer = rx_map + off.rx.consumer;
+               rx->ring = rx_map + off.rx.desc;
+       }
+       xsk->rx = rx;
+
+       if (tx) {
+               tx_map = xsk_mmap(NULL, off.tx.desc +
+                                 xsk->config.tx_size * sizeof(struct xdp_desc),
+                                 PROT_READ | PROT_WRITE,
+                                 MAP_SHARED | MAP_POPULATE,
+                                 xsk->fd, XDP_PGOFF_TX_RING);
+               if (tx_map == MAP_FAILED) {
+                       err = -errno;
+                       goto out_mmap_rx;
+               }
+
+               tx->mask = xsk->config.tx_size - 1;
+               tx->size = xsk->config.tx_size;
+               tx->producer = tx_map + off.tx.producer;
+               tx->consumer = tx_map + off.tx.consumer;
+               tx->ring = tx_map + off.tx.desc;
+               /* A producer ring starts out with every slot free. */
+               tx->cached_cons = xsk->config.tx_size;
+       }
+       xsk->tx = tx;
+
+       sxdp.sxdp_family = PF_XDP;
+       sxdp.sxdp_ifindex = xsk->ifindex;
+       sxdp.sxdp_queue_id = xsk->queue_id;
+       sxdp.sxdp_flags = xsk->config.bind_flags;
+
+       err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
+       if (err) {
+               err = -errno;
+               goto out_mmap_tx;
+       }
+
+       if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
+               err = xsk_setup_xdp_prog(xsk);
+               if (err)
+                       goto out_mmap_tx;
+       }
+
+       *xsk_ptr = xsk;
+       return 0;
+
+       /* Unmap the mappings themselves, not the caller's ring descriptors. */
+out_mmap_tx:
+       if (tx)
+               munmap(tx_map,
+                      off.tx.desc +
+                      xsk->config.tx_size * sizeof(struct xdp_desc));
+out_mmap_rx:
+       if (rx)
+               munmap(rx_map,
+                      off.rx.desc +
+                      xsk->config.rx_size * sizeof(struct xdp_desc));
+out_socket:
+       /* Only close fds created here; the umem's own fd stays open. */
+       if (--umem->refcount)
+               close(xsk->fd);
+out_xsk_alloc:
+       free(xsk);
+       return err;
+}
+
+/* Destroy @umem: unmap its fill and completion rings, close its socket and
+ * free it. The packet buffer area itself belongs to the caller and is not
+ * touched.
+ *
+ * Returns 0 on success (also for a NULL @umem) or -EBUSY while a socket is
+ * still attached to the umem.
+ */
+int xsk_umem__delete(struct xsk_umem *umem)
+{
+       struct xdp_mmap_offsets off;
+       socklen_t optlen;
+       int err;
+
+       if (!umem)
+               return 0;
+
+       if (umem->refcount)
+               return -EBUSY;
+
+       optlen = sizeof(off);
+       err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+       if (!err) {
+               /* ->ring points desc bytes into the mapping; munmap() needs
+                * the start of the mapping.
+                */
+               munmap(umem->fill->ring - off.fr.desc,
+                      off.fr.desc + umem->config.fill_size * sizeof(__u64));
+               munmap(umem->comp->ring - off.cr.desc,
+                      off.cr.desc + umem->config.comp_size * sizeof(__u64));
+       }
+
+       close(umem->fd);
+       free(umem);
+
+       return 0;
+}
+
+/* Destroy @xsk: remove it from the XDP program's maps, unmap its RX and TX
+ * rings, drop its reference on the umem and free it. A NULL @xsk is a
+ * no-op.
+ */
+void xsk_socket__delete(struct xsk_socket *xsk)
+{
+       struct xdp_mmap_offsets off;
+       socklen_t optlen;
+       int err;
+
+       if (!xsk)
+               return;
+
+       /* prog_fd is only set when xsk_socket__create() set up a program. */
+       if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
+               (void)xsk_update_bpf_maps(xsk, 0, 0);
+               close(xsk->prog_fd);
+       }
+
+       optlen = sizeof(off);
+       err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+       if (!err) {
+               /* ->ring points desc bytes into the mapping; munmap() needs
+                * the start of the mapping.
+                */
+               if (xsk->rx)
+                       munmap(xsk->rx->ring - off.rx.desc,
+                              off.rx.desc +
+                              xsk->config.rx_size * sizeof(struct xdp_desc));
+               if (xsk->tx)
+                       munmap(xsk->tx->ring - off.tx.desc,
+                              off.tx.desc +
+                              xsk->config.tx_size * sizeof(struct xdp_desc));
+       }
+
+       xsk->umem->refcount--;
+       /* Do not close an fd that also has an associated umem connected
+        * to it.
+        */
+       if (xsk->fd != xsk->umem->fd)
+               close(xsk->fd);
+       free(xsk);
+}
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
new file mode 100644 (file)
index 0000000..a497f00
--- /dev/null
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2019 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef __LIBBPF_XSK_H
+#define __LIBBPF_XSK_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <linux/if_xdp.h>
+
+#include "libbpf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* DEFINE_XSK_RING(name) declares "struct name", the user-space view of one
+ * ring. Members, as used by the inline helpers in this header:
+ *   cached_prod, cached_cons - locally cached copies of the ring indices;
+ *             on producer rings cached_cons is kept "size" ahead of the
+ *             value last read through "consumer"
+ *   mask      - ANDed with an index to select a slot in "ring"
+ *   size      - bias added when a producer ring refreshes cached_cons
+ *   producer, consumer - pointers to the ring indices that the helpers
+ *             read and update
+ *   ring      - slot array, accessed as __u64 entries or as struct xdp_desc
+ *             entries depending on the helper used
+ *
+ * Do not access these members directly. Use the functions below.
+ */
+#define DEFINE_XSK_RING(name) \
+struct name { \
+       __u32 cached_prod; \
+       __u32 cached_cons; \
+       __u32 mask; \
+       __u32 size; \
+       __u32 *producer; \
+       __u32 *consumer; \
+       void *ring; \
+}
+
+/* Both ring flavours share the same layout; they are distinct types so that
+ * the producer helpers and the consumer helpers cannot be mixed up.
+ */
+DEFINE_XSK_RING(xsk_ring_prod);
+DEFINE_XSK_RING(xsk_ring_cons);
+
+/* Opaque handles: only declared here, so users of this header hold pointers
+ * to them and go through the functions declared further down.
+ */
+struct xsk_umem;
+struct xsk_socket;
+
+/* Return a pointer to the __u64 slot of the fill ring selected by idx.
+ * idx is wrapped with the ring mask, so any index value is accepted.
+ */
+static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill,
+                                             __u32 idx)
+{
+       __u32 slot = idx & fill->mask;
+
+       return (__u64 *)fill->ring + slot;
+}
+
+/* Return a read-only pointer to the __u64 slot of the completion ring
+ * selected by idx. idx is wrapped with the ring mask.
+ */
+static inline const __u64 *
+xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx)
+{
+       __u32 slot = idx & comp->mask;
+
+       return (const __u64 *)comp->ring + slot;
+}
+
+/* Return a pointer to the struct xdp_desc slot of the tx ring selected by
+ * idx. idx is wrapped with the ring mask.
+ */
+static inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx,
+                                                     __u32 idx)
+{
+       __u32 slot = idx & tx->mask;
+
+       return (struct xdp_desc *)tx->ring + slot;
+}
+
+/* Return a read-only pointer to the struct xdp_desc slot of the rx ring
+ * selected by idx. idx is wrapped with the ring mask.
+ */
+static inline const struct xdp_desc *
+xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
+{
+       __u32 slot = idx & rx->mask;
+
+       return (const struct xdp_desc *)rx->ring + slot;
+}
+
+/* Return how many entries can currently be produced on ring r.
+ *
+ * cached_cons is deliberately kept r->size ahead of the consumer index
+ * last read, so the free count is the single subtraction
+ * cached_cons - cached_prod rather than consumer - cached_prod + r->size.
+ * The consumer index is only re-read when the cached view shows fewer than
+ * nb free entries; the count returned after such a refresh may still be
+ * smaller than nb.
+ */
+static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
+{
+       __u32 nb_free = r->cached_cons - r->cached_prod;
+
+       if (nb_free < nb) {
+               /* Cached view is too small: refresh it from the consumer
+                * index, keeping the r->size bias.
+                */
+               r->cached_cons = *r->consumer + r->size;
+               nb_free = r->cached_cons - r->cached_prod;
+       }
+
+       return nb_free;
+}
+
+/* Return how many entries can be consumed from ring r, capped at nb.
+ *
+ * The producer index is only re-read when the cached view shows no entries
+ * at all.
+ */
+static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
+{
+       __u32 avail = r->cached_prod - r->cached_cons;
+
+       if (!avail) {
+               r->cached_prod = *r->producer;
+               avail = r->cached_prod - r->cached_cons;
+       }
+
+       if (avail > nb)
+               avail = nb;
+
+       return avail;
+}
+
+/* Reserve nb consecutive slots on a producer ring.
+ *
+ * On success the index of the first reserved slot is stored in *idx, the
+ * cached producer index is advanced by nb and nb is returned. When fewer
+ * than nb slots are free, 0 is returned and *idx is left untouched; there
+ * are no partial reservations.
+ */
+static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod,
+                                           size_t nb, __u32 *idx)
+{
+       /* Plain test on purpose: this header does not define unlikely(),
+        * so using it here would force every includer to supply one.
+        */
+       if (xsk_prod_nb_free(prod, nb) < nb)
+               return 0;
+
+       *idx = prod->cached_prod;
+       prod->cached_prod += nb;
+
+       return nb;
+}
+
+/* Publish nb entries on a producer ring by advancing the producer index
+ * that prod->producer points to.
+ *
+ * NOTE(review): smp_wmb() is not defined in this header; it has to come
+ * from the includer's environment - confirm where.
+ */
+static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb)
+{
+       /* Make sure everything has been written to the ring before signalling
+        * this to the kernel.
+        */
+       smp_wmb();
+
+       *prod->producer += nb;
+}
+
+/* Claim up to nb entries from a consumer ring.
+ *
+ * Returns the number of entries claimed, which may be fewer than nb. When
+ * it is non-zero, the index of the first claimed entry is stored in *idx
+ * and the cached consumer index is advanced past the claimed entries. When
+ * it is zero, *idx is left untouched.
+ */
+static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons,
+                                        size_t nb, __u32 *idx)
+{
+       size_t entries = xsk_cons_nb_avail(cons, nb);
+
+       /* Plain test on purpose: this header does not define likely(),
+        * so using it here would force every includer to supply one.
+        */
+       if (entries > 0) {
+               /* Make sure we do not speculatively read the data before
+                * we have received the packet buffers from the ring.
+                */
+               smp_rmb();
+
+               *idx = cons->cached_cons;
+               cons->cached_cons += entries;
+       }
+
+       return entries;
+}
+
+/* Hand nb entries back on a consumer ring by advancing the consumer index
+ * that cons->consumer points to.
+ *
+ * NOTE(review): unlike xsk_ring_prod__submit() there is no barrier before
+ * the index update - confirm whether reads of the released entries need to
+ * be ordered before it on weakly ordered CPUs.
+ */
+static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb)
+{
+       *cons->consumer += nb;
+}
+
+/* Return the address that lies addr bytes past the start of umem_area. */
+static inline void *xsk_umem__get_data(void *umem_area, __u64 addr)
+{
+       char *base = (char *)umem_area;
+
+       return base + addr;
+}
+
+/* Accessors for the opaque handles. Presumably each returns the file
+ * descriptor stored in the handle - the implementations are not part of
+ * this header, confirm there.
+ */
+LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem);
+LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
+
+/* Default values for the configuration structs below. Presumably these are
+ * what a NULL config selects - confirm against the create functions.
+ * The frame size is derived from the shift: 1 << 11 == 2048 bytes.
+ */
+#define XSK_RING_CONS__DEFAULT_NUM_DESCS      2048
+#define XSK_RING_PROD__DEFAULT_NUM_DESCS      2048
+#define XSK_UMEM__DEFAULT_FRAME_SHIFT    11 /* 2048 bytes */
+#define XSK_UMEM__DEFAULT_FRAME_SIZE     (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
+#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
+
+/* Configuration passed to xsk_umem__create().
+ *   fill_size, comp_size - number of __u64 entries in the fill and
+ *             completion rings
+ *   frame_size - presumably the size in bytes of one umem frame
+ *             (TODO confirm against xsk_umem__create())
+ *   frame_headroom - presumably bytes reserved at the start of each frame
+ *             (TODO confirm against xsk_umem__create())
+ */
+struct xsk_umem_config {
+       __u32 fill_size;
+       __u32 comp_size;
+       __u32 frame_size;
+       __u32 frame_headroom;
+};
+
+/* Flags for the libbpf_flags field. */
+#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0)
+
+/* Configuration passed to xsk_socket__create().
+ *   rx_size, tx_size - number of struct xdp_desc entries in the rx and tx
+ *             rings
+ *   libbpf_flags - bitwise OR of XSK_LIBBPF_FLAGS__* values
+ *   xdp_flags, bind_flags - not interpreted in this header; presumably
+ *             forwarded when the XDP program is attached and the socket is
+ *             bound (TODO confirm against xsk_socket__create())
+ */
+struct xsk_socket_config {
+       __u32 rx_size;
+       __u32 tx_size;
+       __u32 libbpf_flags;
+       __u32 xdp_flags;
+       __u16 bind_flags;
+};
+
+/* Creation functions. Each returns an int status and hands the new object
+ * back through its first argument; the caller supplies the ring structs
+ * (fill/comp for a umem, rx/tx for a socket).
+ *
+ * Set config to NULL to get the default configuration.
+ */
+LIBBPF_API int xsk_umem__create(struct xsk_umem **umem,
+                               void *umem_area, __u64 size,
+                               struct xsk_ring_prod *fill,
+                               struct xsk_ring_cons *comp,
+                               const struct xsk_umem_config *config);
+LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk,
+                                 const char *ifname, __u32 queue_id,
+                                 struct xsk_umem *umem,
+                                 struct xsk_ring_cons *rx,
+                                 struct xsk_ring_prod *tx,
+                                 const struct xsk_socket_config *config);
+
+/* Returns 0 for success and -EBUSY if the umem is still in use.
+ * A NULL umem is accepted and also returns 0.
+ */
+LIBBPF_API int xsk_umem__delete(struct xsk_umem *umem);
+/* A NULL xsk is accepted and is a no-op. Deleting a socket drops its
+ * reference on the umem it was created with, so sockets must be deleted
+ * before xsk_umem__delete() can succeed on that umem.
+ */
+LIBBPF_API void xsk_socket__delete(struct xsk_socket *xsk);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __LIBBPF_XSK_H */