L: linux-api@vger.kernel.org
F: include/linux/syscalls.h
F: kernel/sys_ni.c
- F: include/uapi/
- F: arch/*/include/uapi/
+ X: include/uapi/
+ X: arch/*/include/uapi/
ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
M: Hans de Goede <hdegoede@redhat.com>
M: Christian Brauner <christian@brauner.io>
M: Hridya Valsaraju <hridya@google.com>
M: Suren Baghdasaryan <surenb@google.com>
- L: devel@driverdev.osuosl.org
+ L: linux-kernel@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
F: drivers/android/
F: Documentation/devicetree/bindings/i2c/i2c-owl.yaml
F: Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.yaml
F: Documentation/devicetree/bindings/mmc/owl-mmc.yaml
+F: Documentation/devicetree/bindings/net/actions,owl-emac.yaml
F: Documentation/devicetree/bindings/pinctrl/actions,*
F: Documentation/devicetree/bindings/power/actions,owl-sps.txt
F: Documentation/devicetree/bindings/timer/actions,owl-timer.txt
F: drivers/i2c/busses/i2c-owl.c
F: drivers/irqchip/irq-owl-sirq.c
F: drivers/mmc/host/owl-mmc.c
+F: drivers/net/ethernet/actions/
F: drivers/pinctrl/actions/*
F: drivers/soc/actions/
F: include/dt-bindings/power/owl-*
T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
F: Documentation/bpf/
F: Documentation/networking/filter.rst
+F: Documentation/userspace-api/ebpf/
F: arch/*/net/*
F: include/linux/bpf*
F: include/linux/filter.h
F: net/sched/act_bpf.c
F: net/sched/cls_bpf.c
F: samples/bpf/
+F: scripts/bpf_doc.py
F: tools/bpf/
F: tools/lib/bpf/
F: tools/testing/selftests/bpf/
F: drivers/net/ethernet/freescale/dpaa2/dpni*
DPAA2 ETHERNET SWITCH DRIVER
-M: Ioana Radulescu <ruxandra.radulescu@nxp.com>
M: Ioana Ciornei <ioana.ciornei@nxp.com>
-L: linux-kernel@vger.kernel.org
+L: netdev@vger.kernel.org
S: Maintained
-F: drivers/staging/fsl-dpaa2/ethsw
+F: drivers/net/ethernet/freescale/dpaa2/dpaa2-switch*
+F: drivers/net/ethernet/freescale/dpaa2/dpsw*
DPT_I2O SCSI RAID DRIVER
M: Adaptec OEM Raid Solutions <aacraid@microsemi.com>
M: Daniel Vetter <daniel@ffwll.ch>
L: dri-devel@lists.freedesktop.org
S: Maintained
- B: https://bugs.freedesktop.org/
+ B: https://gitlab.freedesktop.org/drm
C: irc://chat.freenode.net/dri-devel
T: git git://anongit.freedesktop.org/drm/drm
F: Documentation/devicetree/bindings/display/
HISILICON STAGING DRIVERS FOR HIKEY 960/970
M: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
- L: devel@driverdev.osuosl.org
S: Maintained
F: drivers/staging/hikey9xx/
M: Dany Madden <drt@linux.ibm.com>
M: Lijun Pan <ljp@linux.ibm.com>
M: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
+ R: Thomas Falcon <tlfalcon@linux.ibm.com>
L: netdev@vger.kernel.org
S: Supported
F: drivers/net/ethernet/ibm/ibmvnic.*
M: Mat Martineau <mathew.j.martineau@linux.intel.com>
M: Matthieu Baerts <matthieu.baerts@tessares.net>
L: netdev@vger.kernel.org
- L: mptcp@lists.01.org
+ L: mptcp@lists.linux.dev
S: Maintained
W: https://github.com/multipath-tcp/mptcp_net-next/wiki
B: https://github.com/multipath-tcp/mptcp_net-next/issues
QLOGIC QLGE 10Gb ETHERNET DRIVER
M: Manish Chopra <manishc@marvell.com>
M: GR-Linux-NIC-Dev@marvell.com
- L: netdev@vger.kernel.org
- S: Supported
- F: drivers/staging/qlge/
-
- QLOGIC QLGE 10Gb ETHERNET DRIVER
M: Coiby Xu <coiby.xu@gmail.com>
L: netdev@vger.kernel.org
- S: Maintained
+ S: Supported
F: Documentation/networking/device_drivers/qlogic/qlge.rst
+ F: drivers/staging/qlge/
QM1D1B0004 MEDIA DRIVER
M: Akihiro Tsukada <tskd08@gmail.com>
SPIDERNET NETWORK DRIVER for CELL
M: Ishizaki Kou <kou.ishizaki@toshiba.co.jp>
+ M: Geoff Levand <geoff@infradead.org>
L: netdev@vger.kernel.org
- S: Supported
+ L: linuxppc-dev@lists.ozlabs.org
+ S: Maintained
F: Documentation/networking/device_drivers/ethernet/toshiba/spider_net.rst
F: drivers/net/ethernet/toshiba/spider_net*
STAGING SUBSYSTEM
M: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- L: devel@driverdev.osuosl.org
+ L: linux-staging@lists.linux.dev
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
F: drivers/staging/
M: Martyn Welch <martyn@welchs.me.uk>
M: Manohar Vanga <manohar.vanga@gmail.com>
M: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- L: devel@driverdev.osuosl.org
+ L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
F: Documentation/driver-api/vme.rst
F: drivers/infiniband/hw/vmw_pvrdma/
VMware PVSCSI driver
- M: Jim Gill <jgill@vmware.com>
+ M: Vishal Bhakta <vbhakta@vmware.com>
M: VMware PV-Drivers <pv-drivers@vmware.com>
L: linux-scsi@vger.kernel.org
S: Maintained
#include <linux/module.h>
#include <linux/atmdev.h>
#include <linux/sonet.h>
-#include <linux/atm_suni.h>
#include <linux/dma-mapping.h>
#include <linux/delay.h>
#include <linux/firmware.h>
MODULE_AUTHOR("Christophe Lizzi - credits to Uwe Dannowski and Heikki Vatiainen");
MODULE_DESCRIPTION("FORE Systems 200E-series ATM driver - version " FORE200E_VERSION);
- MODULE_SUPPORTED_DEVICE("PCA-200E, SBA-200E");
-
static const int fore200e_rx_buf_nbr[ BUFFER_SCHEME_NBR ][ BUFFER_MAGN_NBR ] = {
{ BUFFER_S1_NBR, BUFFER_L1_NBR },
b53_write8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, mgmt);
}
-static void b53_enable_vlan(struct b53_device *dev, bool enable,
+static void b53_enable_vlan(struct b53_device *dev, int port, bool enable,
bool enable_filtering)
{
u8 mgmt, vc0, vc1, vc4 = 0, vc5;
b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt);
dev->vlan_enabled = enable;
+
+ dev_dbg(dev->dev, "Port %d VLAN enabled: %d, filtering: %d\n",
+ port, enable, enable_filtering);
}
static int b53_set_jumbo(struct b53_device *dev, bool enable, bool allow_10_100)
b53_do_vlan_op(dev, VTA_CMD_CLEAR);
}
- b53_enable_vlan(dev, dev->vlan_enabled, ds->vlan_filtering);
+ b53_enable_vlan(dev, -1, dev->vlan_enabled, ds->vlan_filtering);
b53_for_each_port(dev, i)
b53_write16(dev, B53_VLAN_PAGE,
b53_disable_port(ds, port);
}
- /* Let DSA handle the case were multiple bridges span the same switch
- * device and different VLAN awareness settings are requested, which
- * would be breaking filtering semantics for any of the other bridge
- * devices. (not hardware supported)
- */
- ds->vlan_filtering_is_global = true;
-
return b53_setup_devlink_resources(ds);
}
{
struct b53_device *dev = ds->priv;
- b53_enable_vlan(dev, dev->vlan_enabled, vlan_filtering);
+ b53_enable_vlan(dev, port, dev->vlan_enabled, vlan_filtering);
return 0;
}
if (vlan->vid >= dev->num_vlans)
return -ERANGE;
- b53_enable_vlan(dev, true, ds->vlan_filtering);
+ b53_enable_vlan(dev, port, true, ds->vlan_filtering);
return 0;
}
{
struct b53_device *dev = ds->priv;
- /* Older models (5325, 5365) support a different tag format that we do
- * not support in net/dsa/tag_brcm.c yet.
- */
- if (is5325(dev) || is5365(dev) ||
- !b53_can_enable_brcm_tags(ds, port, mprot)) {
+ if (!b53_can_enable_brcm_tags(ds, port, mprot)) {
dev->tag_protocol = DSA_TAG_PROTO_NONE;
goto out;
}
+ /* Older models require a different 6 byte tag */
+ if (is5325(dev) || is5365(dev) || is63xx(dev)) {
+ dev->tag_protocol = DSA_TAG_PROTO_BRCM_LEGACY;
+ goto out;
+ }
+
/* Broadcom BCM58xx chips have a flow accelerator on Port 8
* which requires us to use the prepended Broadcom tag type
*/
ds->ops = &b53_switch_ops;
ds->untag_bridge_pvid = true;
dev->vlan_enabled = true;
+ /* Let DSA handle the case were multiple bridges span the same switch
+ * device and different VLAN awareness settings are requested, which
+ * would be breaking filtering semantics for any of the other bridge
+ * devices. (not hardware supported)
+ */
+ ds->vlan_filtering_is_global = true;
+
mutex_init(&dev->reg_mutex);
mutex_init(&dev->stats_mutex);
#include "b53/b53_priv.h"
#include "b53/b53_regs.h"
+static u16 bcm_sf2_reg_rgmii_cntrl(struct bcm_sf2_priv *priv, int port)
+{
+ switch (priv->type) {
+ case BCM4908_DEVICE_ID:
+ switch (port) {
+ case 7:
+ return REG_RGMII_11_CNTRL;
+ default:
+ break;
+ }
+ break;
+ default:
+ switch (port) {
+ case 0:
+ return REG_RGMII_0_CNTRL;
+ case 1:
+ return REG_RGMII_1_CNTRL;
+ case 2:
+ return REG_RGMII_2_CNTRL;
+ default:
+ break;
+ }
+ }
+
+ WARN_ONCE(1, "Unsupported port %d\n", port);
+
+ /* RO fallback reg */
+ return REG_SWITCH_STATUS;
+}
+
/* Return the number of active ports, not counting the IMP (CPU) port */
static unsigned int bcm_sf2_num_active_ports(struct dsa_switch *ds)
{
/* Force link status for IMP port */
reg = core_readl(priv, offset);
reg |= (MII_SW_OR | LINK_STS);
- reg &= ~GMII_SPEED_UP_2G;
+ if (priv->type == BCM4908_DEVICE_ID)
+ reg |= GMII_SPEED_UP_2G;
+ else
+ reg &= ~GMII_SPEED_UP_2G;
core_writel(priv, reg, offset);
/* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
return 0;
}
+static void bcm_sf2_crossbar_setup(struct bcm_sf2_priv *priv)
+{
+ struct device *dev = priv->dev->ds->dev;
+ int shift;
+ u32 mask;
+ u32 reg;
+ int i;
+
+ mask = BIT(priv->num_crossbar_int_ports) - 1;
+
+ reg = reg_readl(priv, REG_CROSSBAR);
+ switch (priv->type) {
+ case BCM4908_DEVICE_ID:
+ shift = CROSSBAR_BCM4908_INT_P7 * priv->num_crossbar_int_ports;
+ reg &= ~(mask << shift);
+ if (0) /* FIXME */
+ reg |= CROSSBAR_BCM4908_EXT_SERDES << shift;
+ else if (priv->int_phy_mask & BIT(7))
+ reg |= CROSSBAR_BCM4908_EXT_GPHY4 << shift;
+ else if (phy_interface_mode_is_rgmii(priv->port_sts[7].mode))
+ reg |= CROSSBAR_BCM4908_EXT_RGMII << shift;
+ else if (WARN(1, "Invalid port mode\n"))
+ return;
+ break;
+ default:
+ return;
+ }
+ reg_writel(priv, reg, REG_CROSSBAR);
+
+ reg = reg_readl(priv, REG_CROSSBAR);
+ for (i = 0; i < priv->num_crossbar_int_ports; i++) {
+ shift = i * priv->num_crossbar_int_ports;
+
+ dev_dbg(dev, "crossbar int port #%d - ext port #%d\n", i,
+ (reg >> shift) & mask);
+ }
+}
+
static void bcm_sf2_intr_disable(struct bcm_sf2_priv *priv)
{
intrl2_0_mask_set(priv, 0xffffffff);
static void bcm_sf2_identify_ports(struct bcm_sf2_priv *priv,
struct device_node *dn)
{
+ struct device *dev = priv->dev->ds->dev;
+ struct bcm_sf2_port_status *port_st;
struct device_node *port;
unsigned int port_num;
struct property *prop;
- phy_interface_t mode;
int err;
priv->moca_port = -1;
if (of_property_read_u32(port, "reg", &port_num))
continue;
+ if (port_num >= DSA_MAX_PORTS) {
+ dev_err(dev, "Invalid port number %d\n", port_num);
+ continue;
+ }
+
+ port_st = &priv->port_sts[port_num];
+
/* Internal PHYs get assigned a specific 'phy-mode' property
* value: "internal" to help flag them before MDIO probing
* has completed, since they might be turned off at that
* time
*/
- err = of_get_phy_mode(port, &mode);
+ err = of_get_phy_mode(port, &port_st->mode);
if (err)
continue;
- if (mode == PHY_INTERFACE_MODE_INTERNAL)
+ if (port_st->mode == PHY_INTERFACE_MODE_INTERNAL)
priv->int_phy_mask |= 1 << port_num;
- if (mode == PHY_INTERFACE_MODE_MOCA)
+ if (port_st->mode == PHY_INTERFACE_MODE_MOCA)
priv->moca_port = port_num;
if (of_property_read_bool(port, "brcm,use-bcm-hdr"))
* in bits 15:8 and the patch level in bits 7:0 which is exactly what
* the REG_PHY_REVISION register layout is.
*/
-
- return priv->hw_params.gphy_rev;
+ if (priv->int_phy_mask & BIT(port))
+ return priv->hw_params.gphy_rev;
+ else
+ return 0;
}
static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port,
{
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
u32 id_mode_dis = 0, port_mode;
+ u32 reg_rgmii_ctrl;
u32 reg;
if (port == core_readl(priv, CORE_IMP0_PRT_ID))
return;
}
+ reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
+
/* Clear id_mode_dis bit, and the existing port mode, let
* RGMII_MODE_EN bet set by mac_link_{up,down}
*/
- reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
+ reg = reg_readl(priv, reg_rgmii_ctrl);
reg &= ~ID_MODE_DIS;
reg &= ~(PORT_MODE_MASK << PORT_MODE_SHIFT);
if (id_mode_dis)
reg |= ID_MODE_DIS;
- reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
+ reg_writel(priv, reg, reg_rgmii_ctrl);
}
static void bcm_sf2_sw_mac_link_set(struct dsa_switch *ds, int port,
phy_interface_t interface, bool link)
{
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+ u32 reg_rgmii_ctrl;
u32 reg;
if (!phy_interface_mode_is_rgmii(interface) &&
interface != PHY_INTERFACE_MODE_REVMII)
return;
+ reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
+
/* If the link is down, just disable the interface to conserve power */
- reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
+ reg = reg_readl(priv, reg_rgmii_ctrl);
if (link)
reg |= RGMII_MODE_EN;
else
reg &= ~RGMII_MODE_EN;
- reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
+ reg_writel(priv, reg, reg_rgmii_ctrl);
}
static void bcm_sf2_sw_mac_link_down(struct dsa_switch *ds, int port,
{
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
struct ethtool_eee *p = &priv->dev->ports[port].eee;
- u32 reg, offset;
bcm_sf2_sw_mac_link_set(ds, port, interface, true);
if (port != core_readl(priv, CORE_IMP0_PRT_ID)) {
+ u32 reg_rgmii_ctrl;
+ u32 reg, offset;
+
+ reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
+
if (priv->type == BCM4908_DEVICE_ID ||
priv->type == BCM7445_DEVICE_ID)
offset = CORE_STS_OVERRIDE_GMIIP_PORT(port);
interface == PHY_INTERFACE_MODE_RGMII_TXID ||
interface == PHY_INTERFACE_MODE_MII ||
interface == PHY_INTERFACE_MODE_REVMII) {
- reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
+ reg = reg_readl(priv, reg_rgmii_ctrl);
reg &= ~(RX_PAUSE_EN | TX_PAUSE_EN);
if (tx_pause)
if (rx_pause)
reg |= RX_PAUSE_EN;
- reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
+ reg_writel(priv, reg, reg_rgmii_ctrl);
}
reg = SW_OVERRIDE | LINK_STS;
return ret;
}
+ bcm_sf2_crossbar_setup(priv);
+
ret = bcm_sf2_cfp_resume(ds);
if (ret)
return ret;
const u16 *reg_offsets;
unsigned int core_reg_align;
unsigned int num_cfp_rules;
+ unsigned int num_crossbar_int_ports;
};
static const u16 bcm_sf2_4908_reg_offsets[] = {
[REG_PHY_REVISION] = 0x14,
[REG_SPHY_CNTRL] = 0x24,
[REG_CROSSBAR] = 0xc8,
- [REG_RGMII_0_CNTRL] = 0xe0,
- [REG_RGMII_1_CNTRL] = 0xec,
- [REG_RGMII_2_CNTRL] = 0xf8,
+ [REG_RGMII_11_CNTRL] = 0x014c,
[REG_LED_0_CNTRL] = 0x40,
[REG_LED_1_CNTRL] = 0x4c,
[REG_LED_2_CNTRL] = 0x58,
.type = BCM4908_DEVICE_ID,
.core_reg_align = 0,
.reg_offsets = bcm_sf2_4908_reg_offsets,
- .num_cfp_rules = 0, /* FIXME */
+ .num_cfp_rules = 256,
+ .num_crossbar_int_ports = 2,
};
/* Register offsets for the SWITCH_REG_* block */
priv->reg_offsets = data->reg_offsets;
priv->core_reg_align = data->core_reg_align;
priv->num_cfp_rules = data->num_cfp_rules;
+ priv->num_crossbar_int_ports = data->num_crossbar_int_ports;
priv->rcdev = devm_reset_control_get_optional_exclusive(&pdev->dev,
"switch");
goto out_clk_mdiv;
}
+ bcm_sf2_crossbar_setup(priv);
+
bcm_sf2_gphy_enable_set(priv->dev->ds, true);
ret = bcm_sf2_mdio_register(ds);
TD_DM_DRVP(8) | TD_DM_DRVN(8));
/* Setup core clock for MT7530 */
- if (!trgint) {
- /* Disable MT7530 core clock */
- core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
-
- /* Disable PLL, since phy_device has not yet been created
- * provided for phy_[read,write]_mmd_indirect is called, we
- * provide our own core_write_mmd_indirect to complete this
- * function.
- */
- core_write_mmd_indirect(priv,
- CORE_GSWPLL_GRP1,
- MDIO_MMD_VEND2,
- 0);
-
- /* Set core clock into 500Mhz */
- core_write(priv, CORE_GSWPLL_GRP2,
- RG_GSWPLL_POSDIV_500M(1) |
- RG_GSWPLL_FBKDIV_500M(25));
-
- /* Enable PLL */
- core_write(priv, CORE_GSWPLL_GRP1,
- RG_GSWPLL_EN_PRE |
- RG_GSWPLL_POSDIV_200M(2) |
- RG_GSWPLL_FBKDIV_200M(32));
-
- /* Enable MT7530 core clock */
- core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
- }
+ /* Disable MT7530 core clock */
+ core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
+
+ /* Disable PLL, since phy_device has not yet been created
+ * provided for phy_[read,write]_mmd_indirect is called, we
+ * provide our own core_write_mmd_indirect to complete this
+ * function.
+ */
+ core_write_mmd_indirect(priv,
+ CORE_GSWPLL_GRP1,
+ MDIO_MMD_VEND2,
+ 0);
+
+ /* Set core clock into 500Mhz */
+ core_write(priv, CORE_GSWPLL_GRP2,
+ RG_GSWPLL_POSDIV_500M(1) |
+ RG_GSWPLL_FBKDIV_500M(25));
+
+ /* Enable PLL */
+ core_write(priv, CORE_GSWPLL_GRP1,
+ RG_GSWPLL_EN_PRE |
+ RG_GSWPLL_POSDIV_200M(2) |
+ RG_GSWPLL_FBKDIV_200M(32));
+
+ /* Enable MT7530 core clock */
+ core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
/* Setup the MT7530 TRGMII Tx Clock */
core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
mt7530_write(priv, MT7530_PVC_P(port),
PORT_SPEC_TAG);
- /* Unknown multicast frame forwarding to the cpu port */
- mt7530_rmw(priv, MT7530_MFC, UNM_FFP_MASK, UNM_FFP(BIT(port)));
+ /* Disable flooding by default */
+ mt7530_rmw(priv, MT7530_MFC, BC_FFP_MASK | UNM_FFP_MASK | UNU_FFP_MASK,
+ BC_FFP(BIT(port)) | UNM_FFP(BIT(port)) | UNU_FFP(BIT(port)));
/* Set CPU port number */
if (priv->id == ID_MT7621)
mt7530_rmw(priv, MT7530_SSP_P(port), FID_PST_MASK, stp_state);
}
+static int
+mt7530_port_pre_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ if (flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
+ BR_BCAST_FLOOD))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int
+mt7530_port_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct mt7530_priv *priv = ds->priv;
+
+ if (flags.mask & BR_LEARNING)
+ mt7530_rmw(priv, MT7530_PSC_P(port), SA_DIS,
+ flags.val & BR_LEARNING ? 0 : SA_DIS);
+
+ if (flags.mask & BR_FLOOD)
+ mt7530_rmw(priv, MT7530_MFC, UNU_FFP(BIT(port)),
+ flags.val & BR_FLOOD ? UNU_FFP(BIT(port)) : 0);
+
+ if (flags.mask & BR_MCAST_FLOOD)
+ mt7530_rmw(priv, MT7530_MFC, UNM_FFP(BIT(port)),
+ flags.val & BR_MCAST_FLOOD ? UNM_FFP(BIT(port)) : 0);
+
+ if (flags.mask & BR_BCAST_FLOOD)
+ mt7530_rmw(priv, MT7530_MFC, BC_FFP(BIT(port)),
+ flags.val & BR_BCAST_FLOOD ? BC_FFP(BIT(port)) : 0);
+
+ return 0;
+}
+
+static int
+mt7530_port_set_mrouter(struct dsa_switch *ds, int port, bool mrouter,
+ struct netlink_ext_ack *extack)
+{
+ struct mt7530_priv *priv = ds->priv;
+
+ mt7530_rmw(priv, MT7530_MFC, UNM_FFP(BIT(port)),
+ mrouter ? UNM_FFP(BIT(port)) : 0);
+
+ return 0;
+}
+
static int
mt7530_port_bridge_join(struct dsa_switch *ds, int port,
struct net_device *bridge)
return 0;
}
+static int
+mt7530_port_mdb_add(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct mt7530_priv *priv = ds->priv;
+ const u8 *addr = mdb->addr;
+ u16 vid = mdb->vid;
+ u8 port_mask = 0;
+ int ret;
+
+ mutex_lock(&priv->reg_mutex);
+
+ mt7530_fdb_write(priv, vid, 0, addr, 0, STATIC_EMP);
+ if (!mt7530_fdb_cmd(priv, MT7530_FDB_READ, NULL))
+ port_mask = (mt7530_read(priv, MT7530_ATRD) >> PORT_MAP)
+ & PORT_MAP_MASK;
+
+ port_mask |= BIT(port);
+ mt7530_fdb_write(priv, vid, port_mask, addr, -1, STATIC_ENT);
+ ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, NULL);
+
+ mutex_unlock(&priv->reg_mutex);
+
+ return ret;
+}
+
+static int
+mt7530_port_mdb_del(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct mt7530_priv *priv = ds->priv;
+ const u8 *addr = mdb->addr;
+ u16 vid = mdb->vid;
+ u8 port_mask = 0;
+ int ret;
+
+ mutex_lock(&priv->reg_mutex);
+
+ mt7530_fdb_write(priv, vid, 0, addr, 0, STATIC_EMP);
+ if (!mt7530_fdb_cmd(priv, MT7530_FDB_READ, NULL))
+ port_mask = (mt7530_read(priv, MT7530_ATRD) >> PORT_MAP)
+ & PORT_MAP_MASK;
+
+ port_mask &= ~BIT(port);
+ mt7530_fdb_write(priv, vid, port_mask, addr, -1,
+ port_mask ? STATIC_ENT : STATIC_EMP);
+ ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, NULL);
+
+ mutex_unlock(&priv->reg_mutex);
+
+ return ret;
+}
+
static int
mt7530_vlan_cmd(struct mt7530_priv *priv, enum mt7530_vlan_cmd cmd, u16 vid)
{
ret = mt753x_cpu_port_enable(ds, i);
if (ret)
return ret;
- } else
+ } else {
mt7530_port_disable(ds, i);
+ /* Disable learning by default on all user ports */
+ mt7530_set(priv, MT7530_PSC_P(i), SA_DIS);
+ }
/* Enable consistent egress tag */
mt7530_rmw(priv, MT7530_PVC_P(i), PVC_EG_TAG_MASK,
PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT));
ret = mt753x_cpu_port_enable(ds, i);
if (ret)
return ret;
- } else
+ } else {
mt7530_port_disable(ds, i);
+ /* Disable learning by default on all user ports */
+ mt7530_set(priv, MT7530_PSC_P(i), SA_DIS);
+ }
+
/* Enable consistent egress tag */
mt7530_rmw(priv, MT7530_PVC_P(i), PVC_EG_TAG_MASK,
PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT));
.port_change_mtu = mt7530_port_change_mtu,
.port_max_mtu = mt7530_port_max_mtu,
.port_stp_state_set = mt7530_stp_state_set,
+ .port_pre_bridge_flags = mt7530_port_pre_bridge_flags,
+ .port_bridge_flags = mt7530_port_bridge_flags,
+ .port_set_mrouter = mt7530_port_set_mrouter,
.port_bridge_join = mt7530_port_bridge_join,
.port_bridge_leave = mt7530_port_bridge_leave,
.port_fdb_add = mt7530_port_fdb_add,
.port_fdb_del = mt7530_port_fdb_del,
.port_fdb_dump = mt7530_port_fdb_dump,
+ .port_mdb_add = mt7530_port_mdb_add,
+ .port_mdb_del = mt7530_port_mdb_del,
.port_vlan_filtering = mt7530_port_vlan_filtering,
.port_vlan_add = mt7530_port_vlan_add,
.port_vlan_del = mt7530_port_vlan_del,
#include <linux/pm_runtime.h>
#include <linux/aer.h>
#include <linux/prefetch.h>
+#include <linux/suspend.h>
#include "e1000.h"
struct e1000_adapter *adapter;
adapter = container_of(work, struct e1000_adapter, reset_task);
+ rtnl_lock();
/* don't run the task if already down */
- if (test_bit(__E1000_DOWN, &adapter->state))
+ if (test_bit(__E1000_DOWN, &adapter->state)) {
+ rtnl_unlock();
return;
+ }
if (!(adapter->flags & FLAG_RESTART_NOW)) {
e1000e_dump(adapter);
e_err("Reset adapter unexpectedly\n");
}
e1000e_reinit_locked(adapter);
+ rtnl_unlock();
}
/**
- * e1000_get_stats64 - Get System Network Statistics
+ * e1000e_get_stats64 - Get System Network Statistics
* @netdev: network interface device structure
* @stats: rtnl_link_stats64 pointer
*
}
/**
- * e1000e_hwtstamp_ioctl - control hardware time stamping
+ * e1000e_hwtstamp_set - control hardware time stamping
* @netdev: network interface device structure
* @ifr: interface request
*
}
/**
- * e1000e_disable_aspm_locked Disable ASPM states.
+ * e1000e_disable_aspm_locked - Disable ASPM states.
* @pdev: pointer to PCI device struct
* @state: bit-mask of ASPM states to disable
*
return 0;
}
+static __maybe_unused int e1000e_pm_prepare(struct device *dev)
+{
+ return pm_runtime_suspended(dev) &&
+ pm_suspend_via_firmware();
+}
+
static __maybe_unused int e1000e_pm_suspend(struct device *dev)
{
struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev));
e1000_print_device_info(adapter);
- dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NO_DIRECT_COMPLETE);
+ dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE);
- if (pci_dev_run_wake(pdev) && hw->mac.type < e1000_pch_cnp)
+ if (pci_dev_run_wake(pdev) && hw->mac.type != e1000_pch_cnp)
pm_runtime_put_noidle(&pdev->dev);
return 0;
static const struct dev_pm_ops e1000_pm_ops = {
#ifdef CONFIG_PM_SLEEP
+ .prepare = e1000e_pm_prepare,
.suspend = e1000e_pm_suspend,
.resume = e1000e_pm_resume,
.freeze = e1000e_pm_freeze,
}
/**
- * i40e_next_entry - Get the next non-broadcast filter from a list
+ * i40e_next_filter - Get the next non-broadcast filter from a list
* @next: pointer to filter in list
*
* Returns the next non-broadcast filter in the list. Required so that we
return 0;
}
+ /**
+ * i40e_rx_offset - Return expected offset into page to access data
+ * @rx_ring: Ring we are requesting offset of
+ *
+ * Returns the offset value for ring into the data buffer.
+ */
+ static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
+ {
+ return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
+ }
+
/**
* i40e_configure_rx_ring - Configure a receive ring context
* @ring: The Rx ring to configure
else
set_ring_build_skb_enabled(ring);
+ ring->rx_offset = i40e_rx_offset(ring);
+
/* cache tail for quicker writes, and clear the reg before use */
ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
writel(0, ring->tail);
}
/**
- * i40e_pf_get_pf_tc_map - Get bitmap for enabled traffic classes
+ * i40e_pf_get_tc_map - Get bitmap for enabled traffic classes
* @pf: PF being queried
*
* Return a bitmap for enabled traffic classes for this PF.
}
/**
- * i40e_get_current_atr_count - Get the count of total FD ATR filters programmed
+ * i40e_get_current_atr_cnt - Get the count of total FD ATR filters programmed
* @pf: board private structure
**/
u32 i40e_get_current_atr_cnt(struct i40e_pf *pf)
}
}
- /**
- * i40e_rx_offset - Return expected offset into page to access data
- * @rx_ring: Ring we are requesting offset of
- *
- * Returns the offset value for ring into the data buffer.
- */
- static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
- {
- return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
- }
-
/**
* i40e_setup_rx_descriptors - Allocate Rx descriptors
* @rx_ring: Rx descriptor ring (for a specific queue) to setup
rx_ring->next_to_alloc = 0;
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
- rx_ring->rx_offset = i40e_rx_offset(rx_ring);
/* XDP RX-queue info only needed for RX rings exposed to XDP */
if (rx_ring->vsi->type == I40E_VSI_MAIN) {
}
/**
- * i40e_create_tx_ctx Build the Tx context descriptor
+ * i40e_create_tx_ctx - Build the Tx context descriptor
* @tx_ring: ring to create the descriptor on
* @cd_type_cmd_tso_mss: Quad Word 1
* @cd_tunneling: Quad Word 0 - bits 0-31
}
}
- /**
- * ice_rx_offset - Return expected offset into page to access data
- * @rx_ring: Ring we are requesting offset of
- *
- * Returns the offset value for ring into the data buffer.
- */
- static unsigned int ice_rx_offset(struct ice_ring *rx_ring)
- {
- if (ice_ring_uses_build_skb(rx_ring))
- return ICE_SKB_PAD;
- else if (ice_is_xdp_ena_vsi(rx_ring->vsi))
- return XDP_PACKET_HEADROOM;
-
- return 0;
- }
-
/**
* ice_setup_rx_ring - Allocate the Rx descriptors
* @rx_ring: the Rx ring to set up
rx_ring->next_to_use = 0;
rx_ring->next_to_clean = 0;
- rx_ring->rx_offset = ice_rx_offset(rx_ring);
if (ice_is_xdp_ena_vsi(rx_ring->vsi))
WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
dma_rmb();
if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) {
+ struct ice_vsi *ctrl_vsi = rx_ring->vsi;
+
+ if (rx_desc->wb.rxdid == FDIR_DESC_RXDID &&
+ ctrl_vsi->vf_id != ICE_INVAL_VFID)
+ ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc);
ice_put_rx_buf(rx_ring, NULL, 0);
cleaned_count++;
continue;
* This function allocates a number of Rx buffers from the fill ring
* or the internal recycle mechanism and places them on the Rx ring.
*
- * Returns false if all allocations were successful, true if any fail.
+ * Returns true if all allocations were successful, false if any fail.
*/
bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
{
union ice_32b_rx_flex_desc *rx_desc;
u16 ntu = rx_ring->next_to_use;
struct ice_rx_buf *rx_buf;
- bool ret = false;
+ bool ok = true;
dma_addr_t dma;
if (!count)
- return false;
+ return true;
rx_desc = ICE_RX_DESC(rx_ring, ntu);
rx_buf = &rx_ring->rx_buf[ntu];
do {
rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_pool);
if (!rx_buf->xdp) {
- ret = true;
+ ok = false;
break;
}
ice_release_rx_desc(rx_ring, ntu);
}
- return ret;
+ return ok;
}
/**
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
+
+ if (likely(act == XDP_REDIRECT)) {
+ err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
+ result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED;
+ rcu_read_unlock();
+ return result;
+ }
+
switch (act) {
case XDP_PASS:
break;
xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->q_index];
result = ice_xmit_xdp_buff(xdp, xdp_ring);
break;
- case XDP_REDIRECT:
- err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
- result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED;
- break;
default:
bpf_warn_invalid_xdp_action(act);
fallthrough;
}
/**
- * Detect and switch function for Media Auto Sense
+ * igb_check_swap_media - Detect and switch function for Media Auto Sense
* @adapter: address of the board private structure
**/
static void igb_check_swap_media(struct igb_adapter *adapter)
return 0;
/* Initialize the i2c bus which is controlled by the registers.
- * This bus will use the i2c_algo_bit structue that implements
+ * This bus will use the i2c_algo_bit structure that implements
* the protocol through toggling of the 4 bits in the register.
*/
adapter->i2c_adap.owner = THIS_MODULE;
}
/**
- * igb_open - Called when a network interface is made active
+ * __igb_open - Called when a network interface is made active
* @netdev: network interface device structure
* @resuming: indicates whether we are in a resume call
*
}
/**
- * igb_close - Disables a network interface
+ * __igb_close - Disables a network interface
* @netdev: network interface device structure
* @suspending: indicates we are in a suspend call
*
*/
if (tx_ring->launchtime_enable) {
ts = ktime_to_timespec64(first->skb->tstamp);
- first->skb->tstamp = ktime_set(0, 0);
+ skb_txtime_consumed(first->skb);
context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32);
} else {
context_desc->seqnum_seed = 0;
new_buff->pagecnt_bias = old_buff->pagecnt_bias;
}
- static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
+ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
+ int rx_buf_pgcnt)
{
unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
struct page *page = rx_buffer->page;
#if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
- if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
+ if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
return false;
#else
#define IGB_LAST_OFFSET \
return NULL;
if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) {
- igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb);
- xdp->data += IGB_TS_HDR_LEN;
- size -= IGB_TS_HDR_LEN;
+ if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb)) {
+ xdp->data += IGB_TS_HDR_LEN;
+ size -= IGB_TS_HDR_LEN;
+ }
}
/* Determine available headroom for copy */
/* pull timestamp out of packet data */
if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
- igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb);
- __skb_pull(skb, IGB_TS_HDR_LEN);
+ if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb))
+ __skb_pull(skb, IGB_TS_HDR_LEN);
}
/* update buffer offset */
}
static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
- const unsigned int size)
+ const unsigned int size, int *rx_buf_pgcnt)
{
struct igb_rx_buffer *rx_buffer;
rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
+ *rx_buf_pgcnt =
+ #if (PAGE_SIZE < 8192)
+ page_count(rx_buffer->page);
+ #else
+ 0;
+ #endif
prefetchw(rx_buffer->page);
/* we are reusing so sync this buffer for CPU use */
}
static void igb_put_rx_buffer(struct igb_ring *rx_ring,
- struct igb_rx_buffer *rx_buffer)
+ struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt)
{
- if (igb_can_reuse_rx_page(rx_buffer)) {
+ if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) {
/* hand second half of page back to the ring */
igb_reuse_rx_page(rx_ring, rx_buffer);
} else {
unsigned int xdp_xmit = 0;
struct xdp_buff xdp;
u32 frame_sz = 0;
+ int rx_buf_pgcnt;
/* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
#if (PAGE_SIZE < 8192)
*/
dma_rmb();
- rx_buffer = igb_get_rx_buffer(rx_ring, size);
+ rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
/* retrieve a buffer from the ring */
if (!skb) {
break;
}
- igb_put_rx_buffer(rx_ring, rx_buffer);
+ igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt);
cleaned_count++;
/* fetch next buffer in frame if non-eop */
dev_kfree_skb_any(skb);
}
+ #define IGB_RET_PTP_DISABLED 1
+ #define IGB_RET_PTP_INVALID 2
+
/**
* igb_ptp_rx_pktstamp - retrieve Rx per packet timestamp
* @q_vector: Pointer to interrupt specific structure
*
* This function is meant to retrieve a timestamp from the first buffer of an
* incoming frame. The value is stored in little endian format starting on
- * byte 8.
+ * byte 8
+ *
+ * Returns: 0 if success, nonzero if failure
**/
- void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
- struct sk_buff *skb)
+ int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
+ struct sk_buff *skb)
{
- __le64 *regval = (__le64 *)va;
struct igb_adapter *adapter = q_vector->adapter;
+ __le64 *regval = (__le64 *)va;
int adjust = 0;
+ if (!(adapter->ptp_flags & IGB_PTP_ENABLED))
+ return IGB_RET_PTP_DISABLED;
+
/* The timestamp is recorded in little endian format.
* DWORD: 0 1 2 3
* Field: Reserved Reserved SYSTIML SYSTIMH
*/
+
+ /* check reserved dwords are zero, be/le doesn't matter for zero */
+ if (regval[0])
+ return IGB_RET_PTP_INVALID;
+
igb_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb),
le64_to_cpu(regval[1]));
}
skb_hwtstamps(skb)->hwtstamp =
ktime_sub_ns(skb_hwtstamps(skb)->hwtstamp, adjust);
+
+ return 0;
}
/**
* This function is meant to retrieve a timestamp from the internal registers
* of the adapter and store it in the skb.
**/
- void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector,
- struct sk_buff *skb)
+ void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb)
{
struct igb_adapter *adapter = q_vector->adapter;
struct e1000_hw *hw = &adapter->hw;
- u64 regval;
int adjust = 0;
+ u64 regval;
+
+ if (!(adapter->ptp_flags & IGB_PTP_ENABLED))
+ return;
/* If this bit is set, then the RX registers contain the time stamp. No
* other packet will be time stamped until we read these registers, so
switch (config->tx_type) {
case HWTSTAMP_TX_OFF:
tsync_tx_ctl = 0;
+ break;
case HWTSTAMP_TX_ON:
break;
default:
struct igc_adapter *adapter = netdev_priv(tx_ring->netdev);
ktime_t txtime = first->skb->tstamp;
- first->skb->tstamp = ktime_set(0, 0);
+ skb_txtime_consumed(first->skb);
context_desc->launch_time = igc_tx_launchtime(adapter,
txtime);
} else {
netif_tx_start_all_queues(adapter->netdev);
/* start the watchdog. */
- hw->mac.get_link_status = 1;
+ hw->mac.get_link_status = true;
schedule_work(&adapter->watchdog_task);
}
adapter = container_of(work, struct igc_adapter, reset_task);
+ rtnl_lock();
+ /* If we're already down or resetting, just bail */
+ if (test_bit(__IGC_DOWN, &adapter->state) ||
+ test_bit(__IGC_RESETTING, &adapter->state)) {
+ rtnl_unlock();
+ return;
+ }
+
igc_rings_dump(adapter);
igc_regs_dump(adapter);
netdev_err(adapter->netdev, "Reset adapter\n");
igc_reinit_locked(adapter);
+ rtnl_unlock();
}
/**
}
if (icr & IGC_ICR_LSC) {
- hw->mac.get_link_status = 1;
+ hw->mac.get_link_status = true;
/* guard against interrupt when we're going down */
if (!test_bit(__IGC_DOWN, &adapter->state))
mod_timer(&adapter->watchdog_timer, jiffies + 1);
}
if (icr & (IGC_ICR_RXSEQ | IGC_ICR_LSC)) {
- hw->mac.get_link_status = 1;
+ hw->mac.get_link_status = true;
if (!test_bit(__IGC_DOWN, &adapter->state))
mod_timer(&adapter->watchdog_timer, jiffies + 1);
}
}
if (icr & (IGC_ICR_RXSEQ | IGC_ICR_LSC)) {
- hw->mac.get_link_status = 1;
+ hw->mac.get_link_status = true;
/* guard against interrupt when we're going down */
if (!test_bit(__IGC_DOWN, &adapter->state))
mod_timer(&adapter->watchdog_timer, jiffies + 1);
netif_tx_start_all_queues(netdev);
/* start the watchdog. */
- hw->mac.get_link_status = 1;
+ hw->mac.get_link_status = true;
schedule_work(&adapter->watchdog_task);
return IGC_SUCCESS;
{
struct igc_mac_info *mac = &adapter->hw.mac;
- mac->autoneg = 0;
+ mac->autoneg = false;
/* Make sure dplx is at most 1 bit and lsb of speed is not set
* for the switch() below to work
mac->forced_speed_duplex = ADVERTISE_100_FULL;
break;
case SPEED_1000 + DUPLEX_FULL:
- mac->autoneg = 1;
+ mac->autoneg = true;
adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL;
break;
case SPEED_1000 + DUPLEX_HALF: /* not supported */
goto err_inval;
case SPEED_2500 + DUPLEX_FULL:
- mac->autoneg = 1;
+ mac->autoneg = true;
adapter->hw.phy.autoneg_advertised = ADVERTISE_2500_FULL;
break;
case SPEED_2500 + DUPLEX_HALF: /* not supported */
}
/**
- * ixgbe_check_from_parent - Determine whether PCIe info should come from parent
+ * ixgbe_pcie_from_parent - Determine whether PCIe info should come from parent
* @hw: hw specific details
*
* This function is used by probe to determine whether a device's PCI-Express
#endif
}
+ ring->rx_offset = ixgbe_rx_offset(ring);
+
if (ring->xsk_pool && hw->mac.type != ixgbe_mac_82599EB) {
u32 xsk_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
}
/**
- * ixgbe_eee_capable - helper function to determine EEE support on X550
+ * ixgbe_set_eee_capable - helper function to determine EEE support on X550
* @adapter: board private structure
*/
static void ixgbe_set_eee_capable(struct ixgbe_adapter *adapter)
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
- rx_ring->rx_offset = ixgbe_rx_offset(rx_ring);
/* XDP RX-queue info */
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
return !(pcifunc & ~RVU_PFVF_FUNC_MASK);
}
+/* check if PF_FUNC is AF */
+static inline bool is_pffunc_af(u16 pcifunc)
+{
+ return !pcifunc;
+}
+
static inline bool is_rvu_fwdata_valid(struct rvu *rvu)
{
return (rvu->fwdata->header_magic == RVU_FWDATA_HEADER_MAGIC) &&
void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
int nixlf, u64 chan, u8 *mac_addr);
void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
- int nixlf, u64 chan, bool allmulti);
+ int nixlf, u64 chan, u8 chan_cnt,
+ bool allmulti);
void rvu_npc_disable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf);
void rvu_npc_enable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf);
void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
int npc_mcam_verify_channel(struct rvu *rvu, u16 pcifunc, u8 intf, u16 channel);
int npc_flow_steering_init(struct rvu *rvu, int blkaddr);
const char *npc_get_field_name(u8 hdr);
-bool rvu_npc_write_default_rule(struct rvu *rvu, int blkaddr, int nixlf,
- u16 pcifunc, u8 intf, struct mcam_entry *entry,
- int *entry_index);
int npc_get_bank(struct npc_mcam *mcam, int index);
void npc_mcam_enable_flows(struct rvu *rvu, u16 target);
void npc_mcam_disable_flows(struct rvu *rvu, u16 target);
u8 *intf, u8 *ena);
bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature);
u32 rvu_cgx_get_fifolen(struct rvu *rvu);
+ void *rvu_first_cgx_pdata(struct rvu *rvu);
+int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, u16 pcifunc, int nixlf,
+ int type);
+bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam, int blkaddr,
+ int index);
+
/* CPT APIs */
int rvu_cpt_lf_teardown(struct rvu *rvu, u16 pcifunc, int lf, int slot);
char __user *buffer,
size_t count, loff_t *ppos)
{
- int index, off = 0, flag = 0, go_back = 0, off_prev;
+ int index, off = 0, flag = 0, go_back = 0, len = 0;
struct rvu *rvu = filp->private_data;
int lf, pf, vf, pcifunc;
struct rvu_block block;
int bytes_not_copied;
+ int lf_str_size = 12;
int buf_size = 2048;
+ char *lfs;
char *buf;
/* don't allow partial reads */
buf = kzalloc(buf_size, GFP_KERNEL);
if (!buf)
return -ENOSPC;
- off += scnprintf(&buf[off], buf_size - 1 - off, "\npcifunc\t\t");
+
+ lfs = kzalloc(lf_str_size, GFP_KERNEL);
+ if (!lfs) {
+ kfree(buf);
+ return -ENOMEM;
+ }
+ off += scnprintf(&buf[off], buf_size - 1 - off, "%-*s", lf_str_size,
+ "pcifunc");
for (index = 0; index < BLK_COUNT; index++)
- if (strlen(rvu->hw->block[index].name))
- off += scnprintf(&buf[off], buf_size - 1 - off,
- "%*s\t", (index - 1) * 2,
- rvu->hw->block[index].name);
+ if (strlen(rvu->hw->block[index].name)) {
+ off += scnprintf(&buf[off], buf_size - 1 - off,
+ "%-*s", lf_str_size,
+ rvu->hw->block[index].name);
+ }
off += scnprintf(&buf[off], buf_size - 1 - off, "\n");
for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
continue;
if (vf) {
+ sprintf(lfs, "PF%d:VF%d", pf, vf - 1);
go_back = scnprintf(&buf[off],
buf_size - 1 - off,
- "PF%d:VF%d\t\t", pf,
- vf - 1);
+ "%-*s", lf_str_size, lfs);
} else {
+ sprintf(lfs, "PF%d", pf);
go_back = scnprintf(&buf[off],
buf_size - 1 - off,
- "PF%d\t\t", pf);
+ "%-*s", lf_str_size, lfs);
}
off += go_back;
block = rvu->hw->block[index];
if (!strlen(block.name))
continue;
- off_prev = off;
+ len = 0;
+ lfs[len] = '\0';
for (lf = 0; lf < block.lf.max; lf++) {
if (block.fn_map[lf] != pcifunc)
continue;
flag = 1;
- off += scnprintf(&buf[off], buf_size - 1
- - off, "%3d,", lf);
+ len += sprintf(&lfs[len], "%d,", lf);
}
- if (flag && off_prev != off)
- off--;
- else
- go_back++;
+
+ if (flag)
+ len--;
+ lfs[len] = '\0';
off += scnprintf(&buf[off], buf_size - 1 - off,
- "\t");
+ "%-*s", lf_str_size, lfs);
+ if (!strlen(lfs))
+ go_back += lf_str_size;
}
if (!flag)
off -= go_back;
}
bytes_not_copied = copy_to_user(buffer, buf, off);
+ kfree(lfs);
kfree(buf);
if (bytes_not_copied)
struct rvu *rvu = filp->private;
struct pci_dev *pdev = NULL;
struct mac_ops *mac_ops;
- int rvu_def_cgx_id = 0;
char cgx[10], lmac[10];
struct rvu_pfvf *pfvf;
int pf, domain, blkid;
u16 pcifunc;
domain = 2;
- mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+ mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
+ /* There can be no CGX devices at all */
+ if (!mac_ops)
+ return 0;
seq_printf(filp, "PCI dev\t\tRVU PF Func\tNIX block\t%s\tLMAC\n",
mac_ops->name);
for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
{
struct mac_ops *mac_ops;
unsigned long lmac_bmap;
- int rvu_def_cgx_id = 0;
int i, lmac_id;
char dname[20];
void *cgx;
if (!cgx_get_cgxcnt_max())
return;
- mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+ mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
if (!mac_ops)
return;
seq_printf(s, "mask 0x%x\n", ntohs(rule->mask.etype));
break;
case NPC_OUTER_VID:
- seq_printf(s, "%d ", ntohs(rule->packet.vlan_tci));
+ seq_printf(s, "0x%x ", ntohs(rule->packet.vlan_tci));
seq_printf(s, "mask 0x%x\n",
ntohs(rule->mask.vlan_tci));
break;
seq_printf(s, "\tmcam entry: %d\n", iter->entry);
rvu_dbg_npc_mcam_show_flows(s, iter);
- if (iter->intf == NIX_INTF_RX) {
+ if (is_npc_intf_rx(iter->intf)) {
target = iter->rx_action.pf_func;
pf = (target >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
seq_printf(s, "\tForward to: PF%d ", pf);
pfvf->rx_chan_cnt = 1;
pfvf->tx_chan_cnt = 1;
rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf,
- pfvf->rx_chan_base, false);
+ pfvf->rx_chan_base,
+ pfvf->rx_chan_cnt, false);
break;
}
struct nix_rx_flowkey_alg *field;
struct nix_rx_flowkey_alg tmp;
u32 key_type, valid_key;
- int l4_key_offset;
+ int l4_key_offset = 0;
if (!alg)
return -EINVAL;
rvu_npc_disable_promisc_entry(rvu, pcifunc, nixlf);
else
rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf,
- pfvf->rx_chan_base, allmulti);
+ pfvf->rx_chan_base,
+ pfvf->rx_chan_cnt, allmulti);
return 0;
}
if (err)
return err;
- rvu_npc_disable_default_entries(rvu, pcifunc, nixlf);
-
- npc_mcam_disable_flows(rvu, pcifunc);
+ rvu_npc_disable_mcam_entries(rvu, pcifunc, nixlf);
return rvu_cgx_start_stop_io(rvu, pcifunc, false);
}
#define RSVD_MCAM_ENTRIES_PER_PF 2 /* Bcast & Promisc */
#define RSVD_MCAM_ENTRIES_PER_NIXLF 1 /* Ucast for LFs */
-#define NIXLF_UCAST_ENTRY 0
-#define NIXLF_BCAST_ENTRY 1
-#define NIXLF_PROMISC_ENTRY 2
-
#define NPC_PARSE_RESULT_DMAC_OFFSET 8
#define NPC_HW_TSTAMP_OFFSET 8
#define NPC_KEX_CHAN_MASK 0xFFFULL
if (is_npc_intf_tx(intf))
return 0;
+ /* return in case of AF installed rules */
+ if (is_pffunc_af(pcifunc))
+ return 0;
+
if (is_afvf(pcifunc)) {
end = rvu_get_num_lbk_chans();
if (end < 0)
return mcam->nixlf_offset + (max + nixlf) * RSVD_MCAM_ENTRIES_PER_NIXLF;
}
-static int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
- u16 pcifunc, int nixlf, int type)
+int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
+ u16 pcifunc, int nixlf, int type)
{
int pf = rvu_get_pf(pcifunc);
int index;
return bank;
}
-static bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam,
- int blkaddr, int index)
+bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam,
+ int blkaddr, int index)
{
int bank = npc_get_bank(mcam, index);
u64 cfg;
}
void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
- int nixlf, u64 chan, bool allmulti)
+ int nixlf, u64 chan, u8 chan_cnt,
+ bool allmulti)
{
struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
+ struct npc_install_flow_req req = { 0 };
+ struct npc_install_flow_rsp rsp = { 0 };
struct npc_mcam *mcam = &rvu->hw->mcam;
- int blkaddr, ucast_idx, index, kwi;
- struct mcam_entry entry = { {0} };
- struct nix_rx_action action = { };
+ int blkaddr, ucast_idx, index;
+ u8 mac_addr[ETH_ALEN] = { 0 };
+ struct nix_rx_action action;
+ u64 relaxed_mask;
/* Only PF or AF VF can add a promiscuous entry */
if ((pcifunc & RVU_PFVF_FUNC_MASK) && !is_afvf(pcifunc))
if (blkaddr < 0)
return;
+ *(u64 *)&action = 0x00;
index = npc_get_nixlf_mcam_index(mcam, pcifunc,
nixlf, NIXLF_PROMISC_ENTRY);
- entry.kw[0] = chan;
- entry.kw_mask[0] = 0xFFFULL;
-
- if (allmulti) {
- kwi = NPC_KEXOF_DMAC / sizeof(u64);
- entry.kw[kwi] = BIT_ULL(40); /* LSB bit of 1st byte in DMAC */
- entry.kw_mask[kwi] = BIT_ULL(40);
- }
-
- ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc,
- nixlf, NIXLF_UCAST_ENTRY);
-
/* If the corresponding PF's ucast action is RSS,
* use the same action for promisc also
*/
+ ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc,
+ nixlf, NIXLF_UCAST_ENTRY);
if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx))
*(u64 *)&action = npc_get_mcam_action(rvu, mcam,
blkaddr, ucast_idx);
action.pf_func = pcifunc;
}
- entry.action = *(u64 *)&action;
- npc_config_mcam_entry(rvu, mcam, blkaddr, index,
- pfvf->nix_rx_intf, &entry, true);
+ if (allmulti) {
+ mac_addr[0] = 0x01; /* LSB bit of 1st byte in DMAC */
+ ether_addr_copy(req.packet.dmac, mac_addr);
+ ether_addr_copy(req.mask.dmac, mac_addr);
+ req.features = BIT_ULL(NPC_DMAC);
+ }
+
+ req.chan_mask = 0xFFFU;
+ if (chan_cnt > 1) {
+ if (!is_power_of_2(chan_cnt)) {
+ dev_err(rvu->dev,
+ "%s: channel count more than 1, must be power of 2\n", __func__);
+ return;
+ }
+ relaxed_mask = GENMASK_ULL(BITS_PER_LONG_LONG - 1,
+ ilog2(chan_cnt));
+ req.chan_mask &= relaxed_mask;
+ }
+
+ req.channel = chan;
+ req.intf = pfvf->nix_rx_intf;
+ req.entry = index;
+ req.op = action.op;
+ req.hdr.pcifunc = 0; /* AF is requester */
+ req.vf = pcifunc;
+ req.index = action.index;
+ req.match_id = action.match_id;
+ req.flow_key_alg = action.flow_key_alg;
+
+ rvu_mbox_handler_npc_install_flow(rvu, &req, &rsp);
}
static void npc_enadis_promisc_entry(struct rvu *rvu, u16 pcifunc,
void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
int nixlf, u64 chan)
{
+ struct rvu_pfvf *pfvf;
+ struct npc_install_flow_req req = { 0 };
+ struct npc_install_flow_rsp rsp = { 0 };
struct npc_mcam *mcam = &rvu->hw->mcam;
- struct mcam_entry entry = { {0} };
struct rvu_hwinfo *hw = rvu->hw;
- struct nix_rx_action action;
- struct rvu_pfvf *pfvf;
int blkaddr, index;
+ u32 req_index = 0;
+ u8 op;
blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
if (blkaddr < 0)
index = npc_get_nixlf_mcam_index(mcam, pcifunc,
nixlf, NIXLF_BCAST_ENTRY);
- /* Match ingress channel */
- entry.kw[0] = chan;
- entry.kw_mask[0] = 0xfffull;
-
- /* Match broadcast MAC address.
- * DMAC is extracted at 0th bit of PARSE_KEX::KW1
- */
- entry.kw[1] = 0xffffffffffffull;
- entry.kw_mask[1] = 0xffffffffffffull;
-
- *(u64 *)&action = 0x00;
if (!hw->cap.nix_rx_multicast) {
/* Early silicon doesn't support pkt replication,
* so install entry with UCAST action, so that PF
* receives all broadcast packets.
*/
- action.op = NIX_RX_ACTIONOP_UCAST;
- action.pf_func = pcifunc;
+ op = NIX_RX_ACTIONOP_UCAST;
} else {
- action.index = pfvf->bcast_mce_idx;
- action.op = NIX_RX_ACTIONOP_MCAST;
+ op = NIX_RX_ACTIONOP_MCAST;
+ req_index = pfvf->bcast_mce_idx;
}
- entry.action = *(u64 *)&action;
- npc_config_mcam_entry(rvu, mcam, blkaddr, index,
- pfvf->nix_rx_intf, &entry, true);
+ eth_broadcast_addr((u8 *)&req.packet.dmac);
+ eth_broadcast_addr((u8 *)&req.mask.dmac);
+ req.features = BIT_ULL(NPC_DMAC);
+ req.channel = chan;
+ req.intf = pfvf->nix_rx_intf;
+ req.entry = index;
+ req.op = op;
+ req.hdr.pcifunc = 0; /* AF is requester */
+ req.vf = pcifunc;
+ req.index = req_index;
+
+ rvu_mbox_handler_npc_install_flow(rvu, &req, &rsp);
}
void rvu_npc_enable_bcast_entry(struct rvu *rvu, u16 pcifunc, bool enable)
{
struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
struct npc_mcam *mcam = &rvu->hw->mcam;
- struct rvu_npc_mcam_rule *rule;
+ struct rvu_npc_mcam_rule *rule, *tmp;
int blkaddr;
blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
mutex_lock(&mcam->lock);
/* Disable MCAM entries directing traffic to this 'pcifunc' */
- list_for_each_entry(rule, &mcam->mcam_rules, list) {
+ list_for_each_entry_safe(rule, tmp, &mcam->mcam_rules, list) {
if (is_npc_intf_rx(rule->intf) &&
rule->rx_action.pf_func == pcifunc) {
npc_enable_mcam_entry(rvu, mcam, blkaddr,
rule->entry, false);
rule->enable = false;
/* Indicate that default rule is disabled */
- if (rule->default_rule)
+ if (rule->default_rule) {
pfvf->def_ucast_rule = NULL;
+ list_del(&rule->list);
+ kfree(rule);
+ }
}
}
static int npc_mcam_verify_entry(struct npc_mcam *mcam,
u16 pcifunc, int entry)
{
+ /* verify AF installed entries */
+ if (is_pffunc_af(pcifunc))
+ return 0;
/* Verify if entry is valid and if it is indeed
* allocated to the requesting PFFUNC.
*/
goto exit;
}
+ /* For AF installed rules, the nix_intf should be set to target NIX */
+ if (is_pffunc_af(req->hdr.pcifunc))
+ nix_intf = req->intf;
+
npc_config_mcam_entry(rvu, mcam, blkaddr, req->entry, nix_intf,
&req->entry_data, req->enable_entry);
index = find_next_bit(mcam->bmap, mcam->bmap_entries, entry);
if (index >= mcam->bmap_entries)
break;
+ entry = index + 1;
if (mcam->entry2cntr_map[index] != req->cntr)
continue;
- entry = index + 1;
npc_unmap_mcam_entry_and_cntr(rvu, mcam, blkaddr,
index, req->cntr);
}
return 0;
}
-bool rvu_npc_write_default_rule(struct rvu *rvu, int blkaddr, int nixlf,
- u16 pcifunc, u8 intf, struct mcam_entry *entry,
- int *index)
-{
- struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
- struct npc_mcam *mcam = &rvu->hw->mcam;
- bool enable;
- u8 nix_intf;
-
- if (is_npc_intf_tx(intf))
- nix_intf = pfvf->nix_tx_intf;
- else
- nix_intf = pfvf->nix_rx_intf;
-
- *index = npc_get_nixlf_mcam_index(mcam, pcifunc,
- nixlf, NIXLF_UCAST_ENTRY);
- /* dont force enable unicast entry */
- enable = is_mcam_entry_enabled(rvu, mcam, blkaddr, *index);
- npc_config_mcam_entry(rvu, mcam, blkaddr, *index, nix_intf,
- entry, enable);
-
- return enable;
-}
-
int rvu_mbox_handler_npc_read_base_steer_rule(struct rvu *rvu,
struct msg_req *req,
struct npc_mcam_read_base_rule_rsp *rsp)
out:
return rc;
}
+
+int rvu_mbox_handler_npc_mcam_entry_stats(struct rvu *rvu,
+ struct npc_mcam_get_stats_req *req,
+ struct npc_mcam_get_stats_rsp *rsp)
+{
+ struct npc_mcam *mcam = &rvu->hw->mcam;
+ u16 index, cntr;
+ int blkaddr;
+ u64 regval;
+ u32 bank;
+
+ blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
+ if (blkaddr < 0)
+ return NPC_MCAM_INVALID_REQ;
+
+ mutex_lock(&mcam->lock);
+
+ index = req->entry & (mcam->banksize - 1);
+ bank = npc_get_bank(mcam, req->entry);
+
+ /* read MCAM entry STAT_ACT register */
+ regval = rvu_read64(rvu, blkaddr, NPC_AF_MCAMEX_BANKX_STAT_ACT(index, bank));
+
+ if (!(regval & BIT_ULL(9))) {
+ rsp->stat_ena = 0;
+ mutex_unlock(&mcam->lock);
+ return 0;
+ }
+
+ cntr = regval & 0x1FF;
+
+ rsp->stat_ena = 1;
+ rsp->stat = rvu_read64(rvu, blkaddr, NPC_AF_MATCH_STATX(cntr));
+ rsp->stat &= BIT_ULL(48) - 1;
+
+ mutex_unlock(&mcam->lock);
+
+ return 0;
+}
flow_cfg->ntuple_max_flows = rsp->count;
flow_cfg->ntuple_offset = 0;
pfvf->flags |= OTX2_FLAG_NTUPLE_SUPPORT;
+ flow_cfg->tc_max_flows = flow_cfg->ntuple_max_flows;
+ pfvf->flags |= OTX2_FLAG_TC_FLOWER_SUPPORT;
} else {
flow_cfg->vf_vlan_offset = 0;
flow_cfg->ntuple_offset = flow_cfg->vf_vlan_offset +
vf_vlan_max_flows;
+ flow_cfg->tc_flower_offset = flow_cfg->ntuple_offset;
flow_cfg->unicast_offset = flow_cfg->ntuple_offset +
OTX2_MAX_NTUPLE_FLOWS;
flow_cfg->rx_vlan_offset = flow_cfg->unicast_offset +
pfvf->flags |= OTX2_FLAG_UCAST_FLTR_SUPPORT;
pfvf->flags |= OTX2_FLAG_RX_VLAN_SUPPORT;
pfvf->flags |= OTX2_FLAG_VF_VLAN_SUPPORT;
+ pfvf->flags |= OTX2_FLAG_TC_FLOWER_SUPPORT;
}
for (i = 0; i < rsp->count; i++)
INIT_LIST_HEAD(&pf->flow_cfg->flow_list);
pf->flow_cfg->ntuple_max_flows = OTX2_MAX_NTUPLE_FLOWS;
+ pf->flow_cfg->tc_max_flows = pf->flow_cfg->ntuple_max_flows;
err = otx2_alloc_mcam_entries(pf);
if (err)
int otx2_get_all_flows(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
u32 *rule_locs)
{
+ u32 rule_cnt = nfc->rule_cnt;
u32 location = 0;
int idx = 0;
int err = 0;
nfc->data = pfvf->flow_cfg->ntuple_max_flows;
- while ((!err || err == -ENOENT) && idx < nfc->rule_cnt) {
+ while ((!err || err == -ENOENT) && idx < rule_cnt) {
err = otx2_get_flow(pfvf, nfc, location);
if (!err)
rule_locs[idx++] = location;
location++;
}
+ nfc->rule_cnt = rule_cnt;
return err;
}
sizeof(pmask->ip4dst));
req->features |= BIT_ULL(NPC_DIP_IPV4);
}
+ if (ipv4_usr_mask->tos) {
+ pkt->tos = ipv4_usr_hdr->tos;
+ pmask->tos = ipv4_usr_mask->tos;
+ req->features |= BIT_ULL(NPC_TOS);
+ }
+ if (ipv4_usr_mask->proto) {
+ switch (ipv4_usr_hdr->proto) {
+ case IPPROTO_ICMP:
+ req->features |= BIT_ULL(NPC_IPPROTO_ICMP);
+ break;
+ case IPPROTO_TCP:
+ req->features |= BIT_ULL(NPC_IPPROTO_TCP);
+ break;
+ case IPPROTO_UDP:
+ req->features |= BIT_ULL(NPC_IPPROTO_UDP);
+ break;
+ case IPPROTO_SCTP:
+ req->features |= BIT_ULL(NPC_IPPROTO_SCTP);
+ break;
+ case IPPROTO_AH:
+ req->features |= BIT_ULL(NPC_IPPROTO_AH);
+ break;
+ case IPPROTO_ESP:
+ req->features |= BIT_ULL(NPC_IPPROTO_ESP);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
pkt->etype = cpu_to_be16(ETH_P_IP);
pmask->etype = cpu_to_be16(0xFFFF);
req->features |= BIT_ULL(NPC_ETYPE);
sizeof(pmask->ip4dst));
req->features |= BIT_ULL(NPC_DIP_IPV4);
}
+ if (ipv4_l4_mask->tos) {
+ pkt->tos = ipv4_l4_hdr->tos;
+ pmask->tos = ipv4_l4_mask->tos;
+ req->features |= BIT_ULL(NPC_TOS);
+ }
if (ipv4_l4_mask->psrc) {
memcpy(&pkt->sport, &ipv4_l4_hdr->psrc,
sizeof(pkt->sport));
sizeof(pmask->ip4dst));
req->features |= BIT_ULL(NPC_DIP_IPV4);
}
+ if (ah_esp_mask->tos) {
+ pkt->tos = ah_esp_hdr->tos;
+ pmask->tos = ah_esp_mask->tos;
+ req->features |= BIT_ULL(NPC_TOS);
+ }
/* NPC profile doesn't extract AH/ESP header fields */
- if ((ah_esp_mask->spi & ah_esp_hdr->spi) ||
- (ah_esp_mask->tos & ah_esp_mask->tos))
+ if (ah_esp_mask->spi & ah_esp_hdr->spi)
return -EOPNOTSUPP;
if (flow_type == AH_V4_FLOW)
struct otx2_nic *pf = netdev_priv(netdev);
struct otx2_cq_poll *cq_poll = NULL;
struct otx2_qset *qset = &pf->qset;
+ struct otx2_rss_info *rss;
int qidx, vec, wrk;
netif_carrier_off(netdev);
/* First stop packet Rx/Tx */
otx2_rxtx_enable(pf, false);
+ /* Clear RSS enable flag */
+ rss = &pf->hw.rss_info;
+ rss->enable = false;
+
/* Cleanup Queue IRQ */
vec = pci_irq_vector(pf->pdev,
pf->hw.nix_msixoff + NIX_LF_QINT_VEC_START);
return NETDEV_TX_OK;
}
+static netdev_features_t otx2_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ /* check if n-tuple filters are ON */
+ if ((features & NETIF_F_HW_TC) && (dev->features & NETIF_F_NTUPLE)) {
+ netdev_info(dev, "Disabling n-tuple filters\n");
+ features &= ~NETIF_F_NTUPLE;
+ }
+
+ /* check if tc hw offload is ON */
+ if ((features & NETIF_F_NTUPLE) && (dev->features & NETIF_F_HW_TC)) {
+ netdev_info(dev, "Disabling TC hardware offload\n");
+ features &= ~NETIF_F_HW_TC;
+ }
+
+ return features;
+}
+
static void otx2_set_rx_mode(struct net_device *netdev)
{
struct otx2_nic *pf = netdev_priv(netdev);
if ((changed & NETIF_F_NTUPLE) && !ntuple)
otx2_destroy_ntuple_flows(pf);
+ if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) &&
+ pf->tc_info.num_entries) {
+ netdev_err(netdev, "Can't disable TC hardware offload while flows are active\n");
+ return -EBUSY;
+ }
+
return 0;
}
.ndo_open = otx2_open,
.ndo_stop = otx2_stop,
.ndo_start_xmit = otx2_xmit,
+ .ndo_fix_features = otx2_fix_features,
.ndo_set_mac_address = otx2_set_mac_address,
.ndo_change_mtu = otx2_change_mtu,
.ndo_set_rx_mode = otx2_set_rx_mode,
.ndo_set_vf_mac = otx2_set_vf_mac,
.ndo_set_vf_vlan = otx2_set_vf_vlan,
.ndo_get_vf_config = otx2_get_vf_config,
+ .ndo_setup_tc = otx2_setup_tc,
};
static int otx2_wq_init(struct otx2_nic *pf)
NETIF_F_HW_VLAN_STAG_RX;
netdev->features |= netdev->hw_features;
+ /* HW supports tc offload but mutually exclusive with n-tuple filters */
+ if (pf->flags & OTX2_FLAG_TC_FLOWER_SUPPORT)
+ netdev->hw_features |= NETIF_F_HW_TC;
+
netdev->gso_max_segs = OTX2_MAX_GSO_SEGS;
netdev->watchdog_timeo = OTX2_TX_TIMEOUT;
otx2_set_ethtool_ops(netdev);
+ err = otx2_init_tc(pf);
+ if (err)
+ goto err_mcam_flow_del;
+
/* Enable link notifications */
otx2_cgx_config_linkevents(pf, true);
return 0;
+err_mcam_flow_del:
+ otx2_mcam_flow_del(pf);
err_unreg_netdev:
unregister_netdev(netdev);
err_del_mcam_entries:
otx2_ptp_destroy(pf);
otx2_mcam_flow_del(pf);
+ otx2_shutdown_tc(pf);
otx2_detach_resources(&pf->mbox);
if (pf->hw.lmt_base)
iounmap(pf->hw.lmt_base);
MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
#define MLX5_MPWRQ_PAGES_PER_WQE BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
- #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
+ #define MLX5_ALIGN_MTTS(mtts) (ALIGN(mtts, 8))
+ #define MLX5_ALIGNED_MTTS_OCTW(mtts) ((mtts) / 2)
+ #define MLX5_MTT_OCTW(mtts) (MLX5_ALIGNED_MTTS_OCTW(MLX5_ALIGN_MTTS(mtts)))
/* Add another page to MLX5E_REQUIRED_WQE_MTTS as a buffer between
* WQEs, This page will absorb write overflow by the hardware, when
* receiving packets larger than MTU. These oversize packets are
* dropped by the driver at a later stage.
*/
- #define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE + 1, 8))
- #define MLX5E_LOG_ALIGNED_MPWQE_PPW (ilog2(MLX5E_REQUIRED_WQE_MTTS))
+ #define MLX5E_REQUIRED_WQE_MTTS (MLX5_ALIGN_MTTS(MLX5_MPWRQ_PAGES_PER_WQE + 1))
#define MLX5E_REQUIRED_MTTS(wqes) (wqes * MLX5E_REQUIRED_WQE_MTTS)
#define MLX5E_MAX_RQ_NUM_MTTS \
((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
#endif
struct devlink_health_reporter *tx_reporter;
struct devlink_health_reporter *rx_reporter;
- struct devlink_port dl_port;
struct mlx5e_xsk xsk;
#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
struct mlx5e_hv_vhca_stats_agent stats_agent;
void mlx5e_destroy_netdev(struct mlx5e_priv *priv);
int mlx5e_netdev_change_profile(struct mlx5e_priv *priv,
const struct mlx5e_profile *new_profile, void *new_ppriv);
+void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv);
void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv);
void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu);
void mlx5e_build_rq_params(struct mlx5_core_dev *mdev,
zone_rule->nat = nat;
- spec = kzalloc(sizeof(*spec), GFP_KERNEL);
+ spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
if (!spec)
return -ENOMEM;
zone_rule->attr = attr;
- kfree(spec);
+ kvfree(spec);
ct_dbg("Offloaded ct entry rule in zone %d", entry->tuple.zone);
return 0;
err_mod_hdr:
kfree(attr);
err_attr:
- kfree(spec);
+ kvfree(spec);
return err;
}
mlx5e_tc_match_to_reg_get_match(spec, CTSTATE_TO_REG,
&ctstate, &ctstate_mask);
- if (ctstate_mask)
+
+ if ((ctstate & ctstate_mask) == MLX5_CT_STATE_TRK_BIT)
return -EOPNOTSUPP;
ctstate_mask |= MLX5_CT_STATE_TRK_BIT;
mlx5_tc_ct_free_pre_ct(ft, &ft->pre_ct);
}
+/* To avoid false lock dependency warning set the ct_entries_ht lock
+ * class different than the lock class of the ht being used when deleting
+ * last flow from a group and then deleting a group, we get into del_sw_flow_group()
+ * which call rhashtable_destroy on fg->ftes_hash which will take ht->mutex but
+ * it's different than the ht->mutex here.
+ */
+static struct lock_class_key ct_entries_ht_lock_key;
+
static struct mlx5_ct_ft *
mlx5_tc_ct_add_ft_cb(struct mlx5_tc_ct_priv *ct_priv, u16 zone,
struct nf_flowtable *nf_ft)
if (err)
goto err_init;
+ lockdep_set_class(&ft->ct_entries_ht.mutex, &ct_entries_ht_lock_key);
+
err = rhashtable_insert_fast(&ct_priv->zone_ht, &ft->node,
zone_params);
if (err)
struct mlx5_ct_ft *ft;
u32 fte_id = 1;
- post_ct_spec = kzalloc(sizeof(*post_ct_spec), GFP_KERNEL);
+ post_ct_spec = kvzalloc(sizeof(*post_ct_spec), GFP_KERNEL);
ct_flow = kzalloc(sizeof(*ct_flow), GFP_KERNEL);
if (!post_ct_spec || !ct_flow) {
- kfree(post_ct_spec);
+ kvfree(post_ct_spec);
kfree(ct_flow);
return ERR_PTR(-ENOMEM);
}
ct_flow->post_ct_attr->prio = 0;
ct_flow->post_ct_attr->ft = ct_priv->post_ct;
+ /* Splits were handled before CT */
+ if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB)
+ ct_flow->post_ct_attr->esw_attr->split_count = 0;
+
ct_flow->post_ct_attr->inner_match_level = MLX5_MATCH_NONE;
ct_flow->post_ct_attr->outer_match_level = MLX5_MATCH_NONE;
ct_flow->post_ct_attr->action &= ~(MLX5_FLOW_CONTEXT_ACTION_DECAP);
attr->ct_attr.ct_flow = ct_flow;
dealloc_mod_hdr_actions(&pre_mod_acts);
- kfree(post_ct_spec);
+ kvfree(post_ct_spec);
return rule;
err_idr:
mlx5_tc_ct_del_ft_cb(ct_priv, ft);
err_ft:
- kfree(post_ct_spec);
+ kvfree(post_ct_spec);
kfree(ct_flow);
netdev_warn(priv->netdev, "Failed to offload ct flow, err %d\n", err);
return ERR_PTR(err);
/* Copyright (c) 2021 Mellanox Technologies. */
#include <net/fib_notifier.h>
+#include <net/nexthop.h>
#include "tc_tun_encap.h"
#include "en_tc.h"
#include "tc_tun.h"
* required to establish routing.
*/
flow_flag_set(flow, TUN_RX);
+ flow->attr->tun_ip_version = ip_version;
return 0;
}
if (err || !esw_attr->rx_tun_attr->decap_vport)
goto out;
- key.ip_version = attr->ip_version;
+ key.ip_version = attr->tun_ip_version;
if (key.ip_version == 4)
key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4;
else
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
MLX5_SET(mkc, mkc, qpn, 0xffffff);
- MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn);
+ MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
MLX5_SET64(mkc, mkc, len, npages << page_shift);
MLX5_SET(mkc, mkc, translations_octword_size,
MLX5_MTT_OCTW(npages));
rq->wqe_overflow.addr);
}
- static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
+ static u64 mlx5e_get_mpwqe_offset(u16 wqe_ix)
{
- return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
+ return MLX5E_REQUIRED_MTTS(wqe_ix) << PAGE_SHIFT;
}
static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
u32 byte_count =
rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
- u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
+ u64 dma_offset = mlx5e_get_mpwqe_offset(i);
wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
wqe->data[0].byte_count = cpu_to_be32(byte_count);
sq->pdev = c->pdev;
sq->mkey_be = c->mkey_be;
sq->channel = c;
- sq->uar_map = mdev->mlx5e_res.bfreg.map;
+ sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map;
sq->min_inline_mode = params->tx_min_inline_mode;
sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
sq->xsk_pool = xsk_pool;
int err;
sq->channel = c;
- sq->uar_map = mdev->mlx5e_res.bfreg.map;
+ sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map;
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl);
sq->priv = c->priv;
sq->ch_ix = c->ix;
sq->txq_ix = txq_ix;
- sq->uar_map = mdev->mlx5e_res.bfreg.map;
+ sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map;
sq->min_inline_mode = params->tx_min_inline_mode;
sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
MLX5_SET(sqc, sqc, flush_in_error_en, 1);
MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
- MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.bfreg.index);
+ MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index);
MLX5_SET(wq, wq, log_wq_pg_sz, csp->wq_ctrl->buf.page_shift -
MLX5_ADAPTER_PAGE_SHIFT);
MLX5_SET64(wq, wq, dbr_addr, csp->wq_ctrl->db.dma);
c->cpu = cpu;
c->pdev = mlx5_core_dma_dev(priv->mdev);
c->netdev = priv->netdev;
- c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+ c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
c->num_tc = params->num_tc;
c->xdp = !!params->xdp_prog;
c->stats = &priv->channel_stats[ix].ch;
MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
MLX5_SET(wq, wq, log_wq_stride,
mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
- MLX5_SET(wq, wq, pd, mdev->mlx5e_res.pdn);
+ MLX5_SET(wq, wq, pd, mdev->mlx5e_res.hw_objs.pdn);
MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
MLX5_SET(rqc, rqc, vsd, params->vlan_strip_disable);
MLX5_SET(rqc, rqc, scatter_fcs, params->scatter_fcs_en);
void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
- MLX5_SET(wq, wq, pd, priv->mdev->mlx5e_res.pdn);
+ MLX5_SET(wq, wq, pd, priv->mdev->mlx5e_res.hw_objs.pdn);
param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev));
}
{
switch (params->rq_wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
- return order_base_2(MLX5E_UMR_WQEBBS) +
- mlx5e_get_rq_log_wq_sz(rqp->rqc);
+ return max_t(u8, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE,
+ order_base_2(MLX5E_UMR_WQEBBS) +
+ mlx5e_get_rq_log_wq_sz(rqp->rqc));
default: /* MLX5_WQ_TYPE_CYCLIC */
return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
}
{
int i;
- if (chs->port_ptp)
+ if (chs->port_ptp) {
mlx5e_port_ptp_close(chs->port_ptp);
+ chs->port_ptp = NULL;
+ }
for (i = 0; i < chs->num; i++)
mlx5e_close_channel(chs->c[i]);
{
void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
- MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn);
+ MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
if (MLX5_GET(tisc, tisc, tls_en))
- MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.pdn);
+ MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.hw_objs.pdn);
if (mlx5_lag_is_lacp_owner(mdev))
MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
static void mlx5e_build_indir_tir_ctx_common(struct mlx5e_priv *priv,
u32 rqtn, u32 *tirc)
{
- MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
+ MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.hw_objs.td.tdn);
MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
MLX5_SET(tirc, tirc, indirect_table, rqtn);
MLX5_SET(tirc, tirc, tunneled_offload_en,
void *type_data)
{
struct mlx5e_priv *priv = netdev_priv(dev);
+ bool tc_unbind = false;
int err;
+ if (type == TC_SETUP_BLOCK &&
+ ((struct flow_block_offload *)type_data)->command == FLOW_BLOCK_UNBIND)
+ tc_unbind = true;
+
+ if (!netif_device_present(dev) && !tc_unbind)
+ return -ENODEV;
+
switch (type) {
case TC_SETUP_BLOCK: {
struct flow_block_offload *f = type_data;
for (j = 0; j < priv->max_opened_tc; j++) {
struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
+ s->tx_packets += sq_stats->packets;
+ s->tx_bytes += sq_stats->bytes;
+ s->tx_dropped += sq_stats->dropped;
+ }
+ }
+ if (priv->port_ptp_opened) {
+ for (i = 0; i < priv->max_opened_tc; i++) {
+ struct mlx5e_sq_stats *sq_stats = &priv->port_ptp_stats.sq[i];
+
s->tx_packets += sq_stats->packets;
s->tx_bytes += sq_stats->bytes;
s->tx_dropped += sq_stats->dropped;
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+ if (!netif_device_present(dev))
+ return;
+
/* In switchdev mode, monitor counters doesn't monitor
* rx/tx stats of 802_3. The update stats mechanism
* should keep the 802_3 layout counters updated
}
if (mlx5e_is_uplink_rep(priv)) {
+ struct mlx5e_vport_stats *vstats = &priv->stats.vport;
+
stats->rx_packets = PPORT_802_3_GET(pstats, a_frames_received_ok);
stats->rx_bytes = PPORT_802_3_GET(pstats, a_octets_received_ok);
stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok);
stats->tx_bytes = PPORT_802_3_GET(pstats, a_octets_transmitted_ok);
+
+ /* vport multicast also counts packets that are dropped due to steering
+ * or rx out of buffer
+ */
+ stats->multicast = VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
} else {
mlx5e_fold_sw_stats64(priv, stats);
}
stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors;
}
+static void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv)
+{
+ if (mlx5e_is_uplink_rep(priv))
+ return; /* no rx mode for uplink rep */
+
+ queue_work(priv->wq, &priv->set_rx_mode_work);
+}
+
static void mlx5e_set_rx_mode(struct net_device *dev)
{
struct mlx5e_priv *priv = netdev_priv(dev);
- queue_work(priv->wq, &priv->set_rx_mode_work);
+ mlx5e_nic_set_rx_mode(priv);
}
static int mlx5e_set_mac(struct net_device *netdev, void *addr)
ether_addr_copy(netdev->dev_addr, saddr->sa_data);
netif_addr_unlock_bh(netdev);
- queue_work(priv->wq, &priv->set_rx_mode_work);
+ mlx5e_nic_set_rx_mode(priv);
return 0;
}
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5_core_dev *mdev = priv->mdev;
+ if (mlx5e_is_uplink_rep(priv))
+ return -EOPNOTSUPP;
+
return mlx5_eswitch_set_vport_state(mdev->priv.eswitch, vf + 1,
mlx5_ifla_link2vport(link_state));
}
struct mlx5_core_dev *mdev = priv->mdev;
int err;
+ if (!netif_device_present(dev))
+ return -EOPNOTSUPP;
+
err = mlx5_eswitch_get_vport_config(mdev->priv.eswitch, vf + 1, ivi);
if (err)
return err;
return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1,
vf_stats);
}
+
+static bool
+mlx5e_has_offload_stats(const struct net_device *dev, int attr_id)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+
+ if (!netif_device_present(dev))
+ return false;
+
+ if (!mlx5e_is_uplink_rep(priv))
+ return false;
+
+ return mlx5e_rep_has_offload_stats(dev, attr_id);
+}
+
+static int
+mlx5e_get_offload_stats(int attr_id, const struct net_device *dev,
+ void *sp)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+
+ if (!mlx5e_is_uplink_rep(priv))
+ return -EOPNOTSUPP;
+
+ return mlx5e_rep_get_offload_stats(attr_id, dev, sp);
+}
#endif
static bool mlx5e_tunnel_proto_supported_tx(struct mlx5_core_dev *mdev, u8 proto_type)
struct mlx5e_channel *c = priv->channels.c[i];
mlx5e_rq_replace_xdp_prog(&c->rq, prog);
- if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+ if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) {
+ bpf_prog_inc(prog);
mlx5e_rq_replace_xdp_prog(&c->xskrq, prog);
+ }
}
unlock:
.ndo_get_vf_config = mlx5e_get_vf_config,
.ndo_set_vf_link_state = mlx5e_set_vf_link_state,
.ndo_get_vf_stats = mlx5e_get_vf_stats,
+ .ndo_has_offload_stats = mlx5e_has_offload_stats,
+ .ndo_get_offload_stats = mlx5e_get_offload_stats,
#endif
.ndo_get_devlink_port = mlx5e_get_devlink_port,
};
priv->max_nch);
params->num_tc = 1;
+ /* Set an initial non-zero value, so that mlx5e_select_queue won't
+ * divide by zero if called before first activating channels.
+ */
+ priv->num_tc_x_num_ch = params->num_channels * params->num_tc;
+
/* SQ */
params->log_sq_size = is_kdump_kernel() ?
MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
if (err)
mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
- err = mlx5e_devlink_port_register(priv);
- if (err)
- mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err);
-
mlx5e_health_create_reporters(priv);
return 0;
static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
{
mlx5e_health_destroy_reporters(priv);
- mlx5e_devlink_port_unregister(priv);
mlx5e_tls_cleanup(priv);
mlx5e_ipsec_cleanup(priv);
}
return;
mlx5e_dcbnl_init_app(priv);
- queue_work(priv->wq, &priv->set_rx_mode_work);
+ mlx5e_nic_set_rx_mode(priv);
rtnl_lock();
if (netif_running(netdev))
netif_device_detach(priv->netdev);
rtnl_unlock();
- queue_work(priv->wq, &priv->set_rx_mode_work);
+ mlx5e_nic_set_rx_mode(priv);
mlx5e_hv_vhca_stats_destroy(priv);
if (mlx5e_monitor_counter_supported(priv))
struct net_device *netdev,
struct mlx5_core_dev *mdev)
{
- memset(priv, 0, sizeof(*priv));
-
/* priv init */
priv->mdev = mdev;
priv->netdev = netdev;
{
int i;
+ /* bail if change profile failed and also rollback failed */
+ if (!priv->mdev)
+ return;
+
destroy_workqueue(priv->wq);
free_cpumask_var(priv->scratchpad.cpumask);
for (i = 0; i < priv->htb.max_qos_sqs; i++)
kfree(priv->htb.qos_sq_stats[i]);
kvfree(priv->htb.qos_sq_stats);
+
+ memset(priv, 0, sizeof(*priv));
}
struct net_device *
}
static int
- mlx5e_netdev_attach_profile(struct mlx5e_priv *priv,
+ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev,
const struct mlx5e_profile *new_profile, void *new_ppriv)
{
- struct net_device *netdev = priv->netdev;
- struct mlx5_core_dev *mdev = priv->mdev;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
int err;
err = mlx5e_priv_init(priv, netdev, mdev);
priv->ppriv = new_ppriv;
err = new_profile->init(priv->mdev, priv->netdev);
if (err)
- return err;
+ goto priv_cleanup;
err = mlx5e_attach_netdev(priv);
if (err)
- new_profile->cleanup(priv);
+ goto profile_cleanup;
+ return err;
+
+ profile_cleanup:
+ new_profile->cleanup(priv);
+ priv_cleanup:
+ mlx5e_priv_cleanup(priv);
return err;
}
{
unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile);
const struct mlx5e_profile *orig_profile = priv->profile;
+ struct net_device *netdev = priv->netdev;
+ struct mlx5_core_dev *mdev = priv->mdev;
void *orig_ppriv = priv->ppriv;
int err, rollback_err;
/* sanity */
if (new_max_nch != priv->max_nch) {
- netdev_warn(priv->netdev,
- "%s: Replacing profile with different max channels\n",
+ netdev_warn(netdev, "%s: Replacing profile with different max channels\n",
__func__);
return -EINVAL;
}
priv->profile->cleanup(priv);
mlx5e_priv_cleanup(priv);
- err = mlx5e_netdev_attach_profile(priv, new_profile, new_ppriv);
+ err = mlx5e_netdev_attach_profile(netdev, mdev, new_profile, new_ppriv);
if (err) { /* roll back to original profile */
- netdev_warn(priv->netdev, "%s: new profile init failed, %d\n",
- __func__, err);
+ netdev_warn(netdev, "%s: new profile init failed, %d\n", __func__, err);
goto rollback;
}
return 0;
rollback:
- rollback_err = mlx5e_netdev_attach_profile(priv, orig_profile, orig_ppriv);
- if (rollback_err) {
- netdev_err(priv->netdev,
- "%s: failed to rollback to orig profile, %d\n",
+ rollback_err = mlx5e_netdev_attach_profile(netdev, mdev, orig_profile, orig_ppriv);
+ if (rollback_err)
+ netdev_err(netdev, "%s: failed to rollback to orig profile, %d\n",
__func__, rollback_err);
- }
return err;
}
+void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv)
+{
+ mlx5e_netdev_change_profile(priv, &mlx5e_nic_profile, NULL);
+}
+
void mlx5e_destroy_netdev(struct mlx5e_priv *priv)
{
struct net_device *netdev = priv->netdev;
priv->profile = profile;
priv->ppriv = NULL;
+
+ err = mlx5e_devlink_port_register(priv);
+ if (err) {
+ mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err);
+ goto err_destroy_netdev;
+ }
+
err = profile->init(mdev, netdev);
if (err) {
mlx5_core_err(mdev, "mlx5e_nic_profile init failed, %d\n", err);
- goto err_destroy_netdev;
+ goto err_devlink_cleanup;
}
err = mlx5e_resume(adev);
mlx5e_devlink_port_type_eth_set(priv);
mlx5e_dcbnl_init_app(priv);
+ mlx5_uplink_netdev_set(mdev, netdev);
return 0;
err_resume:
mlx5e_suspend(adev, state);
err_profile_cleanup:
profile->cleanup(priv);
+err_devlink_cleanup:
+ mlx5e_devlink_port_unregister(priv);
err_destroy_netdev:
mlx5e_destroy_netdev(priv);
return err;
unregister_netdev(priv->netdev);
mlx5e_suspend(adev, state);
priv->profile->cleanup(priv);
+ mlx5e_devlink_port_unregister(priv);
mlx5e_destroy_netdev(priv);
}
mlx5e_ipsec_build_inverse_table();
mlx5e_build_ptys2ethtool_map();
- ret = mlx5e_rep_init();
+ ret = auxiliary_driver_register(&mlx5e_driver);
if (ret)
return ret;
- ret = auxiliary_driver_register(&mlx5e_driver);
+ ret = mlx5e_rep_init();
if (ret)
- mlx5e_rep_cleanup();
+ auxiliary_driver_unregister(&mlx5e_driver);
return ret;
}
void mlx5e_cleanup(void)
{
- auxiliary_driver_unregister(&mlx5e_driver);
mlx5e_rep_cleanup();
+ auxiliary_driver_unregister(&mlx5e_driver);
}
#include "en/health.h"
#include "en/params.h"
#include "devlink.h"
+#include "en/devlink.h"
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
struct mlx5e_icosq *sq = rq->icosq;
struct mlx5_wq_cyc *wq = &sq->wq;
struct mlx5e_umr_wqe *umr_wqe;
- u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
u16 pi;
int err;
int i;
umr_wqe->ctrl.opmod_idx_opcode =
cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
MLX5_OPCODE_UMR);
- umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);
+ umr_wqe->uctrl.xlt_offset =
+ cpu_to_be16(MLX5_ALIGNED_MTTS_OCTW(MLX5E_REQUIRED_MTTS(ix)));
sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
.wqe_type = MLX5E_ICOSQ_WQE_UMR_RX,
get_cqe_opcode(cqe));
mlx5e_dump_error_cqe(&sq->cq, sq->sqn,
(struct mlx5_err_cqe *)cqe);
+ mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs);
if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
queue_work(cq->priv->wq, &sq->recover_work);
break;
struct mlx5e_priv *priv = netdev_priv(rq->netdev);
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
struct mlx5e_wqe_frag_info *wi;
+ struct devlink_port *dl_port;
struct sk_buff *skb;
u32 cqe_bcnt;
u16 trap_id;
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
skb_push(skb, ETH_HLEN);
- mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port);
+ dl_port = mlx5e_devlink_get_dl_port(priv);
+ mlx5_devlink_trap_report(rq->mdev, trap_id, skb, dl_port);
dev_kfree_skb_any(skb);
free_wqe:
mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
}
-static void mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc)
+static int mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc)
{
- u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE], rqn;
+ u32 *indirection_rqt, rqn;
struct mlx5e_priv *priv = hp->func_priv;
int i, ix, sz = MLX5E_INDIR_RQT_SIZE;
+ indirection_rqt = kzalloc(sz, GFP_KERNEL);
+ if (!indirection_rqt)
+ return -ENOMEM;
+
mlx5e_build_default_indir_rqt(indirection_rqt, sz,
hp->num_channels);
rqn = hp->pair->rqn[ix];
MLX5_SET(rqtc, rqtc, rq_num[i], rqn);
}
+
+ kfree(indirection_rqt);
+ return 0;
}
static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp)
MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
- mlx5e_hairpin_fill_rqt_rqns(hp, rqtc);
+ err = mlx5e_hairpin_fill_rqt_rqns(hp, rqtc);
+ if (err)
+ goto out;
err = mlx5_core_create_rqt(mdev, in, inlen, &hp->indir_rqt.rqtn);
if (!err)
hp->indir_rqt.enabled = true;
+out:
kvfree(in);
return err;
}
if (flow_flag_test(flow, CT)) {
mod_hdr_acts = &attr->parse_attr->mod_hdr_acts;
- return mlx5_tc_ct_flow_offload(get_ct_priv(flow->priv),
+ rule = mlx5_tc_ct_flow_offload(get_ct_priv(flow->priv),
flow, spec, attr,
mod_hdr_acts);
+ } else {
+ rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
}
- rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
if (IS_ERR(rule))
return rule;
if (attr->esw_attr->split_count) {
flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, spec, attr);
if (IS_ERR(flow->rule[1])) {
- mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+ if (flow_flag_test(flow, CT))
+ mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), flow, attr);
+ else
+ mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
return flow->rule[1];
}
}
misc_parameters);
void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
misc_parameters);
+ void *misc_c_3 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+ misc_parameters_3);
+ void *misc_v_3 = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+ misc_parameters_3);
struct flow_rule *rule = flow_cls_offload_flow_rule(f);
struct flow_dissector *dissector = rule->match.dissector;
u16 addr_type = 0;
BIT(FLOW_DISSECTOR_KEY_CT) |
BIT(FLOW_DISSECTOR_KEY_ENC_IP) |
BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) |
+ BIT(FLOW_DISSECTOR_KEY_ICMP) |
BIT(FLOW_DISSECTOR_KEY_MPLS))) {
NL_SET_ERR_MSG_MOD(extack, "Unsupported key");
netdev_dbg(priv->netdev, "Unsupported key used: 0x%x\n",
if (match.mask->flags)
*match_level = MLX5_MATCH_L4;
}
+ if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP)) {
+ struct flow_match_icmp match;
+ flow_rule_match_icmp(rule, &match);
+ switch (ip_proto) {
+ case IPPROTO_ICMP:
+ if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
+ MLX5_FLEX_PROTO_ICMP))
+ return -EOPNOTSUPP;
+ MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_type,
+ match.mask->type);
+ MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_type,
+ match.key->type);
+ MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_code,
+ match.mask->code);
+ MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_code,
+ match.key->code);
+ break;
+ case IPPROTO_ICMPV6:
+ if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
+ MLX5_FLEX_PROTO_ICMPV6))
+ return -EOPNOTSUPP;
+ MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_type,
+ match.mask->type);
+ MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_type,
+ match.key->type);
+ MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_code,
+ match.mask->code);
+ MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_code,
+ match.key->code);
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack,
+ "Code and type matching only with ICMP and ICMPv6");
+ netdev_err(priv->netdev,
+ "Code and type matching only with ICMP and ICMPv6\n");
+ return -EINVAL;
+ }
+ if (match.mask->code || match.mask->type) {
+ *match_level = MLX5_MATCH_L4;
+ spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3;
+ }
+ }
+ /* Currenlty supported only for MPLS over UDP */
+ if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) &&
+ !netif_is_bareudp(filter_dev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Matching on MPLS is supported only for MPLS over UDP");
+ netdev_err(priv->netdev,
+ "Matching on MPLS is supported only for MPLS over UDP\n");
+ return -EOPNOTSUPP;
+ }
+
return 0;
}
return 0;
}
+ static bool modify_tuple_supported(bool modify_tuple, bool ct_clear,
+ bool ct_flow, struct netlink_ext_ack *extack,
+ struct mlx5e_priv *priv,
+ struct mlx5_flow_spec *spec)
+ {
+ if (!modify_tuple || ct_clear)
+ return true;
+
+ if (ct_flow) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "can't offload tuple modification with non-clear ct()");
+ netdev_info(priv->netdev,
+ "can't offload tuple modification with non-clear ct()");
+ return false;
+ }
+
+ /* Add ct_state=-trk match so it will be offloaded for non ct flows
+ * (or after clear action), as otherwise, since the tuple is changed,
+ * we can't restore ct state
+ */
+ if (mlx5_tc_ct_add_no_trk_match(spec)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "can't offload tuple modification with ct matches and no ct(clear) action");
+ netdev_info(priv->netdev,
+ "can't offload tuple modification with ct matches and no ct(clear) action");
+ return false;
+ }
+
+ return true;
+ }
+
static bool modify_header_match_supported(struct mlx5e_priv *priv,
struct mlx5_flow_spec *spec,
struct flow_action *flow_action,
return err;
}
- /* Add ct_state=-trk match so it will be offloaded for non ct flows
- * (or after clear action), as otherwise, since the tuple is changed,
- * we can't restore ct state
- */
- if (!ct_clear && modify_tuple &&
- mlx5_tc_ct_add_no_trk_match(spec)) {
- NL_SET_ERR_MSG_MOD(extack,
- "can't offload tuple modify header with ct matches");
- netdev_info(priv->netdev,
- "can't offload tuple modify header with ct matches");
+ if (!modify_tuple_supported(modify_tuple, ct_clear, ct_flow, extack,
+ priv, spec))
return false;
- }
ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol);
if (modify_ip_header && ip_proto != IPPROTO_TCP &&
actions = flow->attr->action;
if (mlx5e_is_eswitch_flow(flow)) {
- if (flow->attr->esw_attr->split_count && ct_flow) {
+ if (flow->attr->esw_attr->split_count && ct_flow &&
+ !MLX5_CAP_GEN(flow->attr->esw_attr->in_mdev, reg_c_preserve)) {
/* All registers used by ct are cleared when using
* split rules.
*/
return err;
flow_flag_set(flow, CT);
+ esw_attr->split_count = esw_attr->out_count;
break;
default:
NL_SET_ERR_MSG_MOD(extack, "The offload action is not supported");
return -EOPNOTSUPP;
}
- if (attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
- NL_SET_ERR_MSG_MOD(extack,
- "Mirroring goto chain rules isn't supported");
- return -EOPNOTSUPP;
- }
attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
}
struct mlx5e_tc_flow *flow;
int err = 0;
+ if (!mlx5_esw_hold(priv->mdev))
+ return -EAGAIN;
+
+ mlx5_esw_get(priv->mdev);
+
rcu_read_lock();
flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params);
if (flow) {
if (err)
goto err_free;
+ mlx5_esw_release(priv->mdev);
return 0;
err_free:
mlx5e_flow_put(priv, flow);
out:
+ mlx5_esw_put(priv->mdev);
+ mlx5_esw_release(priv->mdev);
return err;
}
trace_mlx5e_delete_flower(f);
mlx5e_flow_put(priv, flow);
+ mlx5_esw_put(priv->mdev);
return 0;
errout:
*/
if (rate) {
rate = (rate * BITS_PER_BYTE) + 500000;
- rate_mbps = max_t(u64, do_div(rate, 1000000), 1);
+ do_div(rate, 1000000);
+ rate_mbps = max_t(u32, rate, 1);
}
err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps);
flow_action_for_each(i, act, flow_action) {
switch (act->id) {
case FLOW_ACTION_POLICE:
+ if (act->police.rate_pkt_ps) {
+ NL_SET_ERR_MSG_MOD(extack, "QoS offload not support packets per second");
+ return -EOPNOTSUPP;
+ }
err = apply_police_params(priv, act->police.rate_bytes_ps, extack);
if (err)
return err;
tc->ct = mlx5_tc_ct_init(priv, tc->chains, &priv->fs.tc.mod_hdr,
MLX5_FLOW_NAMESPACE_KERNEL);
- if (IS_ERR(tc->ct)) {
- err = PTR_ERR(tc->ct);
- goto err_ct;
- }
tc->netdevice_nb.notifier_call = mlx5e_tc_netdev_event;
err = register_netdevice_notifier_dev_net(priv->netdev,
err_reg:
mlx5_tc_ct_clean(tc->ct);
-err_ct:
mlx5_chains_destroy(tc->chains);
err_chains:
rhashtable_destroy(&tc->ht);
esw_chains(esw),
&esw->offloads.mod_hdr,
MLX5_FLOW_NAMESPACE_FDB);
- if (IS_ERR(uplink_priv->ct_priv))
- goto err_ct;
mapping = mapping_create(sizeof(struct tunnel_match_key),
TUNNEL_INFO_BITS_MASK, true);
mapping_destroy(uplink_priv->tunnel_mapping);
err_tun_mapping:
mlx5_tc_ct_clean(uplink_priv->ct_priv);
-err_ct:
netdev_warn(priv->netdev,
"Failed to initialize tc (eswitch), err: %d", err);
return err;
int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
void *cb_priv)
{
- unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD);
+ unsigned long flags = MLX5_TC_FLAG(INGRESS);
struct mlx5e_priv *priv = cb_priv;
+ if (!priv->netdev || !netif_device_present(priv->netdev))
+ return -EOPNOTSUPP;
+
+ if (mlx5e_is_uplink_rep(priv))
+ flags |= MLX5_TC_FLAG(ESW_OFFLOAD);
+ else
+ flags |= MLX5_TC_FLAG(NIC_OFFLOAD);
+
switch (type) {
case TC_SETUP_CLSFLOWER:
return mlx5e_setup_tc_cls_flower(priv, type_data, flags);
#include "eswitch.h"
#include "esw/indir_table.h"
#include "esw/acl/ofld.h"
-#include "esw/indir_table.h"
#include "rdma.h"
#include "en.h"
#include "fs_core.h"
if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) &&
MLX5_CAP_GEN(esw_attr->in_mdev, reg_c_preserve) &&
- mlx5_eswitch_vport_match_metadata_enabled(esw))
+ mlx5_eswitch_vport_match_metadata_enabled(esw) &&
+ MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level))
attr->flags |= MLX5_ESW_ATTR_FLAG_SRC_REWRITE;
if (attr->dest_ft) {
if (!mlx5_eswitch_reg_c1_loopback_supported(esw))
return ERR_PTR(-EOPNOTSUPP);
- spec = kzalloc(sizeof(*spec), GFP_KERNEL);
+ spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
if (!spec)
return ERR_PTR(-ENOMEM);
dest.ft = esw->offloads.ft_offloads;
flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
- kfree(spec);
+ kvfree(spec);
if (IS_ERR(flow_rule))
esw_warn(esw->dev,
/* Holds true only as long as DMFS is the default */
mlx5_flow_namespace_set_mode(esw->fdb_table.offloads.ns,
MLX5_FLOW_STEERING_MODE_DMFS);
+ atomic64_set(&esw->user_count, 0);
}
static int esw_create_offloads_table(struct mlx5_eswitch *esw)
if (esw->mode != MLX5_ESWITCH_OFFLOADS)
return 0;
- err = mlx5_esw_offloads_devlink_port_register(esw, vport_num);
- if (err)
- return err;
+ if (vport_num != MLX5_VPORT_UPLINK) {
+ err = mlx5_esw_offloads_devlink_port_register(esw, vport_num);
+ if (err)
+ return err;
+ }
err = mlx5_esw_offloads_rep_load(esw, vport_num);
if (err)
return err;
load_err:
- mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
+ if (vport_num != MLX5_VPORT_UPLINK)
+ mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
return err;
}
return;
mlx5_esw_offloads_rep_unload(esw, vport_num);
- mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
+
+ if (vport_num != MLX5_VPORT_UPLINK)
+ mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
}
#define ESW_OFFLOADS_DEVCOM_PAIR (0)
struct mlx5_vport *vport;
vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK);
+ if (IS_ERR(vport))
+ return PTR_ERR(vport);
+
return esw_vport_create_offloads_acl_tables(esw, vport);
}
struct mlx5_vport *vport;
vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK);
+ if (IS_ERR(vport))
+ return;
+
esw_vport_destroy_offloads_acl_tables(esw, vport);
}
memset(&esw->fdb_table.offloads, 0, sizeof(struct offloads_fdb));
mutex_init(&esw->fdb_table.offloads.vports.lock);
hash_init(esw->fdb_table.offloads.vports.table);
+ atomic64_set(&esw->user_count, 0);
indir = mlx5_esw_indir_table_init();
if (IS_ERR(indir)) {
if (esw_mode_from_devlink(mode, &mlx5_mode))
return -EINVAL;
- mutex_lock(&esw->mode_lock);
- cur_mlx5_mode = esw->mode;
+ err = mlx5_esw_try_lock(esw);
+ if (err < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't change mode, E-Switch is busy");
+ return err;
+ }
+ cur_mlx5_mode = err;
+ err = 0;
+
if (cur_mlx5_mode == mlx5_mode)
goto unlock;
err = -EINVAL;
unlock:
- mutex_unlock(&esw->mode_lock);
+ mlx5_esw_unlock(esw);
return err;
}
if (IS_ERR(esw))
return PTR_ERR(esw);
- mutex_lock(&esw->mode_lock);
+ down_write(&esw->mode_lock);
err = eswitch_devlink_esw_mode_check(esw);
if (err)
goto unlock;
err = esw_mode_to_devlink(esw->mode, mode);
unlock:
- mutex_unlock(&esw->mode_lock);
+ up_write(&esw->mode_lock);
return err;
}
if (IS_ERR(esw))
return PTR_ERR(esw);
- mutex_lock(&esw->mode_lock);
+ down_write(&esw->mode_lock);
err = eswitch_devlink_esw_mode_check(esw);
if (err)
goto out;
}
esw->offloads.inline_mode = mlx5_mode;
- mutex_unlock(&esw->mode_lock);
+ up_write(&esw->mode_lock);
return 0;
revert_inline_mode:
vport,
esw->offloads.inline_mode);
out:
- mutex_unlock(&esw->mode_lock);
+ up_write(&esw->mode_lock);
return err;
}
if (IS_ERR(esw))
return PTR_ERR(esw);
- mutex_lock(&esw->mode_lock);
+ down_write(&esw->mode_lock);
err = eswitch_devlink_esw_mode_check(esw);
if (err)
goto unlock;
err = esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
unlock:
- mutex_unlock(&esw->mode_lock);
+ up_write(&esw->mode_lock);
return err;
}
if (IS_ERR(esw))
return PTR_ERR(esw);
- mutex_lock(&esw->mode_lock);
+ down_write(&esw->mode_lock);
err = eswitch_devlink_esw_mode_check(esw);
if (err)
goto unlock;
}
unlock:
- mutex_unlock(&esw->mode_lock);
+ up_write(&esw->mode_lock);
return err;
}
return PTR_ERR(esw);
- mutex_lock(&esw->mode_lock);
+ down_write(&esw->mode_lock);
err = eswitch_devlink_esw_mode_check(esw);
if (err)
goto unlock;
*encap = esw->offloads.encap;
unlock:
- mutex_unlock(&esw->mode_lock);
+ up_write(&esw->mode_lock);
return 0;
}
}
qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+ MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(priv->mdev));
MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD);
MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
MLX5_SET(qpc, qpc, ulp_stateless_offload_mode,
static void mlx5_rdma_netdev_free(struct net_device *netdev)
{
struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+ struct mlx5_core_dev *mdev = priv->mdev;
struct mlx5i_priv *ipriv = priv->ppriv;
const struct mlx5e_profile *profile = priv->profile;
if (!ipriv->sub_interface) {
mlx5i_pkey_qpn_ht_cleanup(netdev);
- mlx5e_destroy_mdev_resources(priv->mdev);
+ mlx5e_destroy_mdev_resources(mdev);
}
}
static bool mlx5_is_sub_interface(struct mlx5_core_dev *mdev)
{
- return mdev->mlx5e_res.pdn != 0;
+ return mdev->mlx5e_res.hw_objs.pdn != 0;
}
static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev)
#include "priv.h"
#include "sf.h"
#include "mlx5_ifc_vhca_event.h"
+#include "ecpf.h"
+ #include "vhca_event.h"
+ #include "mlx5_core.h"
struct mlx5_sf_hw {
u32 usr_sfnum;
struct mlx5_core_dev *dev;
struct mlx5_sf_hw *sfs;
int max_local_functions;
- u8 ecpu: 1;
struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. */
struct notifier_block vhca_nb;
};
}
if (sw_id == -ENOSPC) {
err = -ENOSPC;
- goto err;
+ goto exist_err;
}
hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sw_id);
if (err)
goto err;
- err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, table->ecpu, usr_sfnum);
+ err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, usr_sfnum);
if (err)
goto vhca_err;
hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id);
mutex_lock(&table->table_lock);
- err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, table->ecpu, out, sizeof(out));
+ err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, out, sizeof(out));
if (err)
goto err;
state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state);
table->dev = dev;
table->sfs = sfs;
table->max_local_functions = max_functions;
- table->ecpu = mlx5_read_embedded_cpu(dev);
dev->priv.sf_hw_table = table;
mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions);
return 0;
static u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p)
{
u64 index =
- (MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) |
- MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32) << 26);
+ ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) |
+ ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32)) << 26);
return index << 6;
}
dr_ste_v1_set_reparse(hw_ste_p);
}
-static void dr_ste_v1_set_rx_decap_l3(u8 *hw_ste_p,
- u8 *s_action,
- u16 decap_actions,
- u32 decap_index)
-{
- MLX5_SET(ste_single_action_modify_list_v1, s_action, action_id,
- DR_STE_V1_ACTION_ID_MODIFY_LIST);
- MLX5_SET(ste_single_action_modify_list_v1, s_action, num_of_modify_actions,
- decap_actions);
- MLX5_SET(ste_single_action_modify_list_v1, s_action, modify_actions_ptr,
- decap_index);
-
- dr_ste_v1_set_reparse(hw_ste_p);
-}
-
static void dr_ste_v1_set_rewrite_actions(u8 *hw_ste_p,
u8 *s_action,
u16 num_of_actions,
bool allow_ctr = true;
if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) {
- dr_ste_v1_set_rx_decap_l3(last_ste, action,
- attr->decap_actions,
- attr->decap_index);
dr_ste_v1_set_rewrite_actions(last_ste, action,
attr->decap_actions,
attr->decap_index);
DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_gvmi, misc_mask, source_port);
DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_qp, misc_mask, source_sqn);
+ misc_mask->source_eswitch_owner_vhca_id = 0;
}
static int dr_ste_v1_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value,
#include "ionic_lif.h"
#include "ionic_txrx.h"
-static void ionic_rx_clean(struct ionic_queue *q,
- struct ionic_desc_info *desc_info,
- struct ionic_cq_info *cq_info,
- void *cb_arg);
-
-static bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
return netdev_get_tx_queue(q->lif->netdev, q->index);
}
-static struct sk_buff *ionic_rx_skb_alloc(struct ionic_queue *q,
- unsigned int len, bool frags)
+static void ionic_rx_buf_reset(struct ionic_buf_info *buf_info)
+{
+ buf_info->page = NULL;
+ buf_info->page_offset = 0;
+ buf_info->dma_addr = 0;
+}
+
+static int ionic_rx_page_alloc(struct ionic_queue *q,
+ struct ionic_buf_info *buf_info)
{
- struct ionic_lif *lif = q->lif;
+ struct net_device *netdev = q->lif->netdev;
struct ionic_rx_stats *stats;
- struct net_device *netdev;
- struct sk_buff *skb;
+ struct device *dev;
- netdev = lif->netdev;
- stats = &q->lif->rxqstats[q->index];
+ dev = q->dev;
+ stats = q_to_rx_stats(q);
- if (frags)
- skb = napi_get_frags(&q_to_qcq(q)->napi);
- else
- skb = netdev_alloc_skb_ip_align(netdev, len);
+ if (unlikely(!buf_info)) {
+ net_err_ratelimited("%s: %s invalid buf_info in alloc\n",
+ netdev->name, q->name);
+ return -EINVAL;
+ }
- if (unlikely(!skb)) {
- net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
- netdev->name, q->name);
+ buf_info->page = alloc_pages(IONIC_PAGE_GFP_MASK, 0);
+ if (unlikely(!buf_info->page)) {
+ net_err_ratelimited("%s: %s page alloc failed\n",
+ netdev->name, q->name);
stats->alloc_err++;
- return NULL;
+ return -ENOMEM;
}
+ buf_info->page_offset = 0;
- return skb;
+ buf_info->dma_addr = dma_map_page(dev, buf_info->page, buf_info->page_offset,
+ IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
+ if (unlikely(dma_mapping_error(dev, buf_info->dma_addr))) {
+ __free_pages(buf_info->page, 0);
+ ionic_rx_buf_reset(buf_info);
+ net_err_ratelimited("%s: %s dma map failed\n",
+ netdev->name, q->name);
+ stats->dma_map_err++;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void ionic_rx_page_free(struct ionic_queue *q,
+ struct ionic_buf_info *buf_info)
+{
+ struct net_device *netdev = q->lif->netdev;
+ struct device *dev = q->dev;
+
+ if (unlikely(!buf_info)) {
+ net_err_ratelimited("%s: %s invalid buf_info in free\n",
+ netdev->name, q->name);
+ return;
+ }
+
+ if (!buf_info->page)
+ return;
+
+ dma_unmap_page(dev, buf_info->dma_addr, IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
+ __free_pages(buf_info->page, 0);
+ ionic_rx_buf_reset(buf_info);
+}
+
+static bool ionic_rx_buf_recycle(struct ionic_queue *q,
+ struct ionic_buf_info *buf_info, u32 used)
+{
+ u32 size;
+
+ /* don't re-use pages allocated in low-mem condition */
+ if (page_is_pfmemalloc(buf_info->page))
+ return false;
+
+ /* don't re-use buffers from non-local numa nodes */
+ if (page_to_nid(buf_info->page) != numa_mem_id())
+ return false;
+
+ size = ALIGN(used, IONIC_PAGE_SPLIT_SZ);
+ buf_info->page_offset += size;
+ if (buf_info->page_offset >= IONIC_PAGE_SIZE)
+ return false;
+
+ get_page(buf_info->page);
+
+ return true;
}
static struct sk_buff *ionic_rx_frags(struct ionic_queue *q,
struct ionic_desc_info *desc_info,
- struct ionic_cq_info *cq_info)
+ struct ionic_rxq_comp *comp)
{
- struct ionic_rxq_comp *comp = cq_info->cq_desc;
- struct device *dev = q->lif->ionic->dev;
- struct ionic_page_info *page_info;
+ struct net_device *netdev = q->lif->netdev;
+ struct ionic_buf_info *buf_info;
+ struct ionic_rx_stats *stats;
+ struct device *dev = q->dev;
struct sk_buff *skb;
unsigned int i;
u16 frag_len;
u16 len;
- page_info = &desc_info->pages[0];
+ stats = q_to_rx_stats(q);
+
+ buf_info = &desc_info->bufs[0];
len = le16_to_cpu(comp->len);
- prefetch(page_address(page_info->page) + NET_IP_ALIGN);
+ prefetch(buf_info->page);
- skb = ionic_rx_skb_alloc(q, len, true);
- if (unlikely(!skb))
+ skb = napi_get_frags(&q_to_qcq(q)->napi);
+ if (unlikely(!skb)) {
+ net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
+ netdev->name, q->name);
+ stats->alloc_err++;
return NULL;
+ }
i = comp->num_sg_elems + 1;
do {
- if (unlikely(!page_info->page)) {
- struct napi_struct *napi = &q_to_qcq(q)->napi;
-
- napi->skb = NULL;
+ if (unlikely(!buf_info->page)) {
dev_kfree_skb(skb);
return NULL;
}
- frag_len = min(len, (u16)PAGE_SIZE);
+ frag_len = min_t(u16, len, IONIC_PAGE_SIZE - buf_info->page_offset);
len -= frag_len;
- dma_unmap_page(dev, dma_unmap_addr(page_info, dma_addr),
- PAGE_SIZE, DMA_FROM_DEVICE);
+ dma_sync_single_for_cpu(dev,
+ buf_info->dma_addr + buf_info->page_offset,
+ frag_len, DMA_FROM_DEVICE);
+
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
- page_info->page, 0, frag_len, PAGE_SIZE);
- page_info->page = NULL;
- page_info++;
+ buf_info->page, buf_info->page_offset, frag_len,
+ IONIC_PAGE_SIZE);
+
+ if (!ionic_rx_buf_recycle(q, buf_info, frag_len)) {
+ dma_unmap_page(dev, buf_info->dma_addr,
+ IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
+ ionic_rx_buf_reset(buf_info);
+ }
+
+ buf_info++;
+
i--;
} while (i > 0);
static struct sk_buff *ionic_rx_copybreak(struct ionic_queue *q,
struct ionic_desc_info *desc_info,
- struct ionic_cq_info *cq_info)
+ struct ionic_rxq_comp *comp)
{
- struct ionic_rxq_comp *comp = cq_info->cq_desc;
- struct device *dev = q->lif->ionic->dev;
- struct ionic_page_info *page_info;
+ struct net_device *netdev = q->lif->netdev;
+ struct ionic_buf_info *buf_info;
+ struct ionic_rx_stats *stats;
+ struct device *dev = q->dev;
struct sk_buff *skb;
u16 len;
- page_info = &desc_info->pages[0];
+ stats = q_to_rx_stats(q);
+
+ buf_info = &desc_info->bufs[0];
len = le16_to_cpu(comp->len);
- skb = ionic_rx_skb_alloc(q, len, false);
- if (unlikely(!skb))
+ skb = napi_alloc_skb(&q_to_qcq(q)->napi, len);
+ if (unlikely(!skb)) {
+ net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
+ netdev->name, q->name);
+ stats->alloc_err++;
return NULL;
+ }
- if (unlikely(!page_info->page)) {
+ if (unlikely(!buf_info->page)) {
dev_kfree_skb(skb);
return NULL;
}
- dma_sync_single_for_cpu(dev, dma_unmap_addr(page_info, dma_addr),
+ dma_sync_single_for_cpu(dev, buf_info->dma_addr + buf_info->page_offset,
len, DMA_FROM_DEVICE);
- skb_copy_to_linear_data(skb, page_address(page_info->page), len);
- dma_sync_single_for_device(dev, dma_unmap_addr(page_info, dma_addr),
+ skb_copy_to_linear_data(skb, page_address(buf_info->page) + buf_info->page_offset, len);
+ dma_sync_single_for_device(dev, buf_info->dma_addr + buf_info->page_offset,
len, DMA_FROM_DEVICE);
skb_put(skb, len);
struct ionic_cq_info *cq_info,
void *cb_arg)
{
- struct ionic_rxq_comp *comp = cq_info->cq_desc;
+ struct ionic_rxq_comp *comp = cq_info->rxcq;
+ struct net_device *netdev = q->lif->netdev;
struct ionic_qcq *qcq = q_to_qcq(q);
struct ionic_rx_stats *stats;
- struct net_device *netdev;
struct sk_buff *skb;
stats = q_to_rx_stats(q);
- netdev = q->lif->netdev;
if (comp->status) {
stats->dropped++;
stats->bytes += le16_to_cpu(comp->len);
if (le16_to_cpu(comp->len) <= q->lif->rx_copybreak)
- skb = ionic_rx_copybreak(q, desc_info, cq_info);
+ skb = ionic_rx_copybreak(q, desc_info, comp);
else
- skb = ionic_rx_frags(q, desc_info, cq_info);
+ skb = ionic_rx_frags(q, desc_info, comp);
if (unlikely(!skb)) {
stats->dropped++;
static bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
{
- struct ionic_rxq_comp *comp = cq_info->cq_desc;
+ struct ionic_rxq_comp *comp = cq_info->rxcq;
struct ionic_queue *q = cq->bound_q;
struct ionic_desc_info *desc_info;
return true;
}
-static int ionic_rx_page_alloc(struct ionic_queue *q,
- struct ionic_page_info *page_info)
-{
- struct ionic_lif *lif = q->lif;
- struct ionic_rx_stats *stats;
- struct net_device *netdev;
- struct device *dev;
-
- netdev = lif->netdev;
- dev = lif->ionic->dev;
- stats = q_to_rx_stats(q);
-
- if (unlikely(!page_info)) {
- net_err_ratelimited("%s: %s invalid page_info in alloc\n",
- netdev->name, q->name);
- return -EINVAL;
- }
-
- page_info->page = dev_alloc_page();
- if (unlikely(!page_info->page)) {
- net_err_ratelimited("%s: %s page alloc failed\n",
- netdev->name, q->name);
- stats->alloc_err++;
- return -ENOMEM;
- }
-
- page_info->dma_addr = dma_map_page(dev, page_info->page, 0, PAGE_SIZE,
- DMA_FROM_DEVICE);
- if (unlikely(dma_mapping_error(dev, page_info->dma_addr))) {
- put_page(page_info->page);
- page_info->dma_addr = 0;
- page_info->page = NULL;
- net_err_ratelimited("%s: %s dma map failed\n",
- netdev->name, q->name);
- stats->dma_map_err++;
- return -EIO;
- }
-
- return 0;
-}
-
-static void ionic_rx_page_free(struct ionic_queue *q,
- struct ionic_page_info *page_info)
-{
- struct ionic_lif *lif = q->lif;
- struct net_device *netdev;
- struct device *dev;
-
- netdev = lif->netdev;
- dev = lif->ionic->dev;
-
- if (unlikely(!page_info)) {
- net_err_ratelimited("%s: %s invalid page_info in free\n",
- netdev->name, q->name);
- return;
- }
-
- if (unlikely(!page_info->page)) {
- net_err_ratelimited("%s: %s invalid page in free\n",
- netdev->name, q->name);
- return;
- }
-
- dma_unmap_page(dev, page_info->dma_addr, PAGE_SIZE, DMA_FROM_DEVICE);
-
- put_page(page_info->page);
- page_info->dma_addr = 0;
- page_info->page = NULL;
-}
-
void ionic_rx_fill(struct ionic_queue *q)
{
struct net_device *netdev = q->lif->netdev;
struct ionic_desc_info *desc_info;
- struct ionic_page_info *page_info;
struct ionic_rxq_sg_desc *sg_desc;
struct ionic_rxq_sg_elem *sg_elem;
+ struct ionic_buf_info *buf_info;
struct ionic_rxq_desc *desc;
unsigned int remain_len;
- unsigned int seg_len;
+ unsigned int frag_len;
unsigned int nfrags;
unsigned int i, j;
unsigned int len;
len = netdev->mtu + ETH_HLEN + VLAN_HLEN;
- nfrags = round_up(len, PAGE_SIZE) / PAGE_SIZE;
for (i = ionic_q_space_avail(q); i; i--) {
+ nfrags = 0;
remain_len = len;
desc_info = &q->info[q->head_idx];
desc = desc_info->desc;
- sg_desc = desc_info->sg_desc;
- page_info = &desc_info->pages[0];
+ buf_info = &desc_info->bufs[0];
- if (page_info->page) { /* recycle the buffer */
- ionic_rxq_post(q, false, ionic_rx_clean, NULL);
- continue;
- }
-
- /* fill main descriptor - pages[0] */
- desc->opcode = (nfrags > 1) ? IONIC_RXQ_DESC_OPCODE_SG :
- IONIC_RXQ_DESC_OPCODE_SIMPLE;
- desc_info->npages = nfrags;
- if (unlikely(ionic_rx_page_alloc(q, page_info))) {
- desc->addr = 0;
- desc->len = 0;
- return;
+ if (!buf_info->page) { /* alloc a new buffer? */
+ if (unlikely(ionic_rx_page_alloc(q, buf_info))) {
+ desc->addr = 0;
+ desc->len = 0;
+ return;
+ }
}
- desc->addr = cpu_to_le64(page_info->dma_addr);
- seg_len = min_t(unsigned int, PAGE_SIZE, len);
- desc->len = cpu_to_le16(seg_len);
- remain_len -= seg_len;
- page_info++;
- /* fill sg descriptors - pages[1..n] */
- for (j = 0; j < nfrags - 1; j++) {
- if (page_info->page) /* recycle the sg buffer */
- continue;
+ /* fill main descriptor - buf[0] */
+ desc->addr = cpu_to_le64(buf_info->dma_addr + buf_info->page_offset);
+ frag_len = min_t(u16, len, IONIC_PAGE_SIZE - buf_info->page_offset);
+ desc->len = cpu_to_le16(frag_len);
+ remain_len -= frag_len;
+ buf_info++;
+ nfrags++;
+ /* fill sg descriptors - buf[1..n] */
+ sg_desc = desc_info->sg_desc;
+ for (j = 0; remain_len > 0 && j < q->max_sg_elems; j++) {
sg_elem = &sg_desc->elems[j];
- if (unlikely(ionic_rx_page_alloc(q, page_info))) {
- sg_elem->addr = 0;
- sg_elem->len = 0;
- return;
+ if (!buf_info->page) { /* alloc a new sg buffer? */
+ if (unlikely(ionic_rx_page_alloc(q, buf_info))) {
+ sg_elem->addr = 0;
+ sg_elem->len = 0;
+ return;
+ }
}
- sg_elem->addr = cpu_to_le64(page_info->dma_addr);
- seg_len = min_t(unsigned int, PAGE_SIZE, remain_len);
- sg_elem->len = cpu_to_le16(seg_len);
- remain_len -= seg_len;
- page_info++;
+
+ sg_elem->addr = cpu_to_le64(buf_info->dma_addr + buf_info->page_offset);
+ frag_len = min_t(u16, remain_len, IONIC_PAGE_SIZE - buf_info->page_offset);
+ sg_elem->len = cpu_to_le16(frag_len);
+ remain_len -= frag_len;
+ buf_info++;
+ nfrags++;
+ }
+
+ /* clear end sg element as a sentinel */
+ if (j < q->max_sg_elems) {
+ sg_elem = &sg_desc->elems[j];
+ memset(sg_elem, 0, sizeof(*sg_elem));
}
+ desc->opcode = (nfrags > 1) ? IONIC_RXQ_DESC_OPCODE_SG :
+ IONIC_RXQ_DESC_OPCODE_SIMPLE;
+ desc_info->nbufs = nfrags;
+
ionic_rxq_post(q, false, ionic_rx_clean, NULL);
}
void ionic_rx_empty(struct ionic_queue *q)
{
struct ionic_desc_info *desc_info;
- struct ionic_page_info *page_info;
+ struct ionic_buf_info *buf_info;
unsigned int i, j;
for (i = 0; i < q->num_descs; i++) {
desc_info = &q->info[i];
for (j = 0; j < IONIC_RX_MAX_SG_ELEMS + 1; j++) {
- page_info = &desc_info->pages[j];
- if (page_info->page)
- ionic_rx_page_free(q, page_info);
+ buf_info = &desc_info->bufs[j];
+ if (buf_info->page)
+ ionic_rx_page_free(q, buf_info);
}
- desc_info->npages = 0;
+ desc_info->nbufs = 0;
desc_info->cb = NULL;
desc_info->cb_arg = NULL;
}
+
+ q->head_idx = 0;
+ q->tail_idx = 0;
}
static void ionic_dim_update(struct ionic_qcq *qcq)
idev = &lif->ionic->idev;
txcq = &lif->txqcqs[qi]->cq;
- tx_work_done = ionic_cq_service(txcq, lif->tx_budget,
+ tx_work_done = ionic_cq_service(txcq, IONIC_TX_BUDGET_DEFAULT,
ionic_tx_service, NULL, NULL);
rx_work_done = ionic_cq_service(rxcq, budget,
void *data, size_t len)
{
struct ionic_tx_stats *stats = q_to_tx_stats(q);
- struct device *dev = q->lif->ionic->dev;
+ struct device *dev = q->dev;
dma_addr_t dma_addr;
dma_addr = dma_map_single(dev, data, len, DMA_TO_DEVICE);
size_t offset, size_t len)
{
struct ionic_tx_stats *stats = q_to_tx_stats(q);
- struct device *dev = q->lif->ionic->dev;
+ struct device *dev = q->dev;
dma_addr_t dma_addr;
dma_addr = skb_frag_dma_map(dev, frag, offset, len, DMA_TO_DEVICE);
return dma_addr;
}
+static int ionic_tx_map_skb(struct ionic_queue *q, struct sk_buff *skb,
+ struct ionic_desc_info *desc_info)
+{
+ struct ionic_buf_info *buf_info = desc_info->bufs;
+ struct device *dev = q->dev;
+ dma_addr_t dma_addr;
+ unsigned int nfrags;
+ skb_frag_t *frag;
+ int frag_idx;
+
+ dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
+ if (dma_mapping_error(dev, dma_addr))
+ return -EIO;
+ buf_info->dma_addr = dma_addr;
+ buf_info->len = skb_headlen(skb);
+ buf_info++;
+
+ frag = skb_shinfo(skb)->frags;
+ nfrags = skb_shinfo(skb)->nr_frags;
+ for (frag_idx = 0; frag_idx < nfrags; frag_idx++, frag++) {
+ dma_addr = ionic_tx_map_frag(q, frag, 0, skb_frag_size(frag));
+ if (dma_mapping_error(dev, dma_addr))
+ goto dma_fail;
+ buf_info->dma_addr = dma_addr;
+ buf_info->len = skb_frag_size(frag);
+ buf_info++;
+ }
+
+ desc_info->nbufs = 1 + nfrags;
+
+ return 0;
+
+dma_fail:
+ /* unwind the frag mappings and the head mapping */
+ while (frag_idx > 0) {
+ frag_idx--;
+ buf_info--;
+ dma_unmap_page(dev, buf_info->dma_addr,
+ buf_info->len, DMA_TO_DEVICE);
+ }
+ dma_unmap_single(dev, buf_info->dma_addr, buf_info->len, DMA_TO_DEVICE);
+ return -EIO;
+}
+
static void ionic_tx_clean(struct ionic_queue *q,
struct ionic_desc_info *desc_info,
struct ionic_cq_info *cq_info,
void *cb_arg)
{
- struct ionic_txq_sg_desc *sg_desc = desc_info->sg_desc;
- struct ionic_txq_sg_elem *elem = sg_desc->elems;
+ struct ionic_buf_info *buf_info = desc_info->bufs;
struct ionic_tx_stats *stats = q_to_tx_stats(q);
- struct ionic_txq_desc *desc = desc_info->desc;
- struct device *dev = q->lif->ionic->dev;
- u8 opcode, flags, nsge;
+ struct device *dev = q->dev;
u16 queue_index;
unsigned int i;
- u64 addr;
-
- decode_txq_desc_cmd(le64_to_cpu(desc->cmd),
- &opcode, &flags, &nsge, &addr);
-
- /* use unmap_single only if either this is not TSO,
- * or this is first descriptor of a TSO
- */
- if (opcode != IONIC_TXQ_DESC_OPCODE_TSO ||
- flags & IONIC_TXQ_DESC_FLAG_TSO_SOT)
- dma_unmap_single(dev, (dma_addr_t)addr,
- le16_to_cpu(desc->len), DMA_TO_DEVICE);
- else
- dma_unmap_page(dev, (dma_addr_t)addr,
- le16_to_cpu(desc->len), DMA_TO_DEVICE);
- for (i = 0; i < nsge; i++, elem++)
- dma_unmap_page(dev, (dma_addr_t)le64_to_cpu(elem->addr),
- le16_to_cpu(elem->len), DMA_TO_DEVICE);
+ if (desc_info->nbufs) {
+ dma_unmap_single(dev, (dma_addr_t)buf_info->dma_addr,
+ buf_info->len, DMA_TO_DEVICE);
+ buf_info++;
+ for (i = 1; i < desc_info->nbufs; i++, buf_info++)
+ dma_unmap_page(dev, (dma_addr_t)buf_info->dma_addr,
+ buf_info->len, DMA_TO_DEVICE);
+ }
if (cb_arg) {
struct sk_buff *skb = cb_arg;
- u32 len = skb->len;
queue_index = skb_get_queue_mapping(skb);
if (unlikely(__netif_subqueue_stopped(q->lif->netdev,
netif_wake_subqueue(q->lif->netdev, queue_index);
q->wake++;
}
- dev_kfree_skb_any(skb);
+
+ desc_info->bytes = skb->len;
stats->clean++;
- netdev_tx_completed_queue(q_to_ndq(q), 1, len);
+
+ dev_consume_skb_any(skb);
}
}
static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
{
- struct ionic_txq_comp *comp = cq_info->cq_desc;
+ struct ionic_txq_comp *comp = cq_info->txcq;
struct ionic_queue *q = cq->bound_q;
struct ionic_desc_info *desc_info;
+ int bytes = 0;
+ int pkts = 0;
u16 index;
if (!color_match(comp->color, cq->done_color))
*/
do {
desc_info = &q->info[q->tail_idx];
+ desc_info->bytes = 0;
index = q->tail_idx;
q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1);
ionic_tx_clean(q, desc_info, cq_info, desc_info->cb_arg);
+ if (desc_info->cb_arg) {
+ pkts++;
+ bytes += desc_info->bytes;
+ }
desc_info->cb = NULL;
desc_info->cb_arg = NULL;
} while (index != le16_to_cpu(comp->comp_index));
+ if (pkts && bytes)
+ netdev_tx_completed_queue(q_to_ndq(q), pkts, bytes);
+
return true;
}
void ionic_tx_empty(struct ionic_queue *q)
{
struct ionic_desc_info *desc_info;
+ int bytes = 0;
+ int pkts = 0;
/* walk the not completed tx entries, if any */
while (q->head_idx != q->tail_idx) {
desc_info = &q->info[q->tail_idx];
+ desc_info->bytes = 0;
q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1);
ionic_tx_clean(q, desc_info, NULL, desc_info->cb_arg);
+ if (desc_info->cb_arg) {
+ pkts++;
+ bytes += desc_info->bytes;
+ }
desc_info->cb = NULL;
desc_info->cb_arg = NULL;
}
+
+ if (pkts && bytes)
+ netdev_tx_completed_queue(q_to_ndq(q), pkts, bytes);
}
static int ionic_tx_tcp_inner_pseudo_csum(struct sk_buff *skb)
desc->hdr_len = cpu_to_le16(hdrlen);
desc->mss = cpu_to_le16(mss);
- if (done) {
+ if (start) {
skb_tx_timestamp(skb);
netdev_tx_sent_queue(q_to_ndq(q), skb->len);
- ionic_txq_post(q, !netdev_xmit_more(), ionic_tx_clean, skb);
+ ionic_txq_post(q, false, ionic_tx_clean, skb);
} else {
- ionic_txq_post(q, false, ionic_tx_clean, NULL);
+ ionic_txq_post(q, done, NULL, NULL);
}
}
-static struct ionic_txq_desc *ionic_tx_tso_next(struct ionic_queue *q,
- struct ionic_txq_sg_elem **elem)
-{
- struct ionic_txq_sg_desc *sg_desc = q->info[q->head_idx].txq_sg_desc;
- struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
-
- *elem = sg_desc->elems;
- return desc;
-}
-
static int ionic_tx_tso(struct ionic_queue *q, struct sk_buff *skb)
{
struct ionic_tx_stats *stats = q_to_tx_stats(q);
- struct ionic_desc_info *rewind_desc_info;
- struct device *dev = q->lif->ionic->dev;
+ struct ionic_desc_info *desc_info;
+ struct ionic_buf_info *buf_info;
struct ionic_txq_sg_elem *elem;
struct ionic_txq_desc *desc;
- unsigned int frag_left = 0;
- unsigned int offset = 0;
- u16 abort = q->head_idx;
- unsigned int len_left;
+ unsigned int chunk_len;
+ unsigned int frag_rem;
+ unsigned int tso_rem;
+ unsigned int seg_rem;
dma_addr_t desc_addr;
+ dma_addr_t frag_addr;
unsigned int hdrlen;
- unsigned int nfrags;
- unsigned int seglen;
- u64 total_bytes = 0;
- u64 total_pkts = 0;
- u16 rewind = abort;
- unsigned int left;
unsigned int len;
unsigned int mss;
- skb_frag_t *frag;
bool start, done;
bool outer_csum;
- dma_addr_t addr;
bool has_vlan;
u16 desc_len;
u8 desc_nsge;
bool encap;
int err;
+ desc_info = &q->info[q->head_idx];
+ buf_info = desc_info->bufs;
+
+ if (unlikely(ionic_tx_map_skb(q, skb, desc_info)))
+ return -EIO;
+
+ len = skb->len;
mss = skb_shinfo(skb)->gso_size;
- nfrags = skb_shinfo(skb)->nr_frags;
- len_left = skb->len - skb_headlen(skb);
outer_csum = (skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM) ||
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
has_vlan = !!skb_vlan_tag_present(skb);
else
hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
- seglen = hdrlen + mss;
- left = skb_headlen(skb);
+ tso_rem = len;
+ seg_rem = min(tso_rem, hdrlen + mss);
- desc = ionic_tx_tso_next(q, &elem);
- start = true;
+ frag_addr = 0;
+ frag_rem = 0;
- /* Chop skb->data up into desc segments */
+ start = true;
- while (left > 0) {
- len = min(seglen, left);
- frag_left = seglen - len;
- desc_addr = ionic_tx_map_single(q, skb->data + offset, len);
- if (dma_mapping_error(dev, desc_addr))
- goto err_out_abort;
- desc_len = len;
+ while (tso_rem > 0) {
+ desc = NULL;
+ elem = NULL;
+ desc_addr = 0;
+ desc_len = 0;
desc_nsge = 0;
- left -= len;
- offset += len;
- if (nfrags > 0 && frag_left > 0)
- continue;
- done = (nfrags == 0 && left == 0);
- ionic_tx_tso_post(q, desc, skb,
- desc_addr, desc_nsge, desc_len,
- hdrlen, mss,
- outer_csum,
- vlan_tci, has_vlan,
- start, done);
- total_pkts++;
- total_bytes += start ? len : len + hdrlen;
- desc = ionic_tx_tso_next(q, &elem);
- start = false;
- seglen = mss;
- }
-
- /* Chop skb frags into desc segments */
-
- for (frag = skb_shinfo(skb)->frags; len_left; frag++) {
- offset = 0;
- left = skb_frag_size(frag);
- len_left -= left;
- nfrags--;
- stats->frags++;
-
- while (left > 0) {
- if (frag_left > 0) {
- len = min(frag_left, left);
- frag_left -= len;
- addr = ionic_tx_map_frag(q, frag, offset, len);
- if (dma_mapping_error(dev, addr))
- goto err_out_abort;
- elem->addr = cpu_to_le64(addr);
- elem->len = cpu_to_le16(len);
+ /* use fragments until we have enough to post a single descriptor */
+ while (seg_rem > 0) {
+ /* if the fragment is exhausted then move to the next one */
+ if (frag_rem == 0) {
+ /* grab the next fragment */
+ frag_addr = buf_info->dma_addr;
+ frag_rem = buf_info->len;
+ buf_info++;
+ }
+ chunk_len = min(frag_rem, seg_rem);
+ if (!desc) {
+ /* fill main descriptor */
+ desc = desc_info->txq_desc;
+ elem = desc_info->txq_sg_desc->elems;
+ desc_addr = frag_addr;
+ desc_len = chunk_len;
+ } else {
+ /* fill sg descriptor */
+ elem->addr = cpu_to_le64(frag_addr);
+ elem->len = cpu_to_le16(chunk_len);
elem++;
desc_nsge++;
- left -= len;
- offset += len;
- if (nfrags > 0 && frag_left > 0)
- continue;
- done = (nfrags == 0 && left == 0);
- ionic_tx_tso_post(q, desc, skb, desc_addr,
- desc_nsge, desc_len,
- hdrlen, mss, outer_csum,
- vlan_tci, has_vlan,
- start, done);
- total_pkts++;
- total_bytes += start ? len : len + hdrlen;
- desc = ionic_tx_tso_next(q, &elem);
- start = false;
- } else {
- len = min(mss, left);
- frag_left = mss - len;
- desc_addr = ionic_tx_map_frag(q, frag,
- offset, len);
- if (dma_mapping_error(dev, desc_addr))
- goto err_out_abort;
- desc_len = len;
- desc_nsge = 0;
- left -= len;
- offset += len;
- if (nfrags > 0 && frag_left > 0)
- continue;
- done = (nfrags == 0 && left == 0);
- ionic_tx_tso_post(q, desc, skb, desc_addr,
- desc_nsge, desc_len,
- hdrlen, mss, outer_csum,
- vlan_tci, has_vlan,
- start, done);
- total_pkts++;
- total_bytes += start ? len : len + hdrlen;
- desc = ionic_tx_tso_next(q, &elem);
- start = false;
}
+ frag_addr += chunk_len;
+ frag_rem -= chunk_len;
+ tso_rem -= chunk_len;
+ seg_rem -= chunk_len;
}
+ seg_rem = min(tso_rem, mss);
+ done = (tso_rem == 0);
+ /* post descriptor */
+ ionic_tx_tso_post(q, desc, skb,
+ desc_addr, desc_nsge, desc_len,
+ hdrlen, mss, outer_csum, vlan_tci, has_vlan,
+ start, done);
+ start = false;
+ /* Buffer information is stored with the first tso descriptor */
+ desc_info = &q->info[q->head_idx];
+ desc_info->nbufs = 0;
}
- stats->pkts += total_pkts;
- stats->bytes += total_bytes;
+ stats->pkts += DIV_ROUND_UP(len - hdrlen, mss);
+ stats->bytes += len;
stats->tso++;
- stats->tso_bytes += total_bytes;
+ stats->tso_bytes = len;
return 0;
-
-err_out_abort:
- while (rewind != q->head_idx) {
- rewind_desc_info = &q->info[rewind];
- ionic_tx_clean(q, rewind_desc_info, NULL, NULL);
- rewind = (rewind + 1) & (q->num_descs - 1);
- }
- q->head_idx = abort;
-
- return -ENOMEM;
}
-static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb)
+static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb,
+ struct ionic_desc_info *desc_info)
{
- struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
+ struct ionic_txq_desc *desc = desc_info->txq_desc;
+ struct ionic_buf_info *buf_info = desc_info->bufs;
struct ionic_tx_stats *stats = q_to_tx_stats(q);
- struct device *dev = q->lif->ionic->dev;
- dma_addr_t dma_addr;
bool has_vlan;
u8 flags = 0;
bool encap;
has_vlan = !!skb_vlan_tag_present(skb);
encap = skb->encapsulation;
- dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
- if (dma_mapping_error(dev, dma_addr))
- return -ENOMEM;
-
flags |= has_vlan ? IONIC_TXQ_DESC_FLAG_VLAN : 0;
flags |= encap ? IONIC_TXQ_DESC_FLAG_ENCAP : 0;
cmd = encode_txq_desc_cmd(IONIC_TXQ_DESC_OPCODE_CSUM_PARTIAL,
- flags, skb_shinfo(skb)->nr_frags, dma_addr);
+ flags, skb_shinfo(skb)->nr_frags,
+ buf_info->dma_addr);
desc->cmd = cpu_to_le64(cmd);
- desc->len = cpu_to_le16(skb_headlen(skb));
- desc->csum_start = cpu_to_le16(skb_checksum_start_offset(skb));
- desc->csum_offset = cpu_to_le16(skb->csum_offset);
+ desc->len = cpu_to_le16(buf_info->len);
if (has_vlan) {
desc->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb));
stats->vlan_inserted++;
+ } else {
+ desc->vlan_tci = 0;
}
+ desc->csum_start = cpu_to_le16(skb_checksum_start_offset(skb));
+ desc->csum_offset = cpu_to_le16(skb->csum_offset);
if (skb_csum_is_sctp(skb))
stats->crc32_csum++;
return 0;
}
-static int ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb)
+static int ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb,
+ struct ionic_desc_info *desc_info)
{
- struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
+ struct ionic_txq_desc *desc = desc_info->txq_desc;
+ struct ionic_buf_info *buf_info = desc_info->bufs;
struct ionic_tx_stats *stats = q_to_tx_stats(q);
- struct device *dev = q->lif->ionic->dev;
- dma_addr_t dma_addr;
bool has_vlan;
u8 flags = 0;
bool encap;
has_vlan = !!skb_vlan_tag_present(skb);
encap = skb->encapsulation;
- dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
- if (dma_mapping_error(dev, dma_addr))
- return -ENOMEM;
-
flags |= has_vlan ? IONIC_TXQ_DESC_FLAG_VLAN : 0;
flags |= encap ? IONIC_TXQ_DESC_FLAG_ENCAP : 0;
cmd = encode_txq_desc_cmd(IONIC_TXQ_DESC_OPCODE_CSUM_NONE,
- flags, skb_shinfo(skb)->nr_frags, dma_addr);
+ flags, skb_shinfo(skb)->nr_frags,
+ buf_info->dma_addr);
desc->cmd = cpu_to_le64(cmd);
- desc->len = cpu_to_le16(skb_headlen(skb));
+ desc->len = cpu_to_le16(buf_info->len);
if (has_vlan) {
desc->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb));
stats->vlan_inserted++;
+ } else {
+ desc->vlan_tci = 0;
}
+ desc->csum_start = 0;
+ desc->csum_offset = 0;
stats->csum_none++;
return 0;
}
-static int ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb)
+static int ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb,
+ struct ionic_desc_info *desc_info)
{
- struct ionic_txq_sg_desc *sg_desc = q->info[q->head_idx].txq_sg_desc;
- unsigned int len_left = skb->len - skb_headlen(skb);
+ struct ionic_txq_sg_desc *sg_desc = desc_info->txq_sg_desc;
+ struct ionic_buf_info *buf_info = &desc_info->bufs[1];
struct ionic_txq_sg_elem *elem = sg_desc->elems;
struct ionic_tx_stats *stats = q_to_tx_stats(q);
- struct device *dev = q->lif->ionic->dev;
- dma_addr_t dma_addr;
- skb_frag_t *frag;
- u16 len;
+ unsigned int i;
- for (frag = skb_shinfo(skb)->frags; len_left; frag++, elem++) {
- len = skb_frag_size(frag);
- elem->len = cpu_to_le16(len);
- dma_addr = ionic_tx_map_frag(q, frag, 0, len);
- if (dma_mapping_error(dev, dma_addr))
- return -ENOMEM;
- elem->addr = cpu_to_le64(dma_addr);
- len_left -= len;
- stats->frags++;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, buf_info++, elem++) {
+ elem->addr = cpu_to_le64(buf_info->dma_addr);
+ elem->len = cpu_to_le16(buf_info->len);
}
+ stats->frags += skb_shinfo(skb)->nr_frags;
+
return 0;
}
static int ionic_tx(struct ionic_queue *q, struct sk_buff *skb)
{
+ struct ionic_desc_info *desc_info = &q->info[q->head_idx];
struct ionic_tx_stats *stats = q_to_tx_stats(q);
int err;
+ if (unlikely(ionic_tx_map_skb(q, skb, desc_info)))
+ return -EIO;
+
/* set up the initial descriptor */
if (skb->ip_summed == CHECKSUM_PARTIAL)
- err = ionic_tx_calc_csum(q, skb);
+ err = ionic_tx_calc_csum(q, skb, desc_info);
else
- err = ionic_tx_calc_no_csum(q, skb);
+ err = ionic_tx_calc_no_csum(q, skb, desc_info);
if (err)
return err;
/* add frags */
- err = ionic_tx_skb_frags(q, skb);
+ err = ionic_tx_skb_frags(q, skb, desc_info);
if (err)
return err;
static int ionic_tx_descs_needed(struct ionic_queue *q, struct sk_buff *skb)
{
- int sg_elems = q->lif->qtype_info[IONIC_QTYPE_TXQ].max_sg_elems;
struct ionic_tx_stats *stats = q_to_tx_stats(q);
+ int ndescs;
int err;
- /* If TSO, need roundup(skb->len/mss) descs */
+ /* Each desc is mss long max, so a descriptor for each gso_seg */
if (skb_is_gso(skb))
- return (skb->len / skb_shinfo(skb)->gso_size) + 1;
+ ndescs = skb_shinfo(skb)->gso_segs;
+ else
+ ndescs = 1;
- if (skb_shinfo(skb)->nr_frags <= sg_elems)
+ /* If non-TSO, just need 1 desc and nr_frags sg elems */
+ if (skb_shinfo(skb)->nr_frags <= q->max_sg_elems)
- return 1;
+ return ndescs;
/* Too many frags, so linearize */
err = skb_linearize(skb);
stats->linearize++;
- /* Need 1 desc and zero sg elems */
- return 1;
+ return ndescs;
}
static int ionic_maybe_stop_tx(struct ionic_queue *q, int ndescs)
static void rtl8169_do_counters(struct rtl8169_private *tp, u32 counter_cmd)
{
- dma_addr_t paddr = tp->counters_phys_addr;
- u32 cmd;
+ u32 cmd = lower_32_bits(tp->counters_phys_addr);
- RTL_W32(tp, CounterAddrHigh, (u64)paddr >> 32);
+ RTL_W32(tp, CounterAddrHigh, upper_32_bits(tp->counters_phys_addr));
rtl_pci_commit(tp);
- cmd = (u64)paddr & DMA_BIT_MASK(32);
RTL_W32(tp, CounterAddrLow, cmd);
RTL_W32(tp, CounterAddrLow, cmd | counter_cmd);
return ret;
}
+static void rtl8169_get_ringparam(struct net_device *dev,
+ struct ethtool_ringparam *data)
+{
+ data->rx_max_pending = NUM_RX_DESC;
+ data->rx_pending = NUM_RX_DESC;
+ data->tx_max_pending = NUM_TX_DESC;
+ data->tx_pending = NUM_TX_DESC;
+}
+
static const struct ethtool_ops rtl8169_ethtool_ops = {
.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
ETHTOOL_COALESCE_MAX_FRAMES,
.set_eee = rtl8169_set_eee,
.get_link_ksettings = phy_ethtool_get_link_ksettings,
.set_link_ksettings = phy_ethtool_set_link_ksettings,
+ .get_ringparam = rtl8169_get_ringparam,
};
static void rtl_enable_eee(struct rtl8169_private *tp)
rtl8169_update_counters(tp);
+ pci_clear_master(tp->pci_dev);
+ rtl_pci_commit(tp);
+
rtl8169_cleanup(tp, true);
rtl_prepare_power_down(tp);
static void rtl8169_up(struct rtl8169_private *tp)
{
+ pci_set_master(tp->pci_dev);
phy_resume(tp->phydev);
rtl8169_init_phy(tp);
napi_enable(&tp->napi);
rtl_hw_reset(tp);
- pci_set_master(pdev);
-
rc = rtl_alloc_irq(tp);
if (rc < 0) {
dev_err(&pdev->dev, "Can't allocate interrupt\n");
/* IPA_CMD_REGISTER_WRITE */
-/* For IPA v4.0+, this opcode gets modified with pipeline clear options */
-
+/* For IPA v4.0+, the pipeline clear options are encoded in the opcode */
#define REGISTER_WRITE_OPCODE_SKIP_CLEAR_FMASK GENMASK(8, 8)
#define REGISTER_WRITE_OPCODE_CLEAR_OPTION_FMASK GENMASK(10, 9)
struct ipa_cmd_register_write {
- __le16 flags; /* Unused/reserved for IPA v3.5.1 */
+ __le16 flags; /* Unused/reserved prior to IPA v4.0 */
__le16 offset;
__le32 value;
__le32 value_mask;
};
/* Field masks for ipa_cmd_register_write structure fields */
-/* The next field is present for IPA v4.0 and above */
+/* The next field is present for IPA v4.0+ */
#define REGISTER_WRITE_FLAGS_OFFSET_HIGH_FMASK GENMASK(14, 11)
-/* The next field is present for IPA v3.5.1 only */
+/* The next field is not present for IPA v4.0+ */
#define REGISTER_WRITE_FLAGS_SKIP_CLEAR_FMASK GENMASK(15, 15)
-/* The next field and its values are present for IPA v3.5.1 only */
+/* The next field and its values are not present for IPA v4.0+ */
#define REGISTER_WRITE_CLEAR_OPTIONS_FMASK GENMASK(1, 0)
/* IPA_CMD_IP_PACKET_INIT */
/* Field masks for ipa_cmd_hw_dma_mem_mem structure fields */
#define DMA_SHARED_MEM_FLAGS_DIRECTION_FMASK GENMASK(0, 0)
-/* The next two fields are present for IPA v3.5.1 only. */
+/* The next two fields are not present for IPA v4.0+ */
#define DMA_SHARED_MEM_FLAGS_SKIP_CLEAR_FMASK GENMASK(1, 1)
#define DMA_SHARED_MEM_FLAGS_CLEAR_OPTIONS_FMASK GENMASK(3, 2)
: field_max(IP_FLTRT_FLAGS_NHASH_ADDR_FMASK);
if (mem->offset > offset_max ||
ipa->mem_offset > offset_max - mem->offset) {
- dev_err(dev, "IPv%c %s%s table region offset too large "
- "(0x%04x + 0x%04x > 0x%04x)\n",
- ipv6 ? '6' : '4', hashed ? "hashed " : "",
- route ? "route" : "filter",
- ipa->mem_offset, mem->offset, offset_max);
+ dev_err(dev, "IPv%c %s%s table region offset too large\n",
+ ipv6 ? '6' : '4', hashed ? "hashed " : "",
+ route ? "route" : "filter");
+ dev_err(dev, " (0x%04x + 0x%04x > 0x%04x)\n",
+ ipa->mem_offset, mem->offset, offset_max);
+
return false;
}
if (mem->offset > ipa->mem_size ||
mem->size > ipa->mem_size - mem->offset) {
- dev_err(dev, "IPv%c %s%s table region out of range "
- "(0x%04x + 0x%04x > 0x%04x)\n",
- ipv6 ? '6' : '4', hashed ? "hashed " : "",
- route ? "route" : "filter",
- mem->offset, mem->size, ipa->mem_size);
+ dev_err(dev, "IPv%c %s%s table region out of range\n",
+ ipv6 ? '6' : '4', hashed ? "hashed " : "",
+ route ? "route" : "filter");
+ dev_err(dev, " (0x%04x + 0x%04x > 0x%04x)\n",
+ mem->offset, mem->size, ipa->mem_size);
+
return false;
}
u32 size_max;
u32 size;
+ /* In ipa_cmd_hdr_init_local_add() we record the offset and size
+ * of the header table memory area. Make sure the offset and size
+ * fit in the fields that need to hold them, and that the entire
+ * range is within the overall IPA memory range.
+ */
offset_max = field_max(HDR_INIT_LOCAL_FLAGS_HDR_ADDR_FMASK);
if (mem->offset > offset_max ||
ipa->mem_offset > offset_max - mem->offset) {
- dev_err(dev, "header table region offset too large "
- "(0x%04x + 0x%04x > 0x%04x)\n",
- ipa->mem_offset + mem->offset, offset_max);
+ dev_err(dev, "header table region offset too large\n");
+ dev_err(dev, " (0x%04x + 0x%04x > 0x%04x)\n",
+ ipa->mem_offset, mem->offset, offset_max);
+
return false;
}
size_max = field_max(HDR_INIT_LOCAL_FLAGS_TABLE_SIZE_FMASK);
size = ipa->mem[IPA_MEM_MODEM_HEADER].size;
size += ipa->mem[IPA_MEM_AP_HEADER].size;
- if (mem->offset > ipa->mem_size || size > ipa->mem_size - mem->offset) {
- dev_err(dev, "header table region out of range "
- "(0x%04x + 0x%04x > 0x%04x)\n",
- mem->offset, size, ipa->mem_size);
+
+ if (size > size_max) {
+ dev_err(dev, "header table region size too large\n");
+ dev_err(dev, " (0x%04x > 0x%08x)\n", size, size_max);
+
+ return false;
+ }
+ if (size > ipa->mem_size || mem->offset > ipa->mem_size - size) {
+ dev_err(dev, "header table region out of range\n");
+ dev_err(dev, " (0x%04x + 0x%04x > 0x%04x)\n",
+ mem->offset, size, ipa->mem_size);
+
return false;
}
u32 bit_count;
/* The maximum offset in a register_write immediate command depends
- * on the version of IPA. IPA v3.5.1 supports a 16 bit offset, but
- * newer versions allow some additional high-order bits.
+ * on the version of IPA. A 16 bit offset is always supported,
+ * but starting with IPA v4.0 some additional high-order bits are
+ * allowed.
*/
bit_count = BITS_PER_BYTE * sizeof(payload->offset);
- if (ipa->version != IPA_VERSION_3_5_1)
+ if (ipa->version >= IPA_VERSION_4_0)
bit_count += hweight32(REGISTER_WRITE_FLAGS_OFFSET_HIGH_FMASK);
BUILD_BUG_ON(bit_count > 32);
offset_max = ~0U >> (32 - bit_count);
/* pipeline_clear_src_grp is not used */
clear_option = clear_full ? pipeline_clear_full : pipeline_clear_hps;
- if (ipa->version != IPA_VERSION_3_5_1) {
+ /* IPA v4.0+ represents the pipeline clear options in the opcode. It
+ * also supports a larger offset by encoding additional high-order
+ * bits in the payload flags field.
+ */
+ if (ipa->version >= IPA_VERSION_4_0) {
u16 offset_high;
u32 val;
.decoded_size = IPA_QMI_DRIVER_INIT_COMPLETE_REQ_SZ,
.fn = ipa_server_driver_init_complete,
},
+ { },
};
/* Handle an INIT_DRIVER response message from the modem. */
.decoded_size = IPA_QMI_INIT_DRIVER_RSP_SZ,
.fn = ipa_client_init_driver,
},
+ { },
};
/* Return a pointer to an init modem driver request structure, which contains
/* None of the stats fields are valid (IPA v4.0 and above) */
- if (ipa->version != IPA_VERSION_3_5_1) {
- mem = &ipa->mem[IPA_MEM_STATS_QUOTA];
+ if (ipa->version >= IPA_VERSION_4_0) {
+ mem = &ipa->mem[IPA_MEM_STATS_QUOTA_MODEM];
if (mem->size) {
req.hw_stats_quota_base_addr_valid = 1;
req.hw_stats_quota_base_addr =
pl->cfg_link_an_mode = MLO_AN_FIXED;
fwnode_handle_put(dn);
- if (fwnode_property_read_string(fwnode, "managed", &managed) == 0 &&
- strcmp(managed, "in-band-status") == 0) {
+ if ((fwnode_property_read_string(fwnode, "managed", &managed) == 0 &&
+ strcmp(managed, "in-band-status") == 0) ||
+ pl->config->ovr_an_inband) {
if (pl->cfg_link_an_mode == MLO_AN_FIXED) {
phylink_err(pl,
"can't use both fixed-link and in-band-status\n");
err = pl->mac_ops->mac_finish(pl->config, pl->cur_link_an_mode,
state->interface);
if (err < 0)
- phylink_err(pl, "mac_prepare failed: %pe\n",
+ phylink_err(pl, "mac_finish failed: %pe\n",
ERR_PTR(err));
}
}
#include <linux/capability.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
+ #include <linux/percpu-refcount.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
struct bpf_local_storage_map;
struct kobject;
struct mem_cgroup;
+struct bpf_func_state;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
void *owner, u32 size);
struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
+ /* Misc helpers.*/
+ int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags);
+
/* map_meta_equal must be implemented for maps that can be
* used as an inner map. It is a runtime check to ensure
* an inner map can be inserted to an outer map.
bool (*map_meta_equal)(const struct bpf_map *meta0,
const struct bpf_map *meta1);
+
+ int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee);
+ int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn,
+ void *callback_ctx, u64 flags);
+
/* BTF name and id of struct allocated by map_alloc */
const char * const map_btf_name;
int *map_btf_id;
ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */
ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */
+ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */
+ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */
__BPF_ARG_TYPE_MAX,
};
PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */
PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */
+ PTR_TO_FUNC, /* reg points to a bpf program function */
+ PTR_TO_MAP_KEY, /* reg points to a map element key */
};
/* The information passed from prog-specific *_is_valid_access
*/
#define MAX_BPF_FUNC_ARGS 12
+/* The maximum number of arguments passed through registers
+ * a single function may have.
+ */
+#define MAX_BPF_FUNC_REG_ARGS 5
+
struct btf_func_model {
u8 ret_size;
u8 nr_args;
* fentry = a set of program to run before calling original function
* fexit = a set of program to run after original function
*/
- int arch_prepare_bpf_trampoline(void *image, void *image_end,
+ struct bpf_tramp_image;
+ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_progs *tprogs,
void *orig_call);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
+ void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
+ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
struct bpf_ksym {
unsigned long start;
BPF_TRAMP_REPLACE, /* more than MAX */
};
+ struct bpf_tramp_image {
+ void *image;
+ struct bpf_ksym ksym;
+ struct percpu_ref pcref;
+ void *ip_after_call;
+ void *ip_epilogue;
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
+ };
+
struct bpf_trampoline {
/* hlist for trampoline_table */
struct hlist_node hlist;
/* Number of attached programs. A counter per kind. */
int progs_cnt[BPF_TRAMP_MAX];
/* Executable image of trampoline */
- void *image;
+ struct bpf_tramp_image *cur_image;
u64 selector;
- struct bpf_ksym ksym;
};
struct bpf_attach_target_info {
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
+ int bpf_jit_charge_modmem(u32 pages);
+ void bpf_jit_uncharge_modmem(u32 pages);
#else
static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
struct bpf_trampoline *tr)
bool func_proto_unreliable;
bool sleepable;
bool tail_call_reachable;
- enum bpf_tramp_prog_type trampoline_prog_type;
struct hlist_node tramp_hlist;
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
const struct btf_type *attach_func_proto;
_ret; \
})
- #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \
+ #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \
({ \
struct bpf_prog_array_item *_item; \
struct bpf_prog *_prog; \
goto _out; \
_item = &_array->items[0]; \
while ((_prog = READ_ONCE(_item->prog))) { \
- bpf_cgroup_storage_set(_item->cgroup_storage); \
+ if (set_cg_storage) \
+ bpf_cgroup_storage_set(_item->cgroup_storage); \
_ret &= func(_prog, ctx); \
_item++; \
} \
})
#define BPF_PROG_RUN_ARRAY(array, ctx, func) \
- __BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+ __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true)
#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \
- __BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+ __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)
#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
struct bpf_link_info *info);
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee);
+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
/* Map specifics */
struct xdp_buff;
struct sk_buff;
+struct bpf_dtab_netdev;
+struct bpf_cpu_map_entry;
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
void __dev_flush(void);
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx);
struct bpf_prog *xdp_prog);
bool dev_map_can_have_prog(struct bpf_map *map);
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
void __cpu_map_flush(void);
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx);
int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
const union bpf_attr *kattr,
union bpf_attr __user *uattr);
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr);
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info);
struct bpf_link *bpf_link_by_id(u32 id);
const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+void bpf_task_storage_free(struct task_struct *task);
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
return -EOPNOTSUPP;
}
-static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map,
- u32 key)
-{
- return NULL;
-}
-
-static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map,
- u32 key)
-{
- return NULL;
-}
static inline bool dev_map_can_have_prog(struct bpf_map *map)
{
return false;
struct xdp_buff;
struct bpf_dtab_netdev;
+struct bpf_cpu_map_entry;
static inline
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
return 0;
}
-static inline
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
-{
- return NULL;
-}
-
static inline void __cpu_map_flush(void)
{
}
return -ENOTSUPP;
}
+static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ return -ENOTSUPP;
+}
+
static inline void bpf_map_put(struct bpf_map *map)
{
}
{
return NULL;
}
+
+static inline void bpf_task_storage_free(struct task_struct *task)
+{
+}
#endif /* CONFIG_BPF_SYSCALL */
void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
}
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
-#if defined(CONFIG_BPF_STREAM_PARSER)
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which);
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
void sock_map_unhash(struct sock *sk);
void sock_map_close(struct sock *sk, long timeout);
+
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+ void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags);
#else
-static inline int sock_map_prog_update(struct bpf_map *map,
- struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
{
- return -EOPNOTSUPP;
}
+#ifdef CONFIG_BPF_SYSCALL
static inline int sock_map_get_from_fd(const union bpf_attr *attr,
struct bpf_prog *prog)
{
{
return -EOPNOTSUPP;
}
-#endif /* CONFIG_BPF_STREAM_PARSER */
-#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
-void bpf_sk_reuseport_detach(struct sock *sk);
-int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
- void *value);
-int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags);
-#else
-static inline void bpf_sk_reuseport_detach(struct sock *sk)
-{
-}
-
-#ifdef CONFIG_BPF_SYSCALL
static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
void *key, void *value)
{
extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
extern const struct bpf_func_proto bpf_sock_from_file_proto;
extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
+extern const struct bpf_func_proto bpf_task_storage_get_proto;
+extern const struct bpf_func_proto bpf_task_storage_delete_proto;
+extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog);
NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
+ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */
};
enum {
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL),
NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
+ NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED),
};
enum gro_result {
const char *buf, size_t len);
};
+/* XPS map type and offset of the xps map within net_device->xps_maps[]. */
+enum xps_map_type {
+ XPS_CPUS = 0,
+ XPS_RXQS,
+ XPS_MAPS_MAX,
+};
+
#ifdef CONFIG_XPS
/*
* This structure holds an XPS map which can be of variable length. The
/*
* This structure holds all XPS maps for device. Maps are indexed by CPU.
+ *
+ * We keep track of the number of cpus/rxqs used when the struct is allocated,
+ * in nr_ids. This will help not accessing out-of-bound memory.
+ *
+ * We keep track of the number of traffic classes used when the struct is
+ * allocated, in num_tc. This will be used to navigate the maps, to ensure we're
+ * not crossing its upper bound, as the original dev->num_tc can be updated in
+ * the meantime.
*/
struct xps_dev_maps {
struct rcu_head rcu;
+ unsigned int nr_ids;
+ s16 num_tc;
struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
};
struct sk_buff *skb,
struct net_device *sb_dev);
+enum net_device_path_type {
+ DEV_PATH_ETHERNET = 0,
+ DEV_PATH_VLAN,
+ DEV_PATH_BRIDGE,
+ DEV_PATH_PPPOE,
+ DEV_PATH_DSA,
+};
+
+struct net_device_path {
+ enum net_device_path_type type;
+ const struct net_device *dev;
+ union {
+ struct {
+ u16 id;
+ __be16 proto;
+ u8 h_dest[ETH_ALEN];
+ } encap;
+ struct {
+ enum {
+ DEV_PATH_BR_VLAN_KEEP,
+ DEV_PATH_BR_VLAN_TAG,
+ DEV_PATH_BR_VLAN_UNTAG,
+ DEV_PATH_BR_VLAN_UNTAG_HW,
+ } vlan_mode;
+ u16 vlan_id;
+ __be16 vlan_proto;
+ } bridge;
+ struct {
+ int port;
+ u16 proto;
+ } dsa;
+ };
+};
+
+#define NET_DEVICE_PATH_STACK_MAX 5
+#define NET_DEVICE_PATH_VLAN_MAX 2
+
+struct net_device_path_stack {
+ int num_paths;
+ struct net_device_path path[NET_DEVICE_PATH_STACK_MAX];
+};
+
+struct net_device_path_ctx {
+ const struct net_device *dev;
+ const u8 *daddr;
+
+ int num_vlans;
+ struct {
+ u16 id;
+ __be16 proto;
+ } vlan[NET_DEVICE_PATH_VLAN_MAX];
+};
+
enum tc_setup_type {
TC_SETUP_QDISC_MQPRIO,
TC_SETUP_CLSU32,
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
* If a device is paired with a peer device, return the peer instance.
* The caller must be under RCU read context.
+ * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
+ * Get the forwarding path to reach the real device from the HW destination address
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
int (*ndo_tunnel_ctl)(struct net_device *dev,
struct ip_tunnel_parm *p, int cmd);
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
+ int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
+ struct net_device_path *path);
};
/**
* @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
* @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
* @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
+ * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
+ * skb_headlen(skb) == 0 (data starts from frag0)
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
IFF_FAILOVER_SLAVE = 1<<28,
IFF_L3MDEV_RX_HANDLER = 1<<29,
IFF_LIVE_RENAME_OK = 1<<30,
+ IFF_TX_SKB_NO_LINEAR = 1<<31,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
#define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE
#define IFF_TEAM IFF_TEAM
#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
+#define IFF_PHONY_HEADROOM IFF_PHONY_HEADROOM
#define IFF_MACSEC IFF_MACSEC
#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
#define IFF_FAILOVER IFF_FAILOVER
#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE
#define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER
#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK
+#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR
/* Specifies the type of the struct net_device::ml_priv pointer */
enum netdev_ml_priv_type {
* @tx_queue_len: Max frames per queue allowed
* @tx_global_lock: XXX: need comments on this one
* @xdp_bulkq: XDP device bulk queue
- * @xps_cpus_map: all CPUs map for XPS device
- * @xps_rxqs_map: all RXQs map for XPS device
+ * @xps_maps: all CPUs/RXQs maps for XPS device
*
* @xps_maps: XXX: need comments on this one
* @miniq_egress: clsact qdisc specific data for
*
* @proto_down_reason: reason a netdev interface is held down
* @pcpu_refcnt: Number of references to this device
+ * @dev_refcnt: Number of references to this device
* @todo_list: Delayed register/unregister
* @link_watch_list: XXX: need comments on this one
*
struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
#ifdef CONFIG_XPS
- struct xps_dev_maps __rcu *xps_cpus_map;
- struct xps_dev_maps __rcu *xps_rxqs_map;
+ struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
#endif
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
u32 proto_down_reason;
struct list_head todo_list;
+
+#ifdef CONFIG_PCPU_DEV_REFCNT
int __percpu *pcpu_refcnt;
+#else
+ refcount_t dev_refcnt;
+#endif
struct list_head link_watch_list;
int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack);
struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
unsigned short mask);
struct net_device *dev_get_by_name(struct net *net, const char *name);
return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN;
}
+/**
+ * netdev_queue_set_dql_min_limit - set dql minimum limit
+ * @dev_queue: pointer to transmit queue
+ * @min_limit: dql minimum limit
+ *
+ * Forces xmit_more() to return true until the minimum threshold
+ * defined by @min_limit is reached (or until the tx queue is
+ * empty). Warning: to be use with care, misuse will impact the
+ * latency.
+ */
+static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue,
+ unsigned int min_limit)
+{
+#ifdef CONFIG_BQL
+ dev_queue->dql.min_limit = min_limit;
+#endif
+}
+
/**
* netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
* @dev_queue: pointer to transmit queue
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
- u16 index, bool is_rxqs_map);
+ u16 index, enum xps_map_type type);
/**
* netif_attr_test_mask - Test a CPU or Rx queue set in a mask
static inline int __netif_set_xps_queue(struct net_device *dev,
const unsigned long *mask,
- u16 index, bool is_rxqs_map)
+ u16 index, enum xps_map_type type)
{
return 0;
}
*/
static inline void dev_put(struct net_device *dev)
{
+#ifdef CONFIG_PCPU_DEV_REFCNT
this_cpu_dec(*dev->pcpu_refcnt);
+#else
+ refcount_dec(&dev->dev_refcnt);
+#endif
}
/**
*/
static inline void dev_hold(struct net_device *dev)
{
+#ifdef CONFIG_PCPU_DEV_REFCNT
this_cpu_inc(*dev->pcpu_refcnt);
+#else
+ refcount_inc(&dev->dev_refcnt);
+#endif
}
/* Carrier loss detection, dial on demand. The functions netif_carrier_on
*
* Check if device has not been removed from system.
*/
-static inline bool netif_device_present(struct net_device *dev)
+static inline bool netif_device_present(const struct net_device *dev)
{
return test_bit(__LINK_STATE_PRESENT, &dev->state);
}
extern int netdev_max_backlog;
extern int netdev_tstamp_prequeue;
+extern int netdev_unregister_timeout_secs;
extern int weight_p;
extern int dev_weight_rx_bias;
extern int dev_weight_tx_bias;
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
+extern struct list_head ptype_all __read_mostly;
+extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+
extern struct net_device *blackhole_netdev;
#endif /* _LINUX_NETDEVICE_H */
struct tc_skb_ext {
__u32 chain;
__u16 mru;
+ bool post_ct;
};
#endif
* @protocol: Packet protocol from driver
* @destructor: Destruct function
* @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
+ * @_sk_redir: socket redirection information for skmsg
* @_nfct: Associated connection, if any (with nfctinfo bits)
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
void (*destructor)(struct sk_buff *skb);
};
struct list_head tcp_tsorted_anchor;
+#ifdef CONFIG_NET_SOCK_MSG
+ unsigned long _sk_redir;
+#endif
};
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
return skb->fclone == SKB_FCLONE_ORIG &&
refcount_read(&fclones->fclone_ref) > 1 &&
- fclones->skb2.sk == sk;
+ READ_ONCE(fclones->skb2.sk) == sk;
}
/**
void __skb_get_hash(struct sk_buff *skb);
u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
u32 skb_get_poff(const struct sk_buff *skb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
const struct flow_keys_basic *keys, int hlen);
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
- void *data, int hlen_proto);
+ const void *data, int hlen_proto);
static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
int thoff, u8 ip_proto)
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, __be16 proto, int nhoff, int hlen,
- unsigned int flags);
+ void *target_container, const void *data,
+ __be16 proto, int nhoff, int hlen, unsigned int flags);
static inline bool skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
static inline bool
skb_flow_dissect_flow_keys_basic(const struct net *net,
const struct sk_buff *skb,
- struct flow_keys_basic *flow, void *data,
- __be16 proto, int nhoff, int hlen,
- unsigned int flags)
+ struct flow_keys_basic *flow,
+ const void *data, __be16 proto,
+ int nhoff, int hlen, unsigned int flags)
{
memset(flow, 0, sizeof(*flow));
return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
__wsum csum);
static inline void * __must_check
-__skb_header_pointer(const struct sk_buff *skb, int offset,
- int len, void *data, int hlen, void *buffer)
+__skb_header_pointer(const struct sk_buff *skb, int offset, int len,
+ const void *data, int hlen, void *buffer)
{
- if (hlen - offset >= len)
- return data + offset;
+ if (likely(hlen - offset >= len))
+ return (void *)data + offset;
- if (!skb ||
- skb_copy_bits(skb, offset, buffer, len) < 0)
+ if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
return NULL;
return buffer;
struct nft_trans_table {
bool update;
- bool enable;
+ u8 state;
+ u32 flags;
};
#define nft_trans_table_update(trans) \
(((struct nft_trans_table *)trans->data)->update)
-#define nft_trans_table_enable(trans) \
- (((struct nft_trans_table *)trans->data)->enable)
+#define nft_trans_table_state(trans) \
+ (((struct nft_trans_table *)trans->data)->state)
+#define nft_trans_table_flags(trans) \
+ (((struct nft_trans_table *)trans->data)->flags)
struct nft_trans_elem {
struct nft_set *set;
struct nft_flowtable *flowtable;
bool update;
struct list_head hook_list;
+ u32 flags;
};
#define nft_trans_flowtable(trans) \
(((struct nft_trans_flowtable *)trans->data)->update)
#define nft_trans_flowtable_hooks(trans) \
(((struct nft_trans_flowtable *)trans->data)->hook_list)
+ #define nft_trans_flowtable_flags(trans) \
+ (((struct nft_trans_flowtable *)trans->data)->flags)
int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void);
struct nlattr *nh_grp;
u16 nh_grp_type;
+ u16 nh_grp_res_num_buckets;
+ unsigned long nh_grp_res_idle_timer;
+ unsigned long nh_grp_res_unbalanced_timer;
+ bool nh_grp_res_has_num_buckets;
+ bool nh_grp_res_has_idle_timer;
+ bool nh_grp_res_has_unbalanced_timer;
struct nlattr *nh_encap;
u16 nh_encap_type;
};
};
+struct nh_res_bucket {
+ struct nh_grp_entry __rcu *nh_entry;
+ atomic_long_t used_time;
+ unsigned long migrated_time;
+ bool occupied;
+ u8 nh_flags;
+};
+
+struct nh_res_table {
+ struct net *net;
+ u32 nhg_id;
+ struct delayed_work upkeep_dw;
+
+ /* List of NHGEs that have too few buckets ("uw" for underweight).
+ * Reclaimed buckets will be given to entries in this list.
+ */
+ struct list_head uw_nh_entries;
+ unsigned long unbalanced_since;
+
+ u32 idle_timer;
+ u32 unbalanced_timer;
+
+ u16 num_nh_buckets;
+ struct nh_res_bucket nh_buckets[];
+};
+
struct nh_grp_entry {
struct nexthop *nh;
u8 weight;
struct {
atomic_t upper_bound;
} mpath;
+ struct {
+ /* Member on uw_nh_entries. */
+ struct list_head uw_nh_entry;
+
+ u16 count_buckets;
+ u16 wants_buckets;
+ } res;
};
struct list_head nh_list;
struct nh_group {
struct nh_group *spare; /* spare group for removals */
u16 num_nh;
+ bool is_multipath;
bool mpath;
+ bool resilient;
bool fdb_nh;
bool has_v4;
+
+ struct nh_res_table __rcu *res_table;
struct nh_grp_entry nh_entries[];
};
enum nexthop_event_type {
NEXTHOP_EVENT_DEL,
NEXTHOP_EVENT_REPLACE,
+ NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
+ NEXTHOP_EVENT_BUCKET_REPLACE,
};
enum nh_notifier_info_type {
NH_NOTIFIER_INFO_TYPE_SINGLE,
NH_NOTIFIER_INFO_TYPE_GRP,
+ NH_NOTIFIER_INFO_TYPE_RES_TABLE,
+ NH_NOTIFIER_INFO_TYPE_RES_BUCKET,
};
struct nh_notifier_single_info {
struct nh_notifier_grp_entry_info nh_entries[];
};
+struct nh_notifier_res_bucket_info {
+ u16 bucket_index;
+ unsigned int idle_timer_ms;
+ bool force;
+ struct nh_notifier_single_info old_nh;
+ struct nh_notifier_single_info new_nh;
+};
+
+struct nh_notifier_res_table_info {
+ u16 num_nh_buckets;
+ struct nh_notifier_single_info nhs[];
+};
+
struct nh_notifier_info {
struct net *net;
struct netlink_ext_ack *extack;
union {
struct nh_notifier_single_info *nh;
struct nh_notifier_grp_info *nh_grp;
+ struct nh_notifier_res_table_info *nh_res_table;
+ struct nh_notifier_res_bucket_info *nh_res_bucket;
};
};
struct netlink_ext_ack *extack);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap);
+void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
+ bool offload, bool trap);
+void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
+ unsigned long *activity);
/* caller is holding rcu or rtnl; no reference taken to nexthop */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
struct nh_group *nh_grp;
nh_grp = rcu_dereference_rtnl(nh->nh_grp);
- return nh_grp->mpath;
+ return nh_grp->is_multipath;
}
return false;
}
struct nh_group *nh_grp;
nh_grp = rcu_dereference_rtnl(nh->nh_grp);
- if (nh_grp->mpath)
+ if (nh_grp->is_multipath)
rc = nh_grp->num_nh;
}
struct nh_group *nh_grp;
nh_grp = rcu_dereference_rtnl(nh->nh_grp);
- if (nh_grp->mpath) {
+ if (nh_grp->is_multipath) {
nh = nexthop_mpath_select(nh_grp, nhsel);
if (!nh)
return NULL;
int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
struct netlink_ext_ack *extack);
+ /* Caller should either hold rcu_read_lock(), or RTNL. */
static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
{
struct nh_info *nhi;
return NULL;
}
+ /* Variant of nexthop_fib6_nh().
+ * Caller should either hold rcu_read_lock_bh(), or RTNL.
+ */
+ static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
+ {
+ struct nh_info *nhi;
+
+ if (nh->is_group) {
+ struct nh_group *nh_grp;
+
+ nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp);
+ nh = nexthop_mpath_select(nh_grp, 0);
+ if (!nh)
+ return NULL;
+ }
+
+ nhi = rcu_dereference_bh_rtnl(nh->nh_info);
+ if (nhi->family == AF_INET6)
+ return &nhi->fib6_nh;
+
+ return NULL;
+ }
+
static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
{
struct fib6_nh *fib6_nh;
} map;
};
-/* BPF syscall commands, see bpf(2) man-page for details. */
+/* BPF syscall commands, see bpf(2) man-page for more details. */
+/**
+ * DOC: eBPF Syscall Preamble
+ *
+ * The operation to be performed by the **bpf**\ () system call is determined
+ * by the *cmd* argument. Each operation takes an accompanying argument,
+ * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see
+ * below). The size argument is the size of the union pointed to by *attr*.
+ */
+/**
+ * DOC: eBPF Syscall Commands
+ *
+ * BPF_MAP_CREATE
+ * Description
+ * Create a map and return a file descriptor that refers to the
+ * map. The close-on-exec file descriptor flag (see **fcntl**\ (2))
+ * is automatically enabled for the new file descriptor.
+ *
+ * Applying **close**\ (2) to the file descriptor returned by
+ * **BPF_MAP_CREATE** will delete the map (but see NOTES).
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_LOOKUP_ELEM
+ * Description
+ * Look up an element with a given *key* in the map referred to
+ * by the file descriptor *map_fd*.
+ *
+ * The *flags* argument may be specified as one of the
+ * following:
+ *
+ * **BPF_F_LOCK**
+ * Look up the value of a spin-locked map without
+ * returning the lock. This must be specified if the
+ * elements contain a spinlock.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_UPDATE_ELEM
+ * Description
+ * Create or update an element (key/value pair) in a specified map.
+ *
+ * The *flags* argument should be specified as one of the
+ * following:
+ *
+ * **BPF_ANY**
+ * Create a new element or update an existing element.
+ * **BPF_NOEXIST**
+ * Create a new element only if it did not exist.
+ * **BPF_EXIST**
+ * Update an existing element.
+ * **BPF_F_LOCK**
+ * Update a spin_lock-ed map element.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**,
+ * **E2BIG**, **EEXIST**, or **ENOENT**.
+ *
+ * **E2BIG**
+ * The number of elements in the map reached the
+ * *max_entries* limit specified at map creation time.
+ * **EEXIST**
+ * If *flags* specifies **BPF_NOEXIST** and the element
+ * with *key* already exists in the map.
+ * **ENOENT**
+ * If *flags* specifies **BPF_EXIST** and the element with
+ * *key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_ELEM
+ * Description
+ * Look up and delete an element by key in a specified map.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_KEY
+ * Description
+ * Look up an element by key in a specified map and return the key
+ * of the next element. Can be used to iterate over all elements
+ * in the map.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * The following cases can be used to iterate over all elements of
+ * the map:
+ *
+ * * If *key* is not found, the operation returns zero and sets
+ * the *next_key* pointer to the key of the first element.
+ * * If *key* is found, the operation returns zero and sets the
+ * *next_key* pointer to the key of the next element.
+ * * If *key* is the last element, returns -1 and *errno* is set
+ * to **ENOENT**.
+ *
+ * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or
+ * **EINVAL** on error.
+ *
+ * BPF_PROG_LOAD
+ * Description
+ * Verify and load an eBPF program, returning a new file
+ * descriptor associated with the program.
+ *
+ * Applying **close**\ (2) to the file descriptor returned by
+ * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES).
+ *
+ * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is
+ * automatically enabled for the new file descriptor.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_PIN
+ * Description
+ * Pin an eBPF program or map referred by the specified *bpf_fd*
+ * to the provided *pathname* on the filesystem.
+ *
+ * The *pathname* argument must not contain a dot (".").
+ *
+ * On success, *pathname* retains a reference to the eBPF object,
+ * preventing deallocation of the object when the original
+ * *bpf_fd* is closed. This allow the eBPF object to live beyond
+ * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent
+ * process.
+ *
+ * Applying **unlink**\ (2) or similar calls to the *pathname*
+ * unpins the object from the filesystem, removing the reference.
+ * If no other file descriptors or filesystem nodes refer to the
+ * same object, it will be deallocated (see NOTES).
+ *
+ * The filesystem type for the parent directory of *pathname* must
+ * be **BPF_FS_MAGIC**.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_OBJ_GET
+ * Description
+ * Open a file descriptor for the eBPF object pinned to the
+ * specified *pathname*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_PROG_ATTACH
+ * Description
+ * Attach an eBPF program to a *target_fd* at the specified
+ * *attach_type* hook.
+ *
+ * The *attach_type* specifies the eBPF attachment point to
+ * attach the program to, and must be one of *bpf_attach_type*
+ * (see below).
+ *
+ * The *attach_bpf_fd* must be a valid file descriptor for a
+ * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap
+ * or sock_ops type corresponding to the specified *attach_type*.
+ *
+ * The *target_fd* must be a valid file descriptor for a kernel
+ * object which depends on the attach type of *attach_bpf_fd*:
+ *
+ * **BPF_PROG_TYPE_CGROUP_DEVICE**,
+ * **BPF_PROG_TYPE_CGROUP_SKB**,
+ * **BPF_PROG_TYPE_CGROUP_SOCK**,
+ * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ * **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ * **BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ * **BPF_PROG_TYPE_SOCK_OPS**
+ *
+ * Control Group v2 hierarchy with the eBPF controller
+ * enabled. Requires the kernel to be compiled with
+ * **CONFIG_CGROUP_BPF**.
+ *
+ * **BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ * Network namespace (eg /proc/self/ns/net).
+ *
+ * **BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ * LIRC device path (eg /dev/lircN). Requires the kernel
+ * to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ * **BPF_PROG_TYPE_SK_SKB**,
+ * **BPF_PROG_TYPE_SK_MSG**
+ *
+ * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**).
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_PROG_DETACH
+ * Description
+ * Detach the eBPF program associated with the *target_fd* at the
+ * hook specified by *attach_type*. The program must have been
+ * previously attached using **BPF_PROG_ATTACH**.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_PROG_TEST_RUN
+ * Description
+ * Run the eBPF program associated with the *prog_fd* a *repeat*
+ * number of times against a provided program context *ctx_in* and
+ * data *data_in*, and return the modified program context
+ * *ctx_out*, *data_out* (for example, packet data), result of the
+ * execution *retval*, and *duration* of the test run.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * **ENOSPC**
+ * Either *data_size_out* or *ctx_size_out* is too small.
+ * **ENOTSUPP**
+ * This command is not supported by the program type of
+ * the program referred to by *prog_fd*.
+ *
+ * BPF_PROG_GET_NEXT_ID
+ * Description
+ * Fetch the next eBPF program currently loaded into the kernel.
+ *
+ * Looks for the eBPF program with an id greater than *start_id*
+ * and updates *next_id* on success. If no other eBPF programs
+ * remain with ids higher than *start_id*, returns -1 and sets
+ * *errno* to **ENOENT**.
+ *
+ * Return
+ * Returns zero on success. On error, or when no id remains, -1
+ * is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_ID
+ * Description
+ * Fetch the next eBPF map currently loaded into the kernel.
+ *
+ * Looks for the eBPF map with an id greater than *start_id*
+ * and updates *next_id* on success. If no other eBPF maps
+ * remain with ids higher than *start_id*, returns -1 and sets
+ * *errno* to **ENOENT**.
+ *
+ * Return
+ * Returns zero on success. On error, or when no id remains, -1
+ * is returned and *errno* is set appropriately.
+ *
+ * BPF_PROG_GET_FD_BY_ID
+ * Description
+ * Open a file descriptor for the eBPF program corresponding to
+ * *prog_id*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_GET_FD_BY_ID
+ * Description
+ * Open a file descriptor for the eBPF map corresponding to
+ * *map_id*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_GET_INFO_BY_FD
+ * Description
+ * Obtain information about the eBPF object corresponding to
+ * *bpf_fd*.
+ *
+ * Populates up to *info_len* bytes of *info*, which will be in
+ * one of the following formats depending on the eBPF object type
+ * of *bpf_fd*:
+ *
+ * * **struct bpf_prog_info**
+ * * **struct bpf_map_info**
+ * * **struct bpf_btf_info**
+ * * **struct bpf_link_info**
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_PROG_QUERY
+ * Description
+ * Obtain information about eBPF programs associated with the
+ * specified *attach_type* hook.
+ *
+ * The *target_fd* must be a valid file descriptor for a kernel
+ * object which depends on the attach type of *attach_bpf_fd*:
+ *
+ * **BPF_PROG_TYPE_CGROUP_DEVICE**,
+ * **BPF_PROG_TYPE_CGROUP_SKB**,
+ * **BPF_PROG_TYPE_CGROUP_SOCK**,
+ * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ * **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ * **BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ * **BPF_PROG_TYPE_SOCK_OPS**
+ *
+ * Control Group v2 hierarchy with the eBPF controller
+ * enabled. Requires the kernel to be compiled with
+ * **CONFIG_CGROUP_BPF**.
+ *
+ * **BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ * Network namespace (eg /proc/self/ns/net).
+ *
+ * **BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ * LIRC device path (eg /dev/lircN). Requires the kernel
+ * to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ * **BPF_PROG_QUERY** always fetches the number of programs
+ * attached and the *attach_flags* which were used to attach those
+ * programs. Additionally, if *prog_ids* is nonzero and the number
+ * of attached programs is less than *prog_cnt*, populates
+ * *prog_ids* with the eBPF program ids of the programs attached
+ * at *target_fd*.
+ *
+ * The following flags may alter the result:
+ *
+ * **BPF_F_QUERY_EFFECTIVE**
+ * Only return information regarding programs which are
+ * currently effective at the specified *target_fd*.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_RAW_TRACEPOINT_OPEN
+ * Description
+ * Attach an eBPF program to a tracepoint *name* to access kernel
+ * internal arguments of the tracepoint in their raw form.
+ *
+ * The *prog_fd* must be a valid file descriptor associated with
+ * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**.
+ *
+ * No ABI guarantees are made about the content of tracepoint
+ * arguments exposed to the corresponding eBPF program.
+ *
+ * Applying **close**\ (2) to the file descriptor returned by
+ * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES).
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_LOAD
+ * Description
+ * Verify and load BPF Type Format (BTF) metadata into the kernel,
+ * returning a new file descriptor associated with the metadata.
+ * BTF is described in more detail at
+ * https://www.kernel.org/doc/html/latest/bpf/btf.html.
+ *
+ * The *btf* parameter must point to valid memory providing
+ * *btf_size* bytes of BTF binary metadata.
+ *
+ * The returned file descriptor can be passed to other **bpf**\ ()
+ * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to
+ * associate the BTF with those objects.
+ *
+ * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional
+ * parameters to specify a *btf_log_buf*, *btf_log_size* and
+ * *btf_log_level* which allow the kernel to return freeform log
+ * output regarding the BTF verification process.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_GET_FD_BY_ID
+ * Description
+ * Open a file descriptor for the BPF Type Format (BTF)
+ * corresponding to *btf_id*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_TASK_FD_QUERY
+ * Description
+ * Obtain information about eBPF programs associated with the
+ * target process identified by *pid* and *fd*.
+ *
+ * If the *pid* and *fd* are associated with a tracepoint, kprobe
+ * or uprobe perf event, then the *prog_id* and *fd_type* will
+ * be populated with the eBPF program id and file descriptor type
+ * of type **bpf_task_fd_type**. If associated with a kprobe or
+ * uprobe, the *probe_offset* and *probe_addr* will also be
+ * populated. Optionally, if *buf* is provided, then up to
+ * *buf_len* bytes of *buf* will be populated with the name of
+ * the tracepoint, kprobe or uprobe.
+ *
+ * The resulting *prog_id* may be introspected in deeper detail
+ * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_ELEM
+ * Description
+ * Look up an element with the given *key* in the map referred to
+ * by the file descriptor *fd*, and if found, delete the element.
+ *
+ * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
+ * implement this command as a "pop" operation, deleting the top
+ * element rather than one corresponding to *key*.
+ * The *key* and *key_len* parameters should be zeroed when
+ * issuing this operation for these map types.
+ *
+ * This command is only valid for the following map types:
+ * * **BPF_MAP_TYPE_QUEUE**
+ * * **BPF_MAP_TYPE_STACK**
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_FREEZE
+ * Description
+ * Freeze the permissions of the specified map.
+ *
+ * Write permissions may be frozen by passing zero *flags*.
+ * Upon success, no future syscall invocations may alter the
+ * map state of *map_fd*. Write operations from eBPF programs
+ * are still possible for a frozen map.
+ *
+ * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_BTF_GET_NEXT_ID
+ * Description
+ * Fetch the next BPF Type Format (BTF) object currently loaded
+ * into the kernel.
+ *
+ * Looks for the BTF object with an id greater than *start_id*
+ * and updates *next_id* on success. If no other BTF objects
+ * remain with ids higher than *start_id*, returns -1 and sets
+ * *errno* to **ENOENT**.
+ *
+ * Return
+ * Returns zero on success. On error, or when no id remains, -1
+ * is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_BATCH
+ * Description
+ * Iterate and fetch multiple elements in a map.
+ *
+ * Two opaque values are used to manage batch operations,
+ * *in_batch* and *out_batch*. Initially, *in_batch* must be set
+ * to NULL to begin the batched operation. After each subsequent
+ * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
+ * *out_batch* as the *in_batch* for the next operation to
+ * continue iteration from the current point.
+ *
+ * The *keys* and *values* are output parameters which must point
+ * to memory large enough to hold *count* items based on the key
+ * and value size of the map *map_fd*. The *keys* buffer must be
+ * of *key_size* * *count*. The *values* buffer must be of
+ * *value_size* * *count*.
+ *
+ * The *elem_flags* argument may be specified as one of the
+ * following:
+ *
+ * **BPF_F_LOCK**
+ * Look up the value of a spin-locked map without
+ * returning the lock. This must be specified if the
+ * elements contain a spinlock.
+ *
+ * On success, *count* elements from the map are copied into the
+ * user buffer, with the keys copied into *keys* and the values
+ * copied into the corresponding indices in *values*.
+ *
+ * If an error is returned and *errno* is not **EFAULT**, *count*
+ * is set to the number of successfully processed elements.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * May set *errno* to **ENOSPC** to indicate that *keys* or
+ * *values* is too small to dump an entire bucket during
+ * iteration of a hash-based map type.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_BATCH
+ * Description
+ * Iterate and delete all elements in a map.
+ *
+ * This operation has the same behavior as
+ * **BPF_MAP_LOOKUP_BATCH** with two exceptions:
+ *
+ * * Every element that is successfully returned is also deleted
+ * from the map. This is at least *count* elements. Note that
+ * *count* is both an input and an output parameter.
+ * * Upon returning with *errno* set to **EFAULT**, up to
+ * *count* elements may be deleted without returning the keys
+ * and values of the deleted elements.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_UPDATE_BATCH
+ * Description
+ * Update multiple elements in a map by *key*.
+ *
+ * The *keys* and *values* are input parameters which must point
+ * to memory large enough to hold *count* items based on the key
+ * and value size of the map *map_fd*. The *keys* buffer must be
+ * of *key_size* * *count*. The *values* buffer must be of
+ * *value_size* * *count*.
+ *
+ * Each element specified in *keys* is sequentially updated to the
+ * value in the corresponding index in *values*. The *in_batch*
+ * and *out_batch* parameters are ignored and should be zeroed.
+ *
+ * The *elem_flags* argument should be specified as one of the
+ * following:
+ *
+ * **BPF_ANY**
+ * Create new elements or update a existing elements.
+ * **BPF_NOEXIST**
+ * Create new elements only if they do not exist.
+ * **BPF_EXIST**
+ * Update existing elements.
+ * **BPF_F_LOCK**
+ * Update spin_lock-ed map elements. This must be
+ * specified if the map value contains a spinlock.
+ *
+ * On success, *count* elements from the map are updated.
+ *
+ * If an error is returned and *errno* is not **EFAULT**, *count*
+ * is set to the number of successfully processed elements.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or
+ * **E2BIG**. **E2BIG** indicates that the number of elements in
+ * the map reached the *max_entries* limit specified at map
+ * creation time.
+ *
+ * May set *errno* to one of the following error codes under
+ * specific circumstances:
+ *
+ * **EEXIST**
+ * If *flags* specifies **BPF_NOEXIST** and the element
+ * with *key* already exists in the map.
+ * **ENOENT**
+ * If *flags* specifies **BPF_EXIST** and the element with
+ * *key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_BATCH
+ * Description
+ * Delete multiple elements in a map by *key*.
+ *
+ * The *keys* parameter is an input parameter which must point
+ * to memory large enough to hold *count* items based on the key
+ * size of the map *map_fd*, that is, *key_size* * *count*.
+ *
+ * Each element specified in *keys* is sequentially deleted. The
+ * *in_batch*, *out_batch*, and *values* parameters are ignored
+ * and should be zeroed.
+ *
+ * The *elem_flags* argument may be specified as one of the
+ * following:
+ *
+ * **BPF_F_LOCK**
+ * Look up the value of a spin-locked map without
+ * returning the lock. This must be specified if the
+ * elements contain a spinlock.
+ *
+ * On success, *count* elements from the map are updated.
+ *
+ * If an error is returned and *errno* is not **EFAULT**, *count*
+ * is set to the number of successfully processed elements. If
+ * *errno* is **EFAULT**, up to *count* elements may be been
+ * deleted.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_LINK_CREATE
+ * Description
+ * Attach an eBPF program to a *target_fd* at the specified
+ * *attach_type* hook and return a file descriptor handle for
+ * managing the link.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_UPDATE
+ * Description
+ * Update the eBPF program in the specified *link_fd* to
+ * *new_prog_fd*.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_LINK_GET_FD_BY_ID
+ * Description
+ * Open a file descriptor for the eBPF Link corresponding to
+ * *link_id*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_GET_NEXT_ID
+ * Description
+ * Fetch the next eBPF link currently loaded into the kernel.
+ *
+ * Looks for the eBPF link with an id greater than *start_id*
+ * and updates *next_id* on success. If no other eBPF links
+ * remain with ids higher than *start_id*, returns -1 and sets
+ * *errno* to **ENOENT**.
+ *
+ * Return
+ * Returns zero on success. On error, or when no id remains, -1
+ * is returned and *errno* is set appropriately.
+ *
+ * BPF_ENABLE_STATS
+ * Description
+ * Enable eBPF runtime statistics gathering.
+ *
+ * Runtime statistics gathering for the eBPF runtime is disabled
+ * by default to minimize the corresponding performance overhead.
+ * This command enables statistics globally.
+ *
+ * Multiple programs may independently enable statistics.
+ * After gathering the desired statistics, eBPF runtime statistics
+ * may be disabled again by calling **close**\ (2) for the file
+ * descriptor returned by this function. Statistics will only be
+ * disabled system-wide when all outstanding file descriptors
+ * returned by prior calls for this subcommand are closed.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_ITER_CREATE
+ * Description
+ * Create an iterator on top of the specified *link_fd* (as
+ * previously created using **BPF_LINK_CREATE**) and return a
+ * file descriptor that can be used to trigger the iteration.
+ *
+ * If the resulting file descriptor is pinned to the filesystem
+ * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls
+ * for that path will trigger the iterator to read kernel state
+ * using the eBPF program attached to *link_fd*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_DETACH
+ * Description
+ * Forcefully detach the specified *link_fd* from its
+ * corresponding attachment point.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_PROG_BIND_MAP
+ * Description
+ * Bind a map to the lifetime of an eBPF program.
+ *
+ * The map identified by *map_fd* is bound to the program
+ * identified by *prog_fd* and only released when *prog_fd* is
+ * released. This may be used in cases where metadata should be
+ * associated with a program which otherwise does not contain any
+ * references to the map (for example, embedded in the eBPF
+ * program instructions).
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * NOTES
+ * eBPF objects (maps and programs) can be shared between processes.
+ *
+ * * After **fork**\ (2), the child inherits file descriptors
+ * referring to the same eBPF objects.
+ * * File descriptors referring to eBPF objects can be transferred over
+ * **unix**\ (7) domain sockets.
+ * * File descriptors referring to eBPF objects can be duplicated in the
+ * usual way, using **dup**\ (2) and similar calls.
+ * * File descriptors referring to eBPF objects can be pinned to the
+ * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2).
+ *
+ * An eBPF object is deallocated only after all file descriptors referring
+ * to the object have been closed and no references remain pinned to the
+ * filesystem or attached (for example, bound to a program or device).
+ */
enum bpf_cmd {
BPF_MAP_CREATE,
BPF_MAP_LOOKUP_ELEM,
* is struct/union.
*/
#define BPF_PSEUDO_BTF_ID 3
+/* insn[0].src_reg: BPF_PSEUDO_FUNC
+ * insn[0].imm: insn offset to the func
+ * insn[1].imm: 0
+ * insn[0].off: 0
+ * insn[1].off: 0
+ * ldimm64 rewrite: address of the function
+ * verifier type: PTR_TO_FUNC.
+ */
+#define BPF_PSEUDO_FUNC 4
/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
* offset to another bpf function
* parsed and used to produce a manual page. The workflow is the following,
* and requires the rst2man utility:
*
- * $ ./scripts/bpf_helpers_doc.py \
+ * $ ./scripts/bpf_doc.py \
* --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
* $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
* $ man /tmp/bpf-helpers.7
* Use with ENCAP_L3/L4 flags to further specify the tunnel
* type; *len* is the length of the inner MAC header.
*
+ * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ * L2 type as Ethernet.
+ *
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
*
* long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
* Description
- * Check ctx packet size against exceeding MTU of net device (based
+ * Check packet size against exceeding MTU of net device (based
* on *ifindex*). This helper will likely be used in combination
* with helpers that adjust/change the packet size.
*
* against the current net device. This is practical if this isn't
* used prior to redirect.
*
+ * On input *mtu_len* must be a valid pointer, else verifier will
+ * reject BPF program. If the value *mtu_len* is initialized to
+ * zero then the ctx packet size is use. When value *mtu_len* is
+ * provided as input this specify the L3 length that the MTU check
+ * is done against. Remember XDP and TC length operate at L2, but
+ * this value is L3 as this correlate to MTU and IP-header tot_len
+ * values which are L3 (similar behavior as bpf_fib_lookup).
+ *
* The Linux kernel route table can configure MTUs on a more
* specific per route level, which is not provided by this helper.
* For route level MTU checks use the **bpf_fib_lookup**\ ()
*
* On return *mtu_len* pointer contains the MTU value of the net
* device. Remember the net device configured MTU is the L3 size,
- * which is returned here and XDP and TX length operate at L2.
+ * which is returned here and XDP and TC length operate at L2.
* Helper take this into account for you, but remember when using
- * MTU value in your BPF-code. On input *mtu_len* must be a valid
- * pointer and be initialized (to zero), else verifier will reject
- * BPF program.
+ * MTU value in your BPF-code.
*
* Return
* * 0 on success, and populate MTU value in *mtu_len* pointer.
* * **BPF_MTU_CHK_RET_FRAG_NEEDED**
* * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
*
+ * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
+ * Description
+ * For each element in **map**, call **callback_fn** function with
+ * **map**, **callback_ctx** and other map-specific parameters.
+ * The **callback_fn** should be a static function and
+ * the **callback_ctx** should be a pointer to the stack.
+ * The **flags** is used to control certain aspects of the helper.
+ * Currently, the **flags** must be 0.
+ *
+ * The following are a list of supported map types and their
+ * respective expected callback signatures:
+ *
+ * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
+ * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
+ *
+ * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
+ *
+ * For per_cpu maps, the map_value is the value on the cpu where the
+ * bpf_prog is running.
+ *
+ * If **callback_fn** return 0, the helper will continue to the next
+ * element. If return value is 1, the helper will skip the rest of
+ * elements and return. Other return values are not used now.
+ *
+ * Return
+ * The number of traversed map elements for success, **-EINVAL** for
+ * invalid **flags**.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
FN(ima_inode_hash), \
FN(sock_from_file), \
FN(check_mtu), \
+ FN(for_each_map_elem), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3),
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
};
enum {
/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
struct bpf_sk_lookup {
- __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+ union {
+ __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+ __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
+ };
__u32 family; /* Protocol family (AF_INET, AF_INET6) */
__u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
#define __UAPI_PSAMPLE_H
enum {
- /* sampled packet metadata */
PSAMPLE_ATTR_IIFINDEX,
PSAMPLE_ATTR_OIFINDEX,
PSAMPLE_ATTR_ORIGSIZE,
PSAMPLE_ATTR_GROUP_SEQ,
PSAMPLE_ATTR_SAMPLE_RATE,
PSAMPLE_ATTR_DATA,
- PSAMPLE_ATTR_TUNNEL,
-
- /* commands attributes */
PSAMPLE_ATTR_GROUP_REFCOUNT,
+ PSAMPLE_ATTR_TUNNEL,
+ PSAMPLE_ATTR_PAD,
+ PSAMPLE_ATTR_OUT_TC, /* u16 */
+ PSAMPLE_ATTR_OUT_TC_OCC, /* u64, bytes */
+ PSAMPLE_ATTR_LATENCY, /* u64, nanoseconds */
+ PSAMPLE_ATTR_TIMESTAMP, /* u64, nanoseconds */
+ PSAMPLE_ATTR_PROTO, /* u16 */
+
__PSAMPLE_ATTR_MAX
};
When the compiler is updated, Kconfig will be invoked.
- Ensure full rebuild when the compiler is updated
- include/linux/kconfig.h contains this option in the comment line so
- fixdep adds include/config/cc/version/text.h into the auto-generated
- dependency. When the compiler is updated, syncconfig will touch it
- and then every file will be rebuilt.
+ include/linux/compiler-version.h contains this option in the comment
+ line so fixdep adds include/config/cc/version/text.h into the
+ auto-generated dependency. When the compiler is updated, syncconfig
+ will touch it and then every file will be rebuilt.
config CC_IS_GCC
def_bool $(success,test "$(cc-name)" = GCC)
config COMPILE_TEST
bool "Compile also drivers which will not load"
- depends on !UML && !S390
- default n
+ depends on HAS_IOMEM
help
Some drivers can be compiled on a different platform than they are
intended to be run on. Despite they cannot be loaded there (or even
select BPF
select IRQ_WORK
select TASKS_TRACE_RCU
+ select NET_SOCK_MSG if INET
default n
help
Enable the bpf() system call that allows to manipulate eBPF
fd = *(int *)key;
f = fget_raw(fd);
if (!f)
- return NULL;
+ return ERR_PTR(-EBADF);
sdata = inode_storage_lookup(f->f_inode, map, true);
fput(f);
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap);
+ bpf_local_storage_map_free(smap, NULL);
}
static int inode_storage_map_btf_id;
insn->src_reg == BPF_PSEUDO_CALL;
}
+static bool bpf_pseudo_func(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
u32 btf_id;
struct btf *ret_btf;
u32 ret_btf_id;
+ u32 subprogno;
};
struct btf *btf_vmlinux;
env->prev_linfo = linfo;
}
+static void verbose_invalid_scalar(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ struct tnum *range, const char *ctx,
+ const char *reg_name)
+{
+ char tn_buf[48];
+
+ verbose(env, "At %s the register %s ", ctx, reg_name);
+ if (!tnum_is_unknown(reg->var_off)) {
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "has value %s", tn_buf);
+ } else {
+ verbose(env, "has unknown scalar value");
+ }
+ tnum_strn(tn_buf, sizeof(tn_buf), *range);
+ verbose(env, " should have been in %s\n", tn_buf);
+}
+
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
return type == PTR_TO_PACKET ||
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON;
}
type == ARG_PTR_TO_MEM_OR_NULL ||
type == ARG_PTR_TO_CTX_OR_NULL ||
type == ARG_PTR_TO_SOCKET_OR_NULL ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
+ type == ARG_PTR_TO_STACK_OR_NULL;
}
/* Determine whether the function releases some resources allocated by another
[PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
[PTR_TO_RDWR_BUF] = "rdwr_buf",
[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
};
static char slot_type_char[] = {
if (type_is_pkt_pointer(t))
verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_KEY ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
verbose(env, ",ks=%d,vs=%d",
}
ret = find_subprog(env, off);
if (ret >= 0)
- return 0;
+ return ret;
if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
verbose(env, "too many subprograms\n");
return -E2BIG;
env->subprog_info[env->subprog_cnt++].start = off;
sort(env->subprog_info, env->subprog_cnt,
sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
- return 0;
+ return env->subprog_cnt - 1;
}
static int check_subprogs(struct bpf_verifier_env *env)
/* determine subprog starts. The end is one before the next starts */
for (i = 0; i < insn_cnt; i++) {
+ if (bpf_pseudo_func(insn + i)) {
+ if (!env->bpf_capable) {
+ verbose(env,
+ "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+ return -EPERM;
+ }
+ ret = add_subprog(env, i + insn[i].imm + 1);
+ if (ret < 0)
+ return ret;
+ /* remember subprog */
+ insn[i + 1].imm = ret;
+ continue;
+ }
if (!bpf_pseudo_call(insn + i))
continue;
if (!env->bpf_capable) {
case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
case PTR_TO_MEM_OR_NULL:
+ case PTR_TO_FUNC:
+ case PTR_TO_MAP_KEY:
return true;
default:
return false;
reg = &cur_regs(env)[regno];
switch (reg->type) {
+ case PTR_TO_MAP_KEY:
+ verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
case PTR_TO_MAP_VALUE:
verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
mem_size, off, size);
case PTR_TO_FLOW_KEYS:
pointer_desc = "flow keys ";
break;
+ case PTR_TO_MAP_KEY:
+ pointer_desc = "key ";
+ break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- if (!bpf_pseudo_call(insn + i))
+ if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
ret_insn[frame] = i + 1;
/* for access checks, reg->off is just part of off */
off += reg->off;
- if (reg->type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_KEY) {
+ if (t == BPF_WRITE) {
+ verbose(env, "write to change key R%d not allowed\n", regno);
+ return -EACCES;
+ }
+
+ err = check_mem_region_access(env, regno, off, size,
+ reg->map_ptr->key_size, false);
+ if (err)
+ return err;
+ if (value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into map\n", value_regno);
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
+ case PTR_TO_MAP_KEY:
+ return check_mem_region_access(env, regno, reg->off, access_size,
+ reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
if (check_map_access_type(env, regno, reg->off, access_size,
meta && meta->raw_mode ? BPF_WRITE :
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
},
};
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
PTR_TO_RDONLY_BUF,
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
},
};
static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
+ [ARG_PTR_TO_FUNC] = &func_ptr_types,
+ [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
verbose(env, "verifier internal error\n");
return -EFAULT;
}
+ } else if (arg_type == ARG_PTR_TO_FUNC) {
+ meta->subprogno = reg->subprogno;
} else if (arg_type_is_mem_ptr(arg_type)) {
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
}
}
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx)
+typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx);
+
+static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
{
struct bpf_verifier_state *state = env->cur_state;
struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
- int i, err, subprog, target_insn;
+ int err;
bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
return -E2BIG;
}
- target_insn = *insn_idx + insn->imm;
- subprog = find_subprog(env, target_insn + 1);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n",
- target_insn + 1);
- return -EFAULT;
- }
-
caller = state->frame[state->curframe];
if (state->frame[state->curframe + 1]) {
verbose(env, "verifier bug. Frame %d already allocated\n",
if (err)
return err;
- /* copy r1 - r5 args that callee can access. The copy includes parent
- * pointers, which connects us up to the liveness chain
- */
- for (i = BPF_REG_1; i <= BPF_REG_5; i++)
- callee->regs[i] = caller->regs[i];
+ err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ if (err)
+ return err;
clear_caller_saved_regs(env, caller->regs);
state->curframe++;
/* and go analyze first insn of the callee */
- *insn_idx = target_insn;
+ *insn_idx = env->subprog_info[subprog].start - 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
return 0;
}
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee)
+{
+ /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
+ * void *callback_ctx, u64 flags);
+ * callback_fn(struct bpf_map *map, void *key, void *value,
+ * void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ return 0;
+}
+
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx)
+{
+ int i;
+
+ /* copy r1 - r5 args that callee can access. The copy includes parent
+ * pointers, which connects us up to the liveness chain
+ */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ callee->regs[i] = caller->regs[i];
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ int subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (subprog < 0) {
+ verbose(env, "verifier bug. No program starts at insn %d\n",
+ target_insn);
+ return -EFAULT;
+ }
+
+ return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
+}
+
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map;
+ int err;
+
+ if (bpf_map_ptr_poisoned(insn_aux)) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+
+ map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ if (!map->ops->map_set_for_each_callback_args ||
+ !map->ops->map_for_each_callback) {
+ verbose(env, "callback function not allowed for map\n");
+ return -ENOTSUPP;
+ }
+
+ err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+ if (err)
+ return err;
+
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
state->curframe--;
caller = state->frame[state->curframe];
- /* return to the caller whatever r0 had in the callee */
- caller->regs[BPF_REG_0] = *r0;
+ if (callee->in_callback_fn) {
+ /* enforce R0 return value range [0, 1]. */
+ struct tnum range = tnum_range(0, 1);
+
+ if (r0->type != SCALAR_VALUE) {
+ verbose(env, "R0 not a scalar value\n");
+ return -EACCES;
+ }
+ if (!tnum_in(range, r0->var_off)) {
+ verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+ return -EINVAL;
+ }
+ } else {
+ /* return to the caller whatever r0 had in the callee */
+ caller->regs[BPF_REG_0] = *r0;
+ }
/* Transfer references to the caller */
err = transfer_reference_state(caller, callee);
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_map_push_elem &&
func_id != BPF_FUNC_map_pop_elem &&
- func_id != BPF_FUNC_map_peek_elem)
+ func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_for_each_map_elem &&
+ func_id != BPF_FUNC_redirect_map)
return 0;
if (map == NULL) {
return state->acquired_refs ? -EINVAL : 0;
}
-static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
{
const struct bpf_func_proto *fn = NULL;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
+ int insn_idx = *insn_idx_p;
bool changes_data;
- int i, err;
+ int i, err, func_id;
/* find function prototype */
+ func_id = insn->imm;
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
func_id);
meta.func_id = func_id;
/* check args */
- for (i = 0; i < 5; i++) {
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
err = check_func_arg(env, i, &meta, fn);
if (err)
return err;
return -EINVAL;
}
+ if (func_id == BPF_FUNC_for_each_map_elem) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_map_elem_callback_state);
+ if (err < 0)
+ return -EINVAL;
+ }
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
{
bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
(opcode == BPF_SUB && !off_is_neg);
- u32 off;
+ u32 off, max;
switch (ptr_reg->type) {
case PTR_TO_STACK:
+ /* Offset 0 is out-of-bounds, but acceptable start for the
+ * left direction, see BPF_REG_FP.
+ */
+ max = MAX_BPF_STACK + mask_to_left;
/* Indirect variable offset stack access is prohibited in
* unprivileged mode so it's not handled here.
*/
if (mask_to_left)
*ptr_limit = MAX_BPF_STACK + off;
else
- *ptr_limit = -off;
- return 0;
+ *ptr_limit = -off - 1;
+ return *ptr_limit >= max ? -ERANGE : 0;
+ case PTR_TO_MAP_KEY:
+ /* Currently, this code is not exercised as the only use
+ * is bpf_for_each_map_elem() helper which requires
+ * bpf_capble. The code has been tested manually for
+ * future use.
+ */
+ if (mask_to_left) {
+ *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+ } else {
+ off = ptr_reg->smin_value + ptr_reg->off;
+ *ptr_limit = ptr_reg->map_ptr->key_size - off;
+ }
+ return 0;
case PTR_TO_MAP_VALUE:
+ max = ptr_reg->map_ptr->value_size;
if (mask_to_left) {
*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
} else {
off = ptr_reg->smin_value + ptr_reg->off;
- *ptr_limit = ptr_reg->map_ptr->value_size - off;
+ *ptr_limit = ptr_reg->map_ptr->value_size - off - 1;
}
- return 0;
+ return *ptr_limit >= max ? -ERANGE : 0;
default:
return -EINVAL;
}
aux->alu_limit != alu_limit))
return -EACCES;
- /* Corresponding fixup done in fixup_bpf_calls(). */
+ /* Corresponding fixup done in do_misc_fixups(). */
aux->alu_state = alu_state;
aux->alu_limit = alu_limit;
return 0;
u32 alu_state, alu_limit;
struct bpf_reg_state tmp;
bool ret;
+ int err;
if (can_skip_alu_sanitation(env, insn))
return 0;
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
- if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
- return 0;
- if (update_alu_sanitation_state(aux, alu_state, alu_limit))
- return -EACCES;
+ err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg);
+ if (err < 0)
+ return err;
+
+ err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+ if (err < 0)
+ return err;
do_sim:
/* Simulate and find potential out-of-bounds access under
* speculative execution from truncation as a result of
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
+ case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
case BPF_ADD:
ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
if (ret < 0) {
- verbose(env, "R%d tried to add from different maps or paths\n", dst);
+ verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst);
return ret;
}
/* We can take a fixed offset as long as it doesn't overflow
case BPF_SUB:
ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
if (ret < 0) {
- verbose(env, "R%d tried to sub from different maps or paths\n", dst);
+ verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst);
return ret;
}
if (dst_reg == off_reg) {
return 0;
}
+ if (insn->src_reg == BPF_PSEUDO_FUNC) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ u32 subprogno = insn[1].imm;
+
+ if (!aux->func_info) {
+ verbose(env, "missing btf func_info\n");
+ return -EINVAL;
+ }
+ if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+ verbose(env, "callback function not static\n");
+ return -EINVAL;
+ }
+
+ dst_reg->type = PTR_TO_FUNC;
+ dst_reg->subprogno = subprogno;
+ return 0;
+ }
+
map = env->used_maps[aux->map_index];
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->map_ptr = map;
}
if (!tnum_in(range, reg->var_off)) {
- char tn_buf[48];
-
- verbose(env, "At program exit the register R0 ");
- if (!tnum_is_unknown(reg->var_off)) {
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "has value %s", tn_buf);
- } else {
- verbose(env, "has unknown scalar value");
- }
- tnum_strn(tn_buf, sizeof(tn_buf), range);
- verbose(env, " should have been in %s\n", tn_buf);
+ verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
return -EINVAL;
}
return DONE_EXPLORING;
}
+static int visit_func_call_insn(int t, int insn_cnt,
+ struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+{
+ int ret;
+
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+ if (ret)
+ return ret;
+
+ if (t + 1 < insn_cnt)
+ init_explored_state(env, t + 1);
+ if (visit_callee) {
+ init_explored_state(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
+ env, false);
+ }
+ return ret;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
struct bpf_insn *insns = env->prog->insnsi;
int ret;
+ if (bpf_pseudo_func(insns + t))
+ return visit_func_call_insn(t, insn_cnt, insns, env, true);
+
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
BPF_CLASS(insns[t].code) != BPF_JMP32)
return DONE_EXPLORING;
case BPF_CALL:
- ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
- if (ret)
- return ret;
-
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
- if (insns[t].src_reg == BPF_PSEUDO_CALL) {
- init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
- env, false);
- }
- return ret;
+ return visit_func_call_insn(t, insn_cnt, insns, env,
+ insns[t].src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
if (BPF_SRC(insns[t].code) != BPF_K)
btf = btf_get_by_fd(attr->prog_btf_fd);
if (IS_ERR(btf))
return PTR_ERR(btf);
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ return -EACCES;
+ }
env->prog->aux->btf = btf;
err = check_btf_func(env, attr, uattr);
*/
return false;
}
+ case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else
- err = check_helper_call(env, insn->imm, env->insn_idx);
+ err = check_helper_call(env, insn, &env->insn_idx);
if (err)
return err;
-
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
goto next_insn;
}
+ if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+ aux = &env->insn_aux_data[i];
+ aux->ptr_type = PTR_TO_FUNC;
+ goto next_insn;
+ }
+
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
int insn_cnt = env->prog->len;
int i;
- for (i = 0; i < insn_cnt; i++, insn++)
- if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
- insn->src_reg = 0;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+ continue;
+ if (insn->src_reg == BPF_PSEUDO_FUNC)
+ continue;
+ insn->src_reg = 0;
+ }
}
/* single env->prog->insni[off] instruction was replaced with the range
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ env->insn_aux_data[i].call_imm = insn->imm;
+ /* subprog is encoded in insn[1].imm */
+ continue;
+ }
+
if (!bpf_pseudo_call(insn))
continue;
/* Upon error here we cannot fall back to interpreter but
for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ subprog = insn[1].imm;
+ insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+ insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+ continue;
+ }
if (!bpf_pseudo_call(insn))
continue;
subprog = insn->off;
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ insn[0].imm = env->insn_aux_data[i].call_imm;
+ insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+ continue;
+ }
if (!bpf_pseudo_call(insn))
continue;
insn->off = env->insn_aux_data[i].call_imm;
return -EINVAL;
}
for (i = 0; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* When JIT fails the progs with callback calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "callbacks are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+
if (!bpf_pseudo_call(insn))
continue;
depth = get_callee_stack_depth(env, insn, i);
return err;
}
-/* fixup insn->imm field of bpf_call instructions
- * and inline eligible helpers as explicit sequence of BPF instructions
- *
- * this function is called after eBPF program passed verification
+/* Do various post-verification rewrites in a single program pass.
+ * These rewrites simplify JIT and interpreter implementations.
*/
-static int fixup_bpf_calls(struct bpf_verifier_env *env)
+static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
int i, ret, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ /* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
continue;
}
+ /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
if (BPF_CLASS(insn->code) == BPF_LD &&
(BPF_MODE(insn->code) == BPF_ABS ||
BPF_MODE(insn->code) == BPF_IND)) {
continue;
}
+ /* Rewrite pointer arithmetic to mitigate speculation attacks. */
if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
off_reg = issrc ? insn->src_reg : insn->dst_reg;
if (isneg)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
insn->imm == BPF_FUNC_map_delete_elem ||
insn->imm == BPF_FUNC_map_push_elem ||
insn->imm == BPF_FUNC_map_pop_elem ||
- insn->imm == BPF_FUNC_map_peek_elem)) {
+ insn->imm == BPF_FUNC_map_peek_elem ||
+ insn->imm == BPF_FUNC_redirect_map)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_redirect,
+ (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+
patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
__bpf_call_base;
continue;
+ case BPF_FUNC_redirect_map:
+ insn->imm = BPF_CAST_CALL(ops->map_redirect) -
+ __bpf_call_base;
+ continue;
}
goto patch_call_imm;
}
+ /* Implement bpf_jiffies64 inline. */
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_jiffies64) {
struct bpf_insn ld_jiffies_addr[2] = {
ret = convert_ctx_accesses(env);
if (ret == 0)
- ret = fixup_bpf_calls(env);
+ ret = do_misc_fixups(env);
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
+#include <linux/bpf.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
cgroup_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
+ bpf_task_storage_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
#endif
}
+ static void mm_init_pasid(struct mm_struct *mm)
+ {
+ #ifdef CONFIG_IOMMU_SUPPORT
+ mm->pasid = INIT_PASID;
+ #endif
+ }
+
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mm_init_pasid(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
+#ifdef CONFIG_BPF_SYSCALL
+ RCU_INIT_POINTER(p->bpf_storage, NULL);
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
+{
+ int k = stack->num_paths++;
+
+ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
+ return NULL;
+
+ return &stack->path[k];
+}
+
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack)
+{
+ const struct net_device *last_dev;
+ struct net_device_path_ctx ctx = {
+ .dev = dev,
+ .daddr = daddr,
+ };
+ struct net_device_path *path;
+ int ret = 0;
+
+ stack->num_paths = 0;
+ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
+ last_dev = ctx.dev;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+
+ memset(path, 0, sizeof(struct net_device_path));
+ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
+ if (ret < 0)
+ return -1;
+
+ if (WARN_ON_ONCE(last_dev == ctx.dev))
+ return -1;
+ }
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+ path->type = DEV_PATH_ETHERNET;
+ path->dev = ctx.dev;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
return -ENOMEM;
for_each_netdev(net, d) {
+ struct netdev_name_node *name_node;
+ list_for_each_entry(name_node, &d->name_node->list, list) {
+ if (!sscanf(name_node->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
+
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
+ set_bit(i, inuse);
+ }
if (!sscanf(d->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
EXPORT_SYMBOL(netdev_txq_to_tc);
#ifdef CONFIG_XPS
-struct static_key xps_needed __read_mostly;
-EXPORT_SYMBOL(xps_needed);
-struct static_key xps_rxqs_needed __read_mostly;
-EXPORT_SYMBOL(xps_rxqs_needed);
+static struct static_key xps_needed __read_mostly;
+static struct static_key xps_rxqs_needed __read_mostly;
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
- int tci, u16 index)
+ struct xps_dev_maps *old_maps, int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
break;
}
+ if (old_maps)
+ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
struct xps_dev_maps *dev_maps,
int cpu, u16 offset, u16 count)
{
- int num_tc = dev->num_tc ? : 1;
+ int num_tc = dev_maps->num_tc;
bool active = false;
int tci;
int i, j;
for (i = count, j = offset; i--; j++) {
- if (!remove_xps_queue(dev_maps, tci, j))
+ if (!remove_xps_queue(dev_maps, NULL, tci, j))
break;
}
static void reset_xps_maps(struct net_device *dev,
struct xps_dev_maps *dev_maps,
- bool is_rxqs_map)
+ enum xps_map_type type)
{
- if (is_rxqs_map) {
- static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
- RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
- } else {
- RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
- }
static_key_slow_dec_cpuslocked(&xps_needed);
+ if (type == XPS_RXQS)
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+
+ RCU_INIT_POINTER(dev->xps_maps[type], NULL);
+
kfree_rcu(dev_maps, rcu);
}
-static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
- struct xps_dev_maps *dev_maps, unsigned int nr_ids,
- u16 offset, u16 count, bool is_rxqs_map)
+static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
+ u16 offset, u16 count)
{
+ struct xps_dev_maps *dev_maps;
bool active = false;
int i, j;
- for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
- j < nr_ids;)
- active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
- count);
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (!dev_maps)
+ return;
+
+ for (j = 0; j < dev_maps->nr_ids; j++)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
if (!active)
- reset_xps_maps(dev, dev_maps, is_rxqs_map);
+ reset_xps_maps(dev, dev_maps, type);
- if (!is_rxqs_map) {
- for (i = offset + (count - 1); count--; i--) {
+ if (type == XPS_CPUS) {
+ for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(
- netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
- }
+ netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
}
}
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
- const unsigned long *possible_mask = NULL;
- struct xps_dev_maps *dev_maps;
- unsigned int nr_ids;
-
if (!static_key_false(&xps_needed))
return;
cpus_read_lock();
mutex_lock(&xps_map_mutex);
- if (static_key_false(&xps_rxqs_needed)) {
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
- if (dev_maps) {
- nr_ids = dev->num_rx_queues;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
- offset, count, true);
- }
- }
-
- dev_maps = xmap_dereference(dev->xps_cpus_map);
- if (!dev_maps)
- goto out_no_maps;
+ if (static_key_false(&xps_rxqs_needed))
+ clean_xps_maps(dev, XPS_RXQS, offset, count);
- if (num_possible_cpus() > 1)
- possible_mask = cpumask_bits(cpu_possible_mask);
- nr_ids = nr_cpu_ids;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
- false);
+ clean_xps_maps(dev, XPS_CPUS, offset, count);
-out_no_maps:
mutex_unlock(&xps_map_mutex);
cpus_read_unlock();
}
return new_map;
}
+/* Copy xps maps at a given index */
+static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
+ struct xps_dev_maps *new_dev_maps, int index,
+ int tc, bool skip_tc)
+{
+ int i, tci = index * dev_maps->num_tc;
+ struct xps_map *map;
+
+ /* copy maps belonging to foreign traffic classes */
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc && skip_tc)
+ continue;
+
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+}
+
/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
- u16 index, bool is_rxqs_map)
+ u16 index, enum xps_map_type type)
{
- const unsigned long *online_mask = NULL, *possible_mask = NULL;
- struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
+ const unsigned long *online_mask = NULL;
+ bool active = false, copy = false;
int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- bool active = false;
unsigned int nr_ids;
if (dev->num_tc) {
}
mutex_lock(&xps_map_mutex);
- if (is_rxqs_map) {
+
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (type == XPS_RXQS) {
maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
nr_ids = dev->num_rx_queues;
} else {
maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
- if (num_possible_cpus() > 1) {
+ if (num_possible_cpus() > 1)
online_mask = cpumask_bits(cpu_online_mask);
- possible_mask = cpumask_bits(cpu_possible_mask);
- }
- dev_maps = xmap_dereference(dev->xps_cpus_map);
nr_ids = nr_cpu_ids;
}
if (maps_sz < L1_CACHE_BYTES)
maps_sz = L1_CACHE_BYTES;
+ /* The old dev_maps could be larger or smaller than the one we're
+ * setting up now, as dev->num_tc or nr_ids could have been updated in
+ * between. We could try to be smart, but let's be safe instead and only
+ * copy foreign traffic classes if the two map sizes match.
+ */
+ if (dev_maps &&
+ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
+ copy = true;
+
/* allocate memory for queue storage */
for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
j < nr_ids;) {
- if (!new_dev_maps)
- new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
- mutex_unlock(&xps_map_mutex);
- return -ENOMEM;
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ new_dev_maps->nr_ids = nr_ids;
+ new_dev_maps->num_tc = num_tc;
}
tci = j * num_tc + tc;
- map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
- NULL;
+ map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
- map = expand_xps_map(map, j, index, is_rxqs_map);
+ map = expand_xps_map(map, j, index, type == XPS_RXQS);
if (!map)
goto error;
if (!dev_maps) {
/* Increment static keys at most once per type */
static_key_slow_inc_cpuslocked(&xps_needed);
- if (is_rxqs_map)
+ if (type == XPS_RXQS)
static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
}
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- /* copy maps belonging to foreign traffic classes */
- for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ for (j = 0; j < nr_ids; j++) {
+ bool skip_tc = false;
- /* We need to explicitly update tci as prevous loop
- * could break out early if dev_maps is NULL.
- */
tci = j * num_tc + tc;
-
if (netif_attr_test_mask(j, mask, nr_ids) &&
netif_attr_test_online(j, online_mask, nr_ids)) {
/* add tx-queue to CPU/rx-queue maps */
int pos = 0;
+ skip_tc = true;
+
map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS) {
if (numa_node_id == -2)
numa_node_id = cpu_to_node(j);
else if (numa_node_id != cpu_to_node(j))
numa_node_id = -1;
}
#endif
- } else if (dev_maps) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
- /* copy maps belonging to foreign traffic classes */
- for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ if (copy)
+ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
+ skip_tc);
}
- if (is_rxqs_map)
- rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
- else
- rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
+ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = num_tc, tci = j * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
map = xmap_dereference(dev_maps->attr_map[tci]);
- if (map && map != new_map)
- kfree_rcu(map, rcu);
+ if (!map)
+ continue;
+
+ if (copy) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ if (map == new_map)
+ continue;
+ }
+
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
+ kfree_rcu(map, rcu);
}
}
- kfree_rcu(dev_maps, rcu);
+ old_dev_maps = dev_maps;
out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
out_no_new_maps:
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS)
/* update Tx queue numa node */
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
(numa_node_id >= 0) ?
numa_node_id : NUMA_NO_NODE);
- }
if (!dev_maps)
goto out_no_maps;
/* removes tx-queue from unused CPUs/rx-queues */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = tc, tci = j * num_tc; i--; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
- if (!netif_attr_test_mask(j, mask, nr_ids) ||
- !netif_attr_test_online(j, online_mask, nr_ids))
- active |= remove_xps_queue(dev_maps, tci, index);
- for (i = num_tc - tc, tci++; --i; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ tci = j * dev_maps->num_tc;
+
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc &&
+ netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
+ netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
+ continue;
+
+ active |= remove_xps_queue(dev_maps,
+ copy ? old_dev_maps : NULL,
+ tci, index);
+ }
}
+ if (old_dev_maps)
+ kfree_rcu(old_dev_maps, rcu);
+
/* free map if not active */
if (!active)
- reset_xps_maps(dev, dev_maps, is_rxqs_map);
+ reset_xps_maps(dev, dev_maps, type);
out_no_maps:
mutex_unlock(&xps_map_mutex);
return 0;
error:
/* remove any maps that we added */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
+ for (j = 0; j < nr_ids; j++) {
for (i = num_tc, tci = j * num_tc; i--; tci++) {
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
- map = dev_maps ?
+ map = copy ?
xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
int ret;
cpus_read_lock();
- ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
cpus_read_unlock();
return ret;
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
struct xps_dev_maps *dev_maps, unsigned int tci)
{
+ int tc = netdev_get_prio_tc_map(dev, skb->priority);
struct xps_map *map;
int queue_index = -1;
- if (dev->num_tc) {
- tci *= dev->num_tc;
- tci += netdev_get_prio_tc_map(dev, skb->priority);
- }
+ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
+ return queue_index;
+
+ tci *= dev_maps->num_tc;
+ tci += tc;
map = rcu_dereference(dev_maps->attr_map[tci]);
if (map) {
if (!static_key_false(&xps_rxqs_needed))
goto get_cpus_map;
- dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
if (dev_maps) {
int tci = sk_rx_queue_get(sk);
- if (tci >= 0 && tci < dev->num_rx_queues)
+ if (tci >= 0)
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
tci);
}
get_cpus_map:
if (queue_index < 0) {
- dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
if (dev_maps) {
unsigned int tci = skb->sender_cpu - 1;
*/
thread = READ_ONCE(napi->thread);
if (thread) {
+ /* Avoid doing set_bit() if the thread is in
+ * INTERRUPTIBLE state, cause napi_thread_wait()
+ * makes sure to proceed with napi polling
+ * if the thread is explicitly woken from here.
+ */
+ if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE)
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
wake_up_process(thread);
return;
}
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
+ break;
case RX_HANDLER_PASS:
break;
default:
}
EXPORT_SYMBOL(napi_gro_flush);
-static struct list_head *gro_list_prepare(struct napi_struct *napi,
- struct sk_buff *skb)
+static void gro_list_prepare(const struct list_head *head,
+ const struct sk_buff *skb)
{
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
- struct list_head *head;
struct sk_buff *p;
- head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
list_for_each_entry(p, head, list) {
unsigned long diffs;
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
-
- return head;
}
static void skb_gro_reset_offset(struct sk_buff *skb)
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
- u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct gro_list *gro_list = &napi->gro_hash[bucket];
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *gro_head;
struct sk_buff *pp = NULL;
enum gro_result ret;
int same_flow;
if (netif_elide_gro(skb->dev))
goto normal;
- gro_head = gro_list_prepare(napi, skb);
+ gro_list_prepare(&gro_list->list, skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
ipv6_gro_receive, inet_gro_receive,
- gro_head, skb);
+ &gro_list->list, skb);
break;
}
rcu_read_unlock();
if (pp) {
skb_list_del_init(pp);
napi_gro_complete(napi, pp);
- napi->gro_hash[hash].count--;
+ gro_list->count--;
}
if (same_flow)
if (NAPI_GRO_CB(skb)->flush)
goto normal;
- if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
- gro_flush_oldest(napi, gro_head);
- } else {
- napi->gro_hash[hash].count++;
- }
+ if (unlikely(gro_list->count >= MAX_GRO_SKBS))
+ gro_flush_oldest(napi, &gro_list->list);
+ else
+ gro_list->count++;
+
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- list_add(&skb->list, gro_head);
+ list_add(&skb->list, &gro_list->list);
ret = GRO_HELD;
pull:
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
- if (napi->gro_hash[hash].count) {
- if (!test_bit(hash, &napi->gro_bitmask))
- __set_bit(hash, &napi->gro_bitmask);
- } else if (test_bit(hash, &napi->gro_bitmask)) {
- __clear_bit(hash, &napi->gro_bitmask);
+ if (gro_list->count) {
+ if (!test_bit(bucket, &napi->gro_bitmask))
+ __set_bit(bucket, &napi->gro_bitmask);
+ } else if (test_bit(bucket, &napi->gro_bitmask)) {
+ __clear_bit(bucket, &napi->gro_bitmask);
}
return ret;
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_SCHED_THREADED |
NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
return err;
}
+EXPORT_SYMBOL(dev_set_threaded);
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
static int napi_thread_wait(struct napi_struct *napi)
{
+ bool woken = false;
+
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop() && !napi_disable_pending(napi)) {
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ /* Testing SCHED_THREADED bit here to make sure the current
+ * kthread owns this napi and could poll on this napi.
+ * Testing SCHED bit is not enough because SCHED bit might be
+ * set by some other busy poll thread or by napi_disable().
+ */
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
+ /* woken being true indicates this thread owns this napi. */
+ woken = true;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
int netdev_refcnt_read(const struct net_device *dev)
{
+#ifdef CONFIG_PCPU_DEV_REFCNT
int i, refcnt = 0;
for_each_possible_cpu(i)
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
return refcnt;
+#else
+ return refcount_read(&dev->dev_refcnt);
+#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);
+int netdev_unregister_timeout_secs __read_mostly = 10;
+
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
rebroadcast_time = warning_time = jiffies;
refcnt = netdev_refcnt_read(dev);
- while (refcnt != 0) {
+ while (refcnt != 1) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
refcnt = netdev_refcnt_read(dev);
- if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
+ if (refcnt &&
+ time_after(jiffies, warning_time +
+ netdev_unregister_timeout_secs * HZ)) {
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
dev->name, refcnt);
warning_time = jiffies;
netdev_wait_allrefs(dev);
/* paranoia */
- BUG_ON(netdev_refcnt_read(dev));
+ BUG_ON(netdev_refcnt_read(dev) != 1);
BUG_ON(!list_empty(&dev->ptype_all));
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
+#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
+ dev_hold(dev);
+#else
+ refcount_set(&dev->dev_refcnt, 1);
+#endif
if (dev_addr_init(dev))
goto free_pcpu;
return NULL;
free_pcpu:
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
free_dev:
+#endif
netdev_freemem(dev);
return NULL;
}
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
+#endif
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
continue;
/* Leave virtual devices for the generic cleanup */
- if (dev->rtnl_link_ops)
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
continue;
/* Push remaining network devices to init_net */
return 0;
err_module_put:
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ del_timer_sync(&hw_data->send_timer);
+ cancel_work_sync(&hw_data->dm_alert_work);
+ while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+ struct devlink_trap_metadata *hw_metadata;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ net_dm_hw_metadata_free(hw_metadata);
+ consume_skb(skb);
+ }
+ }
module_put(THIS_MODULE);
return rc;
}
err_unregister_trace:
unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
err_module_put:
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ del_timer_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ while ((skb = __skb_dequeue(&data->drop_queue)))
+ consume_skb(skb);
+ }
module_put(THIS_MODULE);
return rc;
}
/*
* Because of the module_get/put we do in the trace state change path
- * we are guarnateed not to have any current users when we get here
+ * we are guaranteed not to have any current users when we get here
*/
for_each_possible_cpu(cpu) {
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
- int err = __bpf_try_make_writable(skb, write_len);
-
- bpf_compute_data_end_sk_skb(skb);
- return err;
+ return __bpf_try_make_writable(skb, write_len);
}
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
BPF_F_ADJ_ROOM_ENCAP_L2( \
BPF_ADJ_ROOM_ENCAP_L2_MASK))
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
return -EINVAL;
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+ inner_mac_len < ETH_HLEN)
+ return -EINVAL;
+
if (skb->encapsulation)
return -EALREADY;
skb->inner_mac_header = inner_net - inner_mac_len;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
- skb_set_inner_protocol(skb, skb->protocol);
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ else
+ skb_set_inner_protocol(skb, skb->protocol);
skb->encapsulation = 1;
skb_set_network_header(skb, mac_len);
return -ENOMEM;
__skb_pull(skb, len_diff_abs);
}
- bpf_compute_data_end_sk_skb(skb);
if (tls_sw_has_ctx_rx(skb->sk)) {
struct strp_msg *rxm = strp_msg(skb);
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
u64, flags)
{
- int ret = __bpf_skb_change_tail(skb, new_len, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_tail(skb, new_len, flags);
}
static const struct bpf_func_proto sk_skb_change_tail_proto = {
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
u64, flags)
{
- int ret = __bpf_skb_change_head(skb, head_room, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_head(skb, head_room, flags);
}
static const struct bpf_func_proto sk_skb_change_head_proto = {
.arg2_type = ARG_ANYTHING,
};
-static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
- struct bpf_map *map, struct xdp_buff *xdp)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- case BPF_MAP_TYPE_DEVMAP_HASH:
- return dev_map_enqueue(fwd, xdp, dev_rx);
- case BPF_MAP_TYPE_CPUMAP:
- return cpu_map_enqueue(fwd, xdp, dev_rx);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_redirect(fwd, xdp);
- default:
- return -EBADRQC;
- }
- return 0;
-}
-
void xdp_do_flush(void)
{
__dev_flush();
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
-static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- return __dev_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_DEVMAP_HASH:
- return __dev_map_hash_lookup_elem(map, index);
- case BPF_MAP_TYPE_CPUMAP:
- return __cpu_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_lookup_elem(map, index);
- default:
- return NULL;
- }
-}
-
-void bpf_clear_redirect_map(struct bpf_map *map)
-{
- struct bpf_redirect_info *ri;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- ri = per_cpu_ptr(&bpf_redirect_info, cpu);
- /* Avoid polluting remote cacheline due to writes if
- * not needed. Once we pass this test, we need the
- * cmpxchg() to make sure it hasn't been changed in
- * the meantime by remote CPU.
- */
- if (unlikely(READ_ONCE(ri->map) == map))
- cmpxchg(&ri->map, map, NULL);
- }
-}
-
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->tgt_index;
+ enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
int err;
- ri->tgt_index = 0;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- if (unlikely(!map)) {
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ err = dev_map_enqueue(fwd, xdp, dev);
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_enqueue(fwd, xdp, dev);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = __xsk_map_redirect(fwd, xdp);
+ break;
+ case BPF_MAP_TYPE_UNSPEC:
+ if (map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ break;
+ }
+ err = dev_xdp_enqueue(fwd, xdp, dev);
+ break;
}
-
- err = dev_xdp_enqueue(fwd, xdp, dev);
- } else {
- err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+ fallthrough;
+ default:
+ err = -EBADRQC;
}
if (unlikely(err))
goto err;
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
struct sk_buff *skb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog,
- struct bpf_map *map)
+ void *fwd,
+ enum bpf_map_type map_type, u32 map_id)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->tgt_index;
- void *fwd = ri->tgt_value;
- int err = 0;
-
- ri->tgt_index = 0;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
-
- if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
- map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- struct bpf_dtab_netdev *dst = fwd;
+ int err;
- err = dev_map_generic_redirect(dst, skb, xdp_prog);
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ err = dev_map_generic_redirect(fwd, skb, xdp_prog);
if (unlikely(err))
goto err;
- } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
- struct xdp_sock *xs = fwd;
-
- err = xsk_generic_rcv(xs, xdp);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = xsk_generic_rcv(fwd, xdp);
if (err)
goto err;
consume_skb(skb);
- } else {
+ break;
+ default:
/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
goto err;
}
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->tgt_index;
- struct net_device *fwd;
- int err = 0;
-
- if (map)
- return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
- map);
- ri->tgt_index = 0;
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ int err;
- err = xdp_ok_fwd_dev(fwd, skb->len);
- if (unlikely(err))
- goto err;
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- skb->dev = fwd;
- _trace_xdp_redirect(dev, xdp_prog, index);
- generic_xdp_tx(skb, xdp_prog);
- return 0;
+ if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = xdp_ok_fwd_dev(fwd, skb->len);
+ if (unlikely(err))
+ goto err;
+
+ skb->dev = fwd;
+ _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
+ generic_xdp_tx(skb, xdp_prog);
+ return 0;
+ }
+
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
return err;
}
if (unlikely(flags))
return XDP_ABORTED;
- ri->flags = flags;
+ /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
+ * by map_idr) is used for ifindex based XDP redirect.
+ */
ri->tgt_index = ifindex;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
return XDP_REDIRECT;
}
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
- /* Lower bits of the flags are used as return code on lookup failure */
- if (unlikely(flags > XDP_TX))
- return XDP_ABORTED;
-
- ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
- if (unlikely(!ri->tgt_value)) {
- /* If the lookup fails we want to clear out the state in the
- * redirect_info struct completely, so that if an eBPF program
- * performs multiple lookups, the last one always takes
- * precedence.
- */
- WRITE_ONCE(ri->map, NULL);
- return flags;
- }
-
- ri->flags = flags;
- ri->tgt_index = ifindex;
- WRITE_ONCE(ri->map, map);
-
- return XDP_REDIRECT;
+ return map->ops->map_redirect(map, ifindex, flags);
}
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
return -EINVAL;
- if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+ if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
return -EINVAL;
dev = __dev_via_ifindex(dev, ifindex);
mtu = READ_ONCE(dev->mtu);
dev_len = mtu + dev->hard_header_len;
- skb_len = skb->len + len_diff; /* minus result pass check */
+
+ /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+ skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
+
+ skb_len += len_diff; /* minus result pass check */
if (skb_len <= dev_len) {
ret = BPF_MTU_CHK_RET_SUCCESS;
goto out;
/* Add L2-header as dev MTU is L3 size */
dev_len = mtu + dev->hard_header_len;
+ /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+ if (*mtu_len)
+ xdp_len = *mtu_len + dev->hard_header_len;
+
xdp_len += len_diff; /* minus result pass check */
if (xdp_len > dev_len)
ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
return insn - insn_buf;
}
+/* data_end = skb->data + skb_headlen() */
+static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb->data */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ /* AX = skb->len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, len));
+ /* si->dst_reg = skb->data + skb->len */
+ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+ /* AX = skb->data_len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, data_len));
+ /* si->dst_reg = skb->data + skb->len - skb->data_len */
+ *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX);
+
+ return insn;
+}
+
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
- int off;
switch (si->off) {
case offsetof(struct __sk_buff, data_end):
- off = si->off;
- off -= offsetof(struct __sk_buff, data_end);
- off += offsetof(struct sk_buff, cb);
- off += offsetof(struct tcp_skb_cb, bpf.data_end);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
- si->src_reg, off);
+ insn = bpf_convert_data_end_access(si, insn);
break;
default:
return bpf_convert_ctx_access(type, si, insn_buf, prog,
}
const struct bpf_prog_ops sk_lookup_prog_ops = {
+ .test_run = bpf_prog_test_run_sk_lookup,
};
const struct bpf_verifier_ops sk_lookup_verifier_ops = {
* is the protocol port offset returned from proto_ports_offset
*/
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
- void *data, int hlen)
+ const void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
*/
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
struct flow_dissector_key_icmp *key_icmp,
- void *data, int thoff, int hlen)
+ const void *data, int thoff, int hlen)
{
struct icmphdr *ih, _ih;
* avoid confusion with packets without such field
*/
if (icmp_has_id(ih->type))
- key_icmp->id = ih->un.echo.id ? : 1;
+ key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
else
key_icmp->id = 0;
}
*/
static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_icmp *key_icmp;
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen,
- int lse_index, bool *entropy_label)
+ void *target_container, const void *data, int nhoff,
+ int hlen, int lse_index, bool *entropy_label)
{
struct mpls_label *hdr, _hdr;
u32 entry, label, bos;
static enum flow_dissect_ret
__skb_flow_dissect_arp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen)
+ void *target_container, const void *data,
+ int nhoff, int hlen)
{
struct flow_dissector_key_arp *key_arp;
struct {
__skb_flow_dissect_gre(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
struct flow_dissector *flow_dissector,
- void *target_container, void *data,
+ void *target_container, const void *data,
__be16 *p_proto, int *p_nhoff, int *p_hlen,
unsigned int flags)
{
static enum flow_dissect_ret
__skb_flow_dissect_batadv(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
- void *data, __be16 *p_proto, int *p_nhoff, int hlen,
- unsigned int flags)
+ const void *data, __be16 *p_proto, int *p_nhoff,
+ int hlen, unsigned int flags)
{
struct {
struct batadv_unicast_packet batadv_unicast;
static void
__skb_flow_dissect_tcp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_tcp *key_tcp;
struct tcphdr *th, _th;
static void
__skb_flow_dissect_ports(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff,
- u8 ip_proto, int hlen)
+ void *target_container, const void *data,
+ int nhoff, u8 ip_proto, int hlen)
{
enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
struct flow_dissector_key_ports *key_ports;
static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct iphdr *iph)
+ void *target_container, const void *data,
+ const struct iphdr *iph)
{
struct flow_dissector_key_ip *key_ip;
static void
__skb_flow_dissect_ipv6(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct ipv6hdr *iph)
+ void *target_container, const void *data,
+ const struct ipv6hdr *iph)
{
struct flow_dissector_key_ip *key_ip;
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, __be16 proto, int nhoff, int hlen,
- unsigned int flags)
+ void *target_container, const void *data,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
}
EXPORT_SYMBOL(skb_get_hash_perturb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
const struct flow_keys_basic *keys, int hlen)
{
u32 poff = keys->control.thoff;
* Alan Cox : Added BSD route gw semantics
* Alan Cox : Super /proc >4K
* Alan Cox : MTU in route table
- * Alan Cox : MSS actually. Also added the window
+ * Alan Cox : MSS actually. Also added the window
* clamper.
* Sam Lantinga : Fixed route matching in rt_del()
* Alan Cox : Routing cache support.
* Olaf Erb : irtt wasn't being copied right.
* Bjorn Ekwall : Kerneld route support.
* Alan Cox : Multicast fixed (I hope)
- * Pavel Krauz : Limited broadcast fixed
+ * Pavel Krauz : Limited broadcast fixed
* Mike McLagan : Routing by source
* Alexey Kuznetsov : End of old history. Split to fib.c and
* route.c and rewritten from scratch.
* Robert Olsson : Added rt_cache statistics
* Arnaldo C. Melo : Convert proc stuff to seq_file
* Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
- * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
- * Ilia Sotnikov : Removed TOS from hash calculations
+ * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
+ * Ilia Sotnikov : Removed TOS from hash calculations
*/
#define pr_fmt(fmt) "IPv4: " fmt
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/memblock.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
.show = rt_cache_seq_show,
};
-static int rt_cache_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rt_cache_seq_ops);
-}
-
-static const struct proc_ops rt_cache_proc_ops = {
- .proc_open = rt_cache_seq_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = seq_release,
-};
-
-
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
int cpu;
.show = rt_cpu_seq_show,
};
-
-static int rt_cpu_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rt_cpu_seq_ops);
-}
-
-static const struct proc_ops rt_cpu_proc_ops = {
- .proc_open = rt_cpu_seq_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = seq_release,
-};
-
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
{
struct proc_dir_entry *pde;
- pde = proc_create("rt_cache", 0444, net->proc_net,
- &rt_cache_proc_ops);
+ pde = proc_create_seq("rt_cache", 0444, net->proc_net,
+ &rt_cache_seq_ops);
if (!pde)
goto err1;
- pde = proc_create("rt_cache", 0444,
- net->proc_net_stat, &rt_cpu_proc_ops);
+ pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
+ &rt_cpu_seq_ops);
if (!pde)
goto err2;
__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
-#define IP_IDENTS_SZ 2048u
-
+/* Hash tables of size 2048..262144 depending on RAM size.
+ * Each bucket uses 8 bytes.
+ */
+static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
*/
u32 ip_idents_reserve(u32 hash, int segs)
{
- u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
- atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = READ_ONCE(*p_tstamp);
- u32 now = (u32)jiffies;
+ u32 bucket, old, now = (u32)jiffies;
+ atomic_t *p_id;
+ u32 *p_tstamp;
u32 delta = 0;
+ bucket = hash & ip_idents_mask;
+ p_tstamp = ip_tstamps + bucket;
+ p_id = ip_idents + bucket;
+ old = READ_ONCE(*p_tstamp);
+
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
for_each_possible_cpu(i) {
struct rtable __rcu **prt;
+
prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
rt = rcu_dereference(*prt);
if (rt)
}
/*
- We do not cache source address of outgoing interface,
- because it is used only by IP RR, TS and SRR options,
- so that it out of fast path.
-
- BTW remember: "addr" is allowed to be not aligned
- in IP options!
+ * We do not cache source address of outgoing interface,
+ * because it is used only by IP RR, TS and SRR options,
+ * so that it out of fast path.
+ *
+ * BTW remember: "addr" is allowed to be not aligned
+ * in IP options!
*/
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
goto out;
/* Check for the most weird martians, which can be not detected
- by fib_lookup.
+ * by fib_lookup.
*/
tun_info = skb_tunnel_info(skb);
if (res->type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
- rth->rt_flags &= ~RTCF_LOCAL;
+ rth->rt_flags &= ~RTCF_LOCAL;
}
if (do_cache) {
u8 tos, struct net_device *dev, struct fib_result *res)
{
/* Multicast recognition logic is moved from route cache to here.
- The problem was that too many Ethernet cards have broken/missing
- hardware multicast filters :-( As result the host on multicasting
- network acquires a lot of useless route cache entries, sort of
- SDR messages from all the world. Now we try to get rid of them.
- Really, provided software IP multicast filter is organized
- reasonably (at least, hashed), it does not result in a slowdown
- comparing with route cache reject entries.
- Note, that multicast routers are not affected, because
- route cache entry is created eventually.
+ * The problem was that too many Ethernet cards have broken/missing
+ * hardware multicast filters :-( As result the host on multicasting
+ * network acquires a lot of useless route cache entries, sort of
+ * SDR messages from all the world. Now we try to get rid of them.
+ * Really, provided software IP multicast filter is organized
+ * reasonably (at least, hashed), it does not result in a slowdown
+ * comparing with route cache reject entries.
+ * Note, that multicast routers are not affected, because
+ * route cache entry is created eventually.
*/
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
rth = ERR_PTR(-ENETUNREACH);
/* I removed check for oif == dev_out->oif here.
- It was wrong for two reasons:
- 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
- is assigned to multiple interfaces.
- 2. Moreover, we are allowed to send packets with saddr
- of another iface. --ANK
+ * It was wrong for two reasons:
+ * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+ * is assigned to multiple interfaces.
+ * 2. Moreover, we are allowed to send packets with saddr
+ * of another iface. --ANK
*/
if (fl4->flowi4_oif == 0 &&
goto out;
/* Special hack: user can direct multicasts
- and limited broadcast via necessary interface
- without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
- This hack is not just for fun, it allows
- vic,vat and friends to work.
- They bind socket to loopback, set ttl to zero
- and expect that it will work.
- From the viewpoint of routing cache they are broken,
- because we are not allowed to build multicast path
- with loopback source addr (look, routing cache
- cannot know, that ttl is zero, so that packet
- will not leave this host and route is valid).
- Luckily, this hack is good workaround.
+ * and limited broadcast via necessary interface
+ * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+ * This hack is not just for fun, it allows
+ * vic,vat and friends to work.
+ * They bind socket to loopback, set ttl to zero
+ * and expect that it will work.
+ * From the viewpoint of routing cache they are broken,
+ * because we are not allowed to build multicast path
+ * with loopback source addr (look, routing cache
+ * cannot know, that ttl is zero, so that packet
+ * will not leave this host and route is valid).
+ * Luckily, this hack is good workaround.
*/
fl4->flowi4_oif = dev_out->ifindex;
(ipv4_is_multicast(fl4->daddr) ||
!netif_index_is_l3_master(net, fl4->flowi4_oif))) {
/* Apparently, routing tables are wrong. Assume,
- that the destination is on link.
-
- WHY? DW.
- Because we are allowed to send to iface
- even if it has NO routes and NO assigned
- addresses. When oif is specified, routing
- tables are looked up with only one purpose:
- to catch if destination is gatewayed, rather than
- direct. Moreover, if MSG_DONTROUTE is set,
- we send packet, ignoring both routing tables
- and ifaddr state. --ANK
-
-
- We could make it even if oif is unknown,
- likely IPv6, but we do not.
+ * that the destination is on link.
+ *
+ * WHY? DW.
+ * Because we are allowed to send to iface
+ * even if it has NO routes and NO assigned
+ * addresses. When oif is specified, routing
+ * tables are looked up with only one purpose:
+ * to catch if destination is gatewayed, rather than
+ * direct. Moreover, if MSG_DONTROUTE is set,
+ * we send packet, ignoring both routing tables
+ * and ifaddr state. --ANK
+ *
+ *
+ * We could make it even if oif is unknown,
+ * likely IPv6, but we do not.
*/
if (fl4->saddr == 0)
return rth;
}
- static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
- {
- return NULL;
- }
-
- static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
- {
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
- return mtu ? : dst->dev->mtu;
- }
-
- static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu,
- bool confirm_neigh)
- {
- }
-
- static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb)
- {
- }
-
- static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
- unsigned long old)
- {
- return NULL;
- }
-
static struct dst_ops ipv4_dst_blackhole_ops = {
- .family = AF_INET,
- .check = ipv4_blackhole_dst_check,
- .mtu = ipv4_blackhole_mtu,
- .default_advmss = ipv4_default_advmss,
- .update_pmtu = ipv4_rt_blackhole_update_pmtu,
- .redirect = ipv4_rt_blackhole_redirect,
- .cow_metrics = ipv4_rt_blackhole_cow_metrics,
- .neigh_lookup = ipv4_neigh_lookup,
+ .family = AF_INET,
+ .default_advmss = ipv4_default_advmss,
+ .neigh_lookup = ipv4_neigh_lookup,
+ .check = dst_blackhole_check,
+ .cow_metrics = dst_blackhole_cow_metrics,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
int __init ip_rt_init(void)
{
+ void *idents_hash;
int cpu;
- ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
- GFP_KERNEL);
- if (!ip_idents)
- panic("IP: failed to allocate ip_idents\n");
+ /* For modern hosts, this will use 2 MB of memory */
+ idents_hash = alloc_large_system_hash("IP idents",
+ sizeof(*ip_idents) + sizeof(*ip_tstamps),
+ 0,
+ 16, /* one bucket per 64 KB */
+ HASH_ZERO,
+ NULL,
+ &ip_idents_mask,
+ 2048,
+ 256*1024);
+
+ ip_idents = idents_hash;
- prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+ prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
- ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
- if (!ip_tstamps)
- panic("IP: failed to allocate ip_tstamps\n");
+ ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
.confirm_neigh = ip6_confirm_neigh,
};
- static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
- {
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
- return mtu ? : dst->dev->mtu;
- }
-
- static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu,
- bool confirm_neigh)
- {
- }
-
- static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb)
- {
- }
-
static struct dst_ops ip6_dst_blackhole_ops = {
- .family = AF_INET6,
- .destroy = ip6_dst_destroy,
- .check = ip6_dst_check,
- .mtu = ip6_blackhole_mtu,
- .default_advmss = ip6_default_advmss,
- .update_pmtu = ip6_rt_blackhole_update_pmtu,
- .redirect = ip6_rt_blackhole_redirect,
- .cow_metrics = dst_cow_metrics_generic,
- .neigh_lookup = ip6_dst_neigh_lookup,
+ .family = AF_INET6,
+ .default_advmss = ip6_default_advmss,
+ .neigh_lookup = ip6_dst_neigh_lookup,
+ .check = ip6_dst_check,
+ .destroy = ip6_dst_destroy,
+ .cow_metrics = dst_cow_metrics_generic,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
memset(&hash_keys, 0, sizeof(hash_keys));
- if (!flkeys) {
+ if (!flkeys) {
skb_flow_dissect_flow_keys(skb, &keys, flag);
flkeys = &keys;
}
struct flowi6 *fl6,
int flags)
{
- struct dst_entry *dst;
- struct rt6_info *rt6;
+ struct dst_entry *dst;
+ struct rt6_info *rt6;
- rcu_read_lock();
- dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
- rt6 = (struct rt6_info *)dst;
- /* For dst cached in uncached_list, refcnt is already taken. */
- if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
- dst = &net->ipv6.ip6_null_entry->dst;
- dst_hold(dst);
- }
- rcu_read_unlock();
+ rcu_read_lock();
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+ rt6 = (struct rt6_info *)dst;
+ /* For dst cached in uncached_list, refcnt is already taken. */
+ if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+ dst = &net->ipv6.ip6_null_entry->dst;
+ dst_hold(dst);
+ }
+ rcu_read_unlock();
- return dst;
+ return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
int expected_opsize;
u8 version;
u8 flags;
+ u8 i;
switch (subtype) {
case MPTCPOPT_MP_CAPABLE:
break;
case MPTCPOPT_RM_ADDR:
- if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
+ if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 ||
+ opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX)
break;
ptr++;
mp_opt->rm_addr = 1;
- mp_opt->rm_id = *ptr++;
- pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
+ mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
+ for (i = 0; i < mp_opt->rm_list.nr; i++)
+ mp_opt->rm_list.ids[i] = *ptr++;
+ pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
break;
case MPTCPOPT_MP_PRIO:
}
static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
- struct in_addr *addr)
+ struct in_addr *addr, u16 port)
{
u8 hmac[SHA256_DIGEST_SIZE];
u8 msg[7];
msg[0] = addr_id;
memcpy(&msg[1], &addr->s_addr, 4);
- msg[5] = 0;
- msg[6] = 0;
+ msg[5] = port >> 8;
+ msg[6] = port & 0xFF;
mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
- struct in6_addr *addr)
+ struct in6_addr *addr, u16 port)
{
u8 hmac[SHA256_DIGEST_SIZE];
u8 msg[19];
msg[0] = addr_id;
memcpy(&msg[1], &addr->s6_addr, 16);
- msg[17] = 0;
- msg[18] = 0;
+ msg[17] = port >> 8;
+ msg[18] = port & 0xFF;
mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
opts->ahmac = add_addr_generate_hmac(msk->local_key,
msk->remote_key,
opts->addr_id,
- &opts->addr);
+ &opts->addr,
+ opts->port);
}
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
opts->ahmac = add_addr6_generate_hmac(msk->local_key,
msk->remote_key,
opts->addr_id,
- &opts->addr6);
+ &opts->addr6,
+ opts->port);
}
}
#endif
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- u8 rm_id;
+ struct mptcp_rm_list rm_list;
+ int i, len;
if (!mptcp_pm_should_rm_signal(msk) ||
- !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id)))
+ !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list)))
return false;
- if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
+ len = mptcp_rm_addr_len(&rm_list);
+ if (len < 0)
+ return false;
+ if (remaining < len)
return false;
- *size = TCPOLEN_MPTCP_RM_ADDR_BASE;
+ *size = len;
opts->suboptions |= OPTION_MPTCP_RM_ADDR;
- opts->rm_id = rm_id;
+ opts->rm_list = rm_list;
- pr_debug("rm_id=%d", opts->rm_id);
+ for (i = 0; i < opts->rm_list.nr; i++)
+ pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
return true;
}
if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
hmac = add_addr_generate_hmac(msk->remote_key,
msk->local_key,
- mp_opt->addr_id, &mp_opt->addr);
+ mp_opt->addr_id, &mp_opt->addr,
+ mp_opt->port);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else
hmac = add_addr6_generate_hmac(msk->remote_key,
msk->local_key,
- mp_opt->addr_id, &mp_opt->addr6);
+ mp_opt->addr_id, &mp_opt->addr6,
+ mp_opt->port);
#endif
pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
}
if (mp_opt.rm_addr) {
- mptcp_pm_rm_addr_received(msk, mp_opt.rm_id);
+ mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list);
mp_opt.rm_addr = 0;
}
}
if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
+ u8 i = 1;
+
*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
- TCPOLEN_MPTCP_RM_ADDR_BASE,
- 0, opts->rm_id);
+ TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr,
+ 0, opts->rm_list.ids[0]);
+
+ while (i < opts->rm_list.nr) {
+ u8 id1, id2, id3, id4;
+
+ id1 = opts->rm_list.ids[i];
+ id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP;
+ id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP;
+ id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP;
+ put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr);
+ ptr += 1;
+ i += 4;
+ }
}
if (OPTION_MPTCP_PRIO & opts->suboptions) {
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
- struct dst_entry *other_dst = route->tuple[!dir].dst;
struct dst_entry *dst = route->tuple[dir].dst;
-
- if (!dst_hold_safe(route->tuple[dir].dst))
- return -1;
+ int i, j = 0;
switch (flow_tuple->l3proto) {
case NFPROTO_IPV4:
break;
}
- flow_tuple->iifidx = other_dst->dev->ifindex;
- flow_tuple->dst_cache = dst;
+ flow_tuple->iifidx = route->tuple[dir].in.ifindex;
+ for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
+ flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
+ flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
+ if (route->tuple[dir].in.ingress_vlans & BIT(i))
+ flow_tuple->in_vlan_ingress |= BIT(j);
+ j++;
+ }
+ flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+
+ switch (route->tuple[dir].xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
+ ETH_ALEN);
+ memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
+ ETH_ALEN);
+ flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
+ flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
+ break;
+ case FLOW_OFFLOAD_XMIT_XFRM:
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ if (!dst_hold_safe(route->tuple[dir].dst))
+ return -1;
+
+ flow_tuple->dst_cache = dst;
+ break;
+ }
+ flow_tuple->xmit_type = route->tuple[dir].xmit_type;
return 0;
}
+static void nft_flow_dst_release(struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir)
+{
+ if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
+ dst_release(flow->tuplehash[dir].tuple.dst_cache);
+}
+
int flow_offload_route_init(struct flow_offload *flow,
const struct nf_flow_route *route)
{
return 0;
err_route_reply:
- dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
return err;
}
static void flow_offload_route_release(struct flow_offload *flow)
{
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
}
void flow_offload_free(struct flow_offload *flow)
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-
-static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct tcphdr *tcph;
- if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
-
- return 0;
}
-static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct udphdr *udph;
- if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace2(&udph->check, skb, port,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
- u8 protocol, __be16 port, __be16 new_port)
+static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, __be16 port, __be16 new_port)
{
switch (protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_tcp(skb, thoff, port, new_port);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_udp(skb, thoff, port, new_port);
break;
}
-
- return 0;
}
-int nf_flow_snat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
hdr->dest = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);
-int nf_flow_dnat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, u8 protocol,
+ enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
hdr->source = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
{
int err;
- INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+ INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
flow_block_init(&flowtable->flow_block);
init_rwsem(&flowtable->flow_block_lock);
nft_table_disable(net, table, 0);
}
+enum {
+ NFT_TABLE_STATE_UNCHANGED = 0,
+ NFT_TABLE_STATE_DORMANT,
+ NFT_TABLE_STATE_WAKEUP
+};
+
static int nf_tables_updtable(struct nft_ctx *ctx)
{
struct nft_trans *trans;
if ((flags & NFT_TABLE_F_DORMANT) &&
!(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
- nft_trans_table_enable(trans) = false;
+ nft_trans_table_state(trans) = NFT_TABLE_STATE_DORMANT;
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
- ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
ret = nf_tables_table_enable(ctx->net, ctx->table);
if (ret >= 0)
- nft_trans_table_enable(trans) = true;
- else
- ctx->table->flags |= NFT_TABLE_F_DORMANT;
+ nft_trans_table_state(trans) = NFT_TABLE_STATE_WAKEUP;
}
if (ret < 0)
goto err;
+ nft_trans_table_flags(trans) = flags;
nft_trans_table_update(trans) = true;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
list_for_each_entry(hook, hook_list, list) {
list_for_each_entry(ft, &table->flowtables, list) {
+ if (!nft_is_active_next(net, ft))
+ continue;
+
list_for_each_entry(hook2, &ft->hook_list, list) {
if (hook->ops.dev == hook2->ops.dev &&
hook->ops.pf == hook2->ops.pf) {
struct nft_hook *hook, *next;
struct nft_trans *trans;
bool unregister = false;
+ u32 flags;
int err;
err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
}
}
+ if (nla[NFTA_FLOWTABLE_FLAGS]) {
+ flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+ if (flags & ~NFT_FLOWTABLE_MASK)
+ return -EOPNOTSUPP;
+ if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
+ (flags & NFT_FLOWTABLE_HW_OFFLOAD))
+ return -EOPNOTSUPP;
+ } else {
+ flags = flowtable->data.flags;
+ }
+
err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
&flowtable_hook.list, flowtable);
if (err < 0)
goto err_flowtable_update_hook;
}
+ nft_trans_flowtable_flags(trans) = flags;
nft_trans_flowtable(trans) = flowtable;
nft_trans_flowtable_update(trans) = true;
INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
if (nla[NFTA_FLOWTABLE_FLAGS]) {
flowtable->data.flags =
ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
- if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK)
+ if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
+ err = -EOPNOTSUPP;
goto err3;
+ }
}
write_pnet(&flowtable->data.net, net);
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (!nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
- }
+ if (nft_trans_table_state(trans) == NFT_TABLE_STATE_DORMANT)
+ nf_tables_table_disable(net, trans->ctx.table);
+
+ trans->ctx.table->flags = nft_trans_table_flags(trans);
} else {
nft_clear(net, trans->ctx.table);
}
break;
case NFT_MSG_NEWFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
+ nft_trans_flowtable(trans)->data.flags =
+ nft_trans_flowtable_flags(trans);
nf_tables_flowtable_notify(&trans->ctx,
nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans),
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
- }
+ if (nft_trans_table_state(trans) == NFT_TABLE_STATE_WAKEUP)
+ nf_tables_table_disable(net, trans->ctx.table);
+
nft_trans_destroy(trans);
} else {
list_del_rcu(&trans->ctx.table->list);
data->verdict.chain);
if (err < 0)
return err;
+ break;
default:
break;
}
return TC_ACT_SHOT;
ext->chain = last_executed_chain;
ext->mru = qdisc_skb_cb(skb)->mru;
+ ext->post_ct = qdisc_skb_cb(skb)->post_ct;
}
return ret;
entry->police.burst = tcf_police_burst(act);
entry->police.rate_bytes_ps =
tcf_police_rate_bytes_ps(act);
+ entry->police.burst_pkt = tcf_police_burst_pkt(act);
+ entry->police.rate_pkt_ps =
+ tcf_police_rate_pkt_ps(act);
entry->police.mtu = tcf_police_tcfp_mtu(act);
entry->police.index = act->tcfa_index;
} else if (is_tcf_ct(act)) {
struct fl_flow_key *key,
struct fl_flow_key *mkey)
{
- __be16 min_mask, max_mask, min_val, max_val;
+ u16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_range.tp_min.dst);
- max_mask = htons(filter->mask->key.tp_range.tp_max.dst);
- min_val = htons(filter->key.tp_range.tp_min.dst);
- max_val = htons(filter->key.tp_range.tp_max.dst);
+ min_mask = ntohs(filter->mask->key.tp_range.tp_min.dst);
+ max_mask = ntohs(filter->mask->key.tp_range.tp_max.dst);
+ min_val = ntohs(filter->key.tp_range.tp_min.dst);
+ max_val = ntohs(filter->key.tp_range.tp_max.dst);
if (min_mask && max_mask) {
- if (htons(key->tp_range.tp.dst) < min_val ||
- htons(key->tp_range.tp.dst) > max_val)
+ if (ntohs(key->tp_range.tp.dst) < min_val ||
+ ntohs(key->tp_range.tp.dst) > max_val)
return false;
/* skb does not have min and max values */
struct fl_flow_key *key,
struct fl_flow_key *mkey)
{
- __be16 min_mask, max_mask, min_val, max_val;
+ u16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_range.tp_min.src);
- max_mask = htons(filter->mask->key.tp_range.tp_max.src);
- min_val = htons(filter->key.tp_range.tp_min.src);
- max_val = htons(filter->key.tp_range.tp_max.src);
+ min_mask = ntohs(filter->mask->key.tp_range.tp_min.src);
+ max_mask = ntohs(filter->mask->key.tp_range.tp_max.src);
+ min_val = ntohs(filter->key.tp_range.tp_min.src);
+ max_val = ntohs(filter->key.tp_range.tp_max.src);
if (min_mask && max_mask) {
- if (htons(key->tp_range.tp.src) < min_val ||
- htons(key->tp_range.tp.src) > max_val)
+ if (ntohs(key->tp_range.tp.src) < min_val ||
+ ntohs(key->tp_range.tp.src) > max_val)
return false;
/* skb does not have min and max values */
TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
- htons(key->tp_range.tp_max.dst) <=
- htons(key->tp_range.tp_min.dst)) {
+ ntohs(key->tp_range.tp_max.dst) <=
+ ntohs(key->tp_range.tp_min.dst)) {
NL_SET_ERR_MSG_ATTR(extack,
tb[TCA_FLOWER_KEY_PORT_DST_MIN],
"Invalid destination port range (min must be strictly smaller than max)");
return -EINVAL;
}
if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
- htons(key->tp_range.tp_max.src) <=
- htons(key->tp_range.tp_min.src)) {
+ ntohs(key->tp_range.tp_max.src) <=
+ ntohs(key->tp_range.tp_min.src)) {
NL_SET_ERR_MSG_ATTR(extack,
tb[TCA_FLOWER_KEY_PORT_SRC_MIN],
"Invalid source port range (min must be strictly smaller than max)");
return -EINVAL;
}
- key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
- mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
+ key = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS]));
+ mask = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
*flags_key = 0;
*flags_mask = 0;
&mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
sizeof(key->ct_state));
- err = fl_validate_ct_state(mask->ct_state,
+ err = fl_validate_ct_state(key->ct_state & mask->ct_state,
tb[TCA_FLOWER_KEY_CT_STATE_MASK],
extack);
if (err)
}
static void tipc_node_read_lock(struct tipc_node *n)
+ __acquires(n->lock)
{
read_lock_bh(&n->lock);
}
static void tipc_node_read_unlock(struct tipc_node *n)
+ __releases(n->lock)
{
read_unlock_bh(&n->lock);
}
static void tipc_node_write_lock(struct tipc_node *n)
+ __acquires(n->lock)
{
write_lock_bh(&n->lock);
}
static void tipc_node_write_unlock_fast(struct tipc_node *n)
+ __releases(n->lock)
{
write_unlock_bh(&n->lock);
}
static void tipc_node_write_unlock(struct tipc_node *n)
+ __releases(n->lock)
{
+ struct tipc_socket_addr sk;
struct net *net = n->net;
- u32 addr = 0;
u32 flags = n->action_flags;
- u32 link_id = 0;
- u32 bearer_id;
struct list_head *publ_list;
+ struct tipc_uaddr ua;
+ u32 bearer_id;
if (likely(!flags)) {
write_unlock_bh(&n->lock);
return;
}
- addr = n->addr;
- link_id = n->link_id;
- bearer_id = link_id & 0xffff;
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
+ TIPC_LINK_STATE, n->addr, n->addr);
+ sk.ref = n->link_id;
+ sk.node = n->addr;
+ bearer_id = n->link_id & 0xffff;
publ_list = &n->publ_list;
n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
write_unlock_bh(&n->lock);
if (flags & TIPC_NOTIFY_NODE_DOWN)
- tipc_publ_notify(net, publ_list, addr, n->capabilities);
+ tipc_publ_notify(net, publ_list, n->addr, n->capabilities);
if (flags & TIPC_NOTIFY_NODE_UP)
- tipc_named_node_up(net, addr, n->capabilities);
+ tipc_named_node_up(net, n->addr, n->capabilities);
if (flags & TIPC_NOTIFY_LINK_UP) {
- tipc_mon_peer_up(net, addr, bearer_id);
- tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
- TIPC_NODE_SCOPE, link_id, link_id);
+ tipc_mon_peer_up(net, n->addr, bearer_id);
+ tipc_nametbl_publish(net, &ua, &sk, n->link_id);
}
if (flags & TIPC_NOTIFY_LINK_DOWN) {
- tipc_mon_peer_down(net, addr, bearer_id);
- tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
- addr, link_id);
+ tipc_mon_peer_down(net, n->addr, bearer_id);
+ tipc_nametbl_withdraw(net, &ua, &sk, n->link_id);
}
}
#ifdef CONFIG_TIPC_CRYPTO
static int tipc_nl_retrieve_key(struct nlattr **attrs,
- struct tipc_aead_key **key)
+ struct tipc_aead_key **pkey)
{
struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY];
+ struct tipc_aead_key *key;
if (!attr)
return -ENODATA;
- *key = (struct tipc_aead_key *)nla_data(attr);
- if (nla_len(attr) < tipc_aead_key_size(*key))
+ if (nla_len(attr) < sizeof(*key))
+ return -EINVAL;
+ key = (struct tipc_aead_key *)nla_data(attr);
+ if (key->keylen > TIPC_AEAD_KEYLEN_MAX ||
+ nla_len(attr) < tipc_aead_key_size(key))
return -EINVAL;
+ *pkey = key;
return 0;
}
$(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR)
$(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h
- $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \
+ $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \
--file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS)
$(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
if [ ! -d '$(DESTDIR_SQ)$2' ]; then \
$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \
fi; \
- $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2'
+ $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2'
endef
install_lib: all_cmd
case BTF_KIND_INT:
case BTF_KIND_ENUM:
case BTF_KIND_FWD:
+ case BTF_KIND_FLOAT:
break;
case BTF_KIND_VOLATILE:
switch (btf_kind(t)) {
case BTF_KIND_INT:
+ case BTF_KIND_FLOAT:
tstate->order_state = ORDERED;
return 0;
return err;
case BTF_KIND_ARRAY:
- return btf_dump_order_type(d, btf_array(t)->type, through_ptr);
+ return btf_dump_order_type(d, btf_array(t)->type, false);
case BTF_KIND_STRUCT:
case BTF_KIND_UNION: {
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_TYPEDEF:
+ case BTF_KIND_FLOAT:
goto done;
default:
pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n",
switch (kind) {
case BTF_KIND_INT:
+ case BTF_KIND_FLOAT:
btf_dump_emit_mods(d, decls);
name = btf_name_of(d, t->name_off);
btf_dump_printf(d, "%s", name);
FEAT_PROG_BIND_MAP,
/* Kernel support for module BTFs */
FEAT_MODULE_BTF,
+ /* BTF_KIND_FLOAT support */
+ FEAT_BTF_FLOAT,
__FEAT_CNT,
};
RELO_CALL,
RELO_DATA,
RELO_EXTERN,
+ RELO_SUBPROG_ADDR,
};
struct reloc_desc {
insn->off == 0;
}
+static bool is_ldimm64(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
+}
+
+static bool insn_is_pseudo_func(struct bpf_insn *insn)
+{
+ return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
static int
bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog,
const char *name, size_t sec_idx, const char *sec_name,
if (!elf_rawdata(elf_getscn(obj->efile.elf, obj->efile.shstrndx), NULL)) {
pr_warn("elf: failed to get section names strings from %s: %s\n",
obj->path, elf_errmsg(-1));
- return -LIBBPF_ERRNO__FORMAT;
+ err = -LIBBPF_ERRNO__FORMAT;
+ goto errout;
}
/* Old LLVM set e_machine to EM_NONE */
case BTF_KIND_FUNC_PROTO: return "func_proto";
case BTF_KIND_VAR: return "var";
case BTF_KIND_DATASEC: return "datasec";
+ case BTF_KIND_FLOAT: return "float";
default: return "unknown";
}
}
{
bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC);
bool has_datasec = kernel_supports(FEAT_BTF_DATASEC);
+ bool has_float = kernel_supports(FEAT_BTF_FLOAT);
bool has_func = kernel_supports(FEAT_BTF_FUNC);
- return !has_func || !has_datasec || !has_func_global;
+ return !has_func || !has_datasec || !has_func_global || !has_float;
}
static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
{
bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC);
bool has_datasec = kernel_supports(FEAT_BTF_DATASEC);
+ bool has_float = kernel_supports(FEAT_BTF_FLOAT);
bool has_func = kernel_supports(FEAT_BTF_FUNC);
struct btf_type *t;
int i, j, vlen;
} else if (!has_func_global && btf_is_func(t)) {
/* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */
t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0);
+ } else if (!has_float && btf_is_float(t)) {
+ /* replace FLOAT with an equally-sized empty STRUCT;
+ * since C compilers do not accept e.g. "float" as a
+ * valid struct name, make it anonymous
+ */
+ t->name_off = 0;
+ t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0);
}
}
}
GELF_ST_TYPE(sym->st_info) == STT_NOTYPE;
}
+static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx)
+{
+ int bind = GELF_ST_BIND(sym->st_info);
+ int type = GELF_ST_TYPE(sym->st_info);
+
+ /* in .text section */
+ if (sym->st_shndx != text_shndx)
+ return false;
+
+ /* local function */
+ if (bind == STB_LOCAL && type == STT_SECTION)
+ return true;
+
+ /* global function */
+ return bind == STB_GLOBAL && type == STT_FUNC;
+}
+
static int find_extern_btf_id(const struct btf *btf, const char *ext_name)
{
const struct btf_type *t;
return 0;
}
- if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) {
+ if (!is_ldimm64(insn)) {
pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n",
prog->name, sym_name, insn_idx, insn->code);
return -LIBBPF_ERRNO__RELOC;
return -LIBBPF_ERRNO__RELOC;
}
+ /* loading subprog addresses */
+ if (sym_is_subprog(sym, obj->efile.text_shndx)) {
+ /* global_func: sym->st_value = offset in the section, insn->imm = 0.
+ * local_func: sym->st_value = 0, insn->imm = offset in the section.
+ */
+ if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) {
+ pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n",
+ prog->name, sym_name, (size_t)sym->st_value, insn->imm);
+ return -LIBBPF_ERRNO__RELOC;
+ }
+
+ reloc_desc->type = RELO_SUBPROG_ADDR;
+ reloc_desc->insn_idx = insn_idx;
+ reloc_desc->sym_off = sym->st_value;
+ return 0;
+ }
+
type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx);
sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
strs, sizeof(strs)));
}
+static int probe_kern_btf_float(void)
+{
+ static const char strs[] = "\0float";
+ __u32 types[] = {
+ /* float */
+ BTF_TYPE_FLOAT_ENC(1, 4),
+ };
+
+ return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+ strs, sizeof(strs)));
+}
+
static int probe_kern_array_mmap(void)
{
struct bpf_create_map_attr attr = {
[FEAT_MODULE_BTF] = {
"module BTF support", probe_module_btf,
},
+ [FEAT_BTF_FLOAT] = {
+ "BTF_KIND_FLOAT support", probe_kern_btf_float,
+ },
};
static bool kernel_supports(enum kern_feature_id feat_id)
insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */
}
-static bool is_ldimm64(struct bpf_insn *insn)
-{
- return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
-}
-
static int insn_bpf_size_to_bytes(struct bpf_insn *insn)
{
switch (BPF_SIZE(insn->code)) {
}
relo->processed = true;
break;
+ case RELO_SUBPROG_ADDR:
+ insn[0].src_reg = BPF_PSEUDO_FUNC;
+ /* will be handled as a follow up pass */
+ break;
case RELO_CALL:
/* will be handled as a follow up pass */
break;
for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) {
insn = &main_prog->insns[prog->sub_insn_off + insn_idx];
- if (!insn_is_subprog_call(insn))
+ if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn))
continue;
relo = find_prog_insn_relo(prog, insn_idx);
- if (relo && relo->type != RELO_CALL) {
+ if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) {
pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n",
prog->name, insn_idx, relo->type);
return -LIBBPF_ERRNO__RELOC;
* call always has imm = -1, but for static functions
* relocation is against STT_SECTION and insn->imm
* points to a start of a static function
+ *
+ * for subprog addr relocation, the relo->sym_off + insn->imm is
+ * the byte offset in the corresponding section.
*/
- sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
+ if (relo->type == RELO_CALL)
+ sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
+ else
+ sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ;
+ } else if (insn_is_pseudo_func(insn)) {
+ /*
+ * RELO_SUBPROG_ADDR relo is always emitted even if both
+ * functions are in the same section, so it shouldn't reach here.
+ */
+ pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n",
+ prog->name, insn_idx);
+ return -LIBBPF_ERRNO__RELOC;
} else {
/* if subprogram call is to a static function within
* the same ELF section, there won't be any relocation
};
};
+ struct struct_in_array {};
+
+ struct struct_in_array_typed {};
+
+ typedef struct struct_in_array_typed struct_in_array_t[2];
+
struct struct_with_embedded_stuff {
int a;
struct {
} r[5];
struct struct_in_struct s[10];
int t[11];
+ struct struct_in_array (*u)[2];
+ struct_in_array_t *v;
};
+struct float_struct {
+ float f;
+ const double *d;
+ volatile long double *ld;
+};
+
struct root_struct {
enum e1 _1;
enum e2 _2;
union_fwd_t *_12;
union_fwd_ptr_t _13;
struct struct_with_embedded_stuff _14;
+ struct float_struct _15;
};
/* ------ END-EXPECTED-OUTPUT ------ */
timeout=30
mptcp_connect=""
capture=0
+ do_all_tests=1
TEST_COUNT=0
-j DROP
}
- for arg in "$@"; do
- if [ "$arg" = "-c" ]; then
- capture=1
- fi
- done
-
ip -Version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without ip tool"
let rm_nr_ns1=-addr_nr_ns1
if [ $rm_nr_ns1 -lt 8 ]; then
counter=1
- sleep 1
-
- while [ $counter -le $rm_nr_ns1 ]
- do
- ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+ dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`)
+ if [ ${#dump[@]} -gt 0 ]; then
+ id=${dump[1]}
sleep 1
- let counter+=1
- done
+
+ while [ $counter -le $rm_nr_ns1 ]
+ do
+ ip netns exec ${listener_ns} ./pm_nl_ctl del $id
+ sleep 1
+ let counter+=1
+ let id+=1
+ done
+ fi
else
sleep 1
ip netns exec ${listener_ns} ./pm_nl_ctl flush
let rm_nr_ns2=-addr_nr_ns2
if [ $rm_nr_ns2 -lt 8 ]; then
counter=1
- sleep 1
-
- while [ $counter -le $rm_nr_ns2 ]
- do
- ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+ dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`)
+ if [ ${#dump[@]} -gt 0 ]; then
+ id=${dump[1]}
sleep 1
- let counter+=1
- done
+
+ while [ $counter -le $rm_nr_ns2 ]
+ do
+ ip netns exec ${connector_ns} ./pm_nl_ctl del $id
+ sleep 1
+ let counter+=1
+ let id+=1
+ done
+ fi
else
sleep 1
ip netns exec ${connector_ns} ./pm_nl_ctl flush
{
local rm_addr_nr=$1
local rm_subflow_nr=$2
+ local invert=${3:-""}
local count
local dump_stats
+ local addr_ns
+ local subflow_ns
+
+ if [ -z $invert ]; then
+ addr_ns=$ns1
+ subflow_ns=$ns2
+ elif [ $invert = "invert" ]; then
+ addr_ns=$ns2
+ subflow_ns=$ns1
+ fi
printf "%-39s %s" " " "rm "
- count=`ip netns exec $ns1 nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'`
+ count=`ip netns exec $addr_ns nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'`
[ -z "$count" ] && count=0
if [ "$count" != "$rm_addr_nr" ]; then
echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr"
fi
echo -n " - sf "
- count=`ip netns exec $ns2 nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'`
+ count=`ip netns exec $subflow_ns nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'`
[ -z "$count" ] && count=0
if [ "$count" != "$rm_subflow_nr" ]; then
echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr"
run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow
chk_join_nr "remove single address" 1 1 1
chk_add_nr 1 1
- chk_rm_nr 0 0
+ chk_rm_nr 1 1 invert
# subflow and signal, remove
reset
chk_join_nr "flush subflows and signal" 3 3 3
chk_add_nr 1 1
chk_rm_nr 2 2
+
+ # subflows flush
+ reset
+ ip netns exec $ns1 ./pm_nl_ctl limits 3 3
+ ip netns exec $ns2 ./pm_nl_ctl limits 3 3
+ ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow id 150
+ ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+ run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
+ chk_join_nr "flush subflows" 3 3 3
+ chk_rm_nr 3 3
+
+ # addresses flush
+ reset
+ ip netns exec $ns1 ./pm_nl_ctl limits 3 3
+ ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250
+ ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
+ ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal
+ ip netns exec $ns2 ./pm_nl_ctl limits 3 3
+ run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
+ chk_join_nr "flush addresses" 3 3 3
+ chk_add_nr 3 3
+ chk_rm_nr 3 3 invert
}
add_tests()
run_tests $ns1 $ns2 dead:beef:1::1 0 -1 0 slow
chk_join_nr "remove single address IPv6" 1 1 1
chk_add_nr 1 1
- chk_rm_nr 0 0
+ chk_rm_nr 1 1 invert
# subflow and signal IPv6, remove
reset
run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow
chk_join_nr "remove single address with port" 1 1 1
chk_add_nr 1 1 1
- chk_rm_nr 0 0
+ chk_rm_nr 1 1 invert
# subflow and signal with port, remove
reset
echo " -4 v4mapped_tests"
echo " -b backup_tests"
echo " -p add_addr_ports_tests"
- echo " -c syncookies_tests"
+ echo " -k syncookies_tests"
+ echo " -c capture pcap files"
echo " -h help"
}
make_file "$sin" "server" 1
trap cleanup EXIT
- if [ -z $1 ]; then
+ for arg in "$@"; do
+ # check for "capture" arg before launching tests
+ if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then
+ capture=1
+ fi
+
+ # exception for the capture option, the rest means: a part of the tests
+ if [ "${arg}" != "-c" ]; then
+ do_all_tests=0
+ fi
+ done
+
+ if [ $do_all_tests -eq 1 ]; then
all_tests
exit $ret
fi
- while getopts 'fsltra64bpch' opt; do
+ while getopts 'fsltra64bpkch' opt; do
case $opt in
f)
subflows_tests
p)
add_addr_ports_tests
;;
- c)
+ k)
syncookies_tests
;;
+ c)
+ ;;
h | *)
usage
;;