Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
author     David S. Miller <davem@davemloft.net>   Tue, 16 Feb 2021 21:14:06 +0000 (13:14 -0800)
committer  David S. Miller <davem@davemloft.net>   Tue, 16 Feb 2021 21:14:06 +0000 (13:14 -0800)
Daniel Borkmann says:

====================
pull-request: bpf-next 2021-02-16

The following pull-request contains BPF updates for your *net-next* tree.

There's a small merge conflict between 7eeba1706eba ("tcp: Add receive timestamp
support for receive zerocopy.") from the net-next tree and 9cacf81f8161 ("bpf: Remove
extra lock_sock for TCP_ZEROCOPY_RECEIVE") from the bpf-next tree. Resolve as follows:

  [...]
                lock_sock(sk);
                err = tcp_zerocopy_receive(sk, &zc, &tss);
                err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
                                                          &zc, &len, err);
                release_sock(sk);
  [...]

We've added 116 non-merge commits during the last 27 day(s) which contain
a total of 156 files changed, 5662 insertions(+), 1489 deletions(-).

The main changes are:

1) Add support for pointers to types with known size among global function
   args to overcome the limit on the max number of allowed args, from Dmitrii Banshchikov.

2) Add bpf_iter for task_vma, which can be used to generate information similar
   to /proc/pid/maps, from Song Liu.
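
   For illustration only, a minimal task_vma iterator could look roughly like
   the sketch below. The SEC() name and the bpf_iter__task_vma context layout
   come from this series (a vmlinux.h generated from such a kernel is assumed);
   the program name and output format are hypothetical:

      #include "vmlinux.h"
      #include <bpf/bpf_helpers.h>
      #include <bpf/bpf_tracing.h>   /* BPF_SEQ_PRINTF() */

      char LICENSE[] SEC("license") = "GPL";

      SEC("iter/task_vma")
      int dump_vmas(struct bpf_iter__task_vma *ctx)
      {
              struct vm_area_struct *vma = ctx->vma;
              struct task_struct *task = ctx->task;
              struct seq_file *seq = ctx->meta->seq;

              if (!vma || !task)
                      return 0;

              /* one line per VMA, loosely following /proc/pid/maps */
              BPF_SEQ_PRINTF(seq, "%d %08llx-%08llx\n", task->tgid,
                             vma->vm_start, vma->vm_end);
              return 0;
      }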

3) Enable bpf_{g,s}etsockopt() from all sock_addr related program hooks. Allow
   rewriting bind user ports from the BPF side to values below the
   ip_unprivileged_port_start range, both from Stanislav Fomichev.
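
   As a rough sketch of what this enables (the program below is illustrative;
   it assumes the return-flag semantics added in this series, where setting
   bit 1 of the return value asks the kernel to skip the privileged-port
   capability check, and the #defines mirror the asm-generic uapi values):

      #include <linux/bpf.h>
      #include <bpf/bpf_helpers.h>
      #include <bpf/bpf_endian.h>

      #ifndef SOL_SOCKET
      #define SOL_SOCKET 1    /* asm-generic uapi value */
      #endif
      #ifndef SO_MARK
      #define SO_MARK    36   /* asm-generic uapi value */
      #endif

      char LICENSE[] SEC("license") = "GPL";

      SEC("cgroup/bind4")
      int bind4_prog(struct bpf_sock_addr *ctx)
      {
              int mark = 42;  /* illustrative mark value */

              /* bpf_{g,s}etsockopt() is now callable from sock_addr hooks */
              bpf_setsockopt(ctx, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));

              /* rewrite the bind port to one below ip_unprivileged_port_start */
              ctx->user_port = bpf_htons(111);

              /* bit 0: allow the bind, bit 1: skip the privileged-port check */
              return 3;
      }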

4) Prevent recursion on fentry/fexit & sleepable programs and allow map-in-map
   as well as per-cpu maps for the latter, from Alexei Starovoitov.

5) Add a selftest script to run BPF CI locally. Also enable the BPF ring buffer
   for sleepable programs, both from KP Singh.

6) Extend verifier to enable variable offset read/write access to the BPF
   program stack, from Andrei Matei.
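
   A toy example of a pattern the verifier can now accept for privileged
   programs (previously rejected as a variable stack access); the program
   type, section name and helper choice below are arbitrary:

      #include <linux/bpf.h>
      #include <bpf/bpf_helpers.h>

      char LICENSE[] SEC("license") = "GPL";

      SEC("tracepoint/syscalls/sys_enter_nanosleep")
      int var_off_stack(void *ctx)
      {
              char buf[16] = {};
              /* runtime-dependent index -> variable offset into the stack */
              __u32 idx = bpf_get_prandom_u32() & 0xf;

              return buf[idx];
      }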

7) Improve tc & XDP MTU handling and add a new bpf_check_mtu() helper to
   query device MTU from programs, from Jesper Dangaard Brouer.
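
   A minimal sketch of calling the new helper from XDP, assuming headers and
   bpf_helper_defs.h generated from a kernel that carries this series; the
   ifindex and the drop policy are illustrative:

      #include <linux/bpf.h>
      #include <bpf/bpf_helpers.h>

      char LICENSE[] SEC("license") = "GPL";

      SEC("xdp")
      int xdp_mtu_guard(struct xdp_md *ctx)
      {
              __u32 mtu_len = 0;   /* receives the looked-up MTU */
              __u32 ifindex = 2;   /* illustrative egress ifindex */

              /* non-zero return: packet would exceed the device MTU */
              if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0))
                      return XDP_DROP;

              return XDP_PASS;
      }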

8) Allow the bpf_get_socket_cookie() helper to also be called from [sleepable] BPF
   tracing programs, from Florent Revest.

9) Extend the x86 JIT to pad JMPs with NOPs to help the image converge when
   otherwise too many passes would be required, from Gary Lin.

10) Verifier fixes on atomics with BPF_FETCH as well as function-by-function
    verification both related to zero-extension handling, from Ilya Leoshkevich.

11) Better kernel build integration of the resolve_btfids tool, from Jiri Olsa.

12) Batch of AF_XDP selftest cleanups and a small performance improvement
    for libbpf's xsk map redirect on newer kernels, from Björn Töpel.

13) Follow-up BPF doc and verifier improvements around atomics with
    BPF_FETCH, from Brendan Jackman.

14) Permit zero-sized data sections, e.g. if an ELF .rodata section contains
    read-only data from local variables, from Yonghong Song.

15) veth driver skb bulk-allocation for ndo_xdp_xmit, from Lorenzo Bianconi.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
17 files changed:
Makefile
include/linux/indirect_call_wrapper.h
include/linux/netdevice.h
include/net/sock.h
include/net/tcp.h
kernel/bpf/cgroup.c
kernel/bpf/verifier.c
kernel/trace/bpf_trace.c
net/core/dev.c
net/core/filter.c
net/ipv4/af_inet.c
net/ipv4/tcp.c
net/ipv4/tcp_ipv4.c
net/ipv4/udp.c
net/ipv6/af_inet6.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c

diff --combined Makefile
+++ b/Makefile
@@@ -2,7 -2,7 +2,7 @@@
  VERSION = 5
  PATCHLEVEL = 11
  SUBLEVEL = 0
 -EXTRAVERSION = -rc4
 +EXTRAVERSION = -rc7
  NAME = Kleptomaniac Octopus
  
  # *DOCUMENTATION*
@@@ -452,6 -452,7 +452,6 @@@ AWK                = aw
  INSTALLKERNEL  := installkernel
  DEPMOD                = depmod
  PERL          = perl
 -PYTHON                = python
  PYTHON3               = python3
  CHECK         = sparse
  BASH          = bash
@@@ -507,7 -508,7 +507,7 @@@ CLANG_FLAGS :
  
  export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC
  export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL
 -export PERL PYTHON PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
 +export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
  export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ ZSTD
  export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE
  
@@@ -648,8 -649,7 +648,8 @@@ ifeq ($(KBUILD_EXTMOD),
  core-y                := init/ usr/
  drivers-y     := drivers/ sound/
  drivers-$(CONFIG_SAMPLES) += samples/
 -drivers-y     += net/ virt/
 +drivers-$(CONFIG_NET) += net/
 +drivers-y     += virt/
  libs-y                := lib/
  endif # KBUILD_EXTMOD
  
@@@ -812,12 -812,10 +812,12 @@@ KBUILD_CFLAGS   += -ftrivial-auto-var-ini
  KBUILD_CFLAGS += -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang
  endif
  
 +DEBUG_CFLAGS  :=
 +
  # Workaround for GCC versions < 5.0
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61801
  ifdef CONFIG_CC_IS_GCC
 -DEBUG_CFLAGS  := $(call cc-ifversion, -lt, 0500, $(call cc-option, -fno-var-tracking-assignments))
 +DEBUG_CFLAGS  += $(call cc-ifversion, -lt, 0500, $(call cc-option, -fno-var-tracking-assignments))
  endif
  
  ifdef CONFIG_DEBUG_INFO
@@@ -950,6 -948,12 +950,6 @@@ KBUILD_CFLAGS   += $(call cc-option,-We
  # change __FILE__ to the relative path from the srctree
  KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
  
 -# ensure -fcf-protection is disabled when using retpoline as it is
 -# incompatible with -mindirect-branch=thunk-extern
 -ifdef CONFIG_RETPOLINE
 -KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
 -endif
 -
  # include additional Makefiles when needed
  include-y                     := scripts/Makefile.extrawarn
  include-$(CONFIG_KASAN)               += scripts/Makefile.kasan
@@@ -1082,6 -1086,17 +1082,17 @@@ ifdef CONFIG_STACK_VALIDATIO
    endif
  endif
  
+ PHONY += resolve_btfids_clean
+ resolve_btfids_O = $(abspath $(objtree))/tools/bpf/resolve_btfids
+ # tools/bpf/resolve_btfids directory might not exist
+ # in output directory, skip its clean in that case
+ resolve_btfids_clean:
+ ifneq ($(wildcard $(resolve_btfids_O)),)
+       $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean
+ endif
  ifdef CONFIG_BPF
  ifdef CONFIG_DEBUG_INFO_BTF
    ifeq ($(has_libelf),1)
@@@ -1491,7 -1506,7 +1502,7 @@@ vmlinuxclean
        $(Q)$(CONFIG_SHELL) $(srctree)/scripts/link-vmlinux.sh clean
        $(Q)$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) clean)
  
- clean: archclean vmlinuxclean
+ clean: archclean vmlinuxclean resolve_btfids_clean
  
  # mrproper - Delete all generated files, including .config
  #
diff --combined include/linux/indirect_call_wrapper.h
@@@ -36,7 -36,6 +36,7 @@@
  
  #define INDIRECT_CALLABLE_DECLARE(f)  f
  #define INDIRECT_CALLABLE_SCOPE
 +#define EXPORT_INDIRECT_CALLABLE(f)   EXPORT_SYMBOL(f)
  
  #else
  #define INDIRECT_CALL_1(f, f1, ...) f(__VA_ARGS__)
@@@ -45,7 -44,6 +45,7 @@@
  #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) f(__VA_ARGS__)
  #define INDIRECT_CALLABLE_DECLARE(f)
  #define INDIRECT_CALLABLE_SCOPE               static
 +#define EXPORT_INDIRECT_CALLABLE(f)
  #endif
  
  /*
  #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
  #endif
  
+ #if IS_ENABLED(CONFIG_INET)
+ #define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
+ #else
+ #define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__)
+ #endif
  #endif
diff --combined include/linux/netdevice.h
@@@ -347,7 -347,6 +347,7 @@@ struct napi_struct 
        struct list_head        dev_list;
        struct hlist_node       napi_hash_node;
        unsigned int            napi_id;
 +      struct task_struct      *thread;
  };
  
  enum {
        NAPI_STATE_NO_BUSY_POLL,        /* Do not add in napi_hash, no busy polling */
        NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_PREFER_BUSY_POLL,    /* prefer busy-polling over softirq processing*/
 +      NAPI_STATE_THREADED,            /* The poll is performed inside its own thread*/
  };
  
  enum {
        NAPIF_STATE_NO_BUSY_POLL        = BIT(NAPI_STATE_NO_BUSY_POLL),
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL    = BIT(NAPI_STATE_PREFER_BUSY_POLL),
 +      NAPIF_STATE_THREADED            = BIT(NAPI_STATE_THREADED),
  };
  
  enum gro_result {
@@@ -497,8 -494,6 +497,8 @@@ static inline bool napi_complete(struc
        return napi_complete_done(n, 0);
  }
  
 +int dev_set_threaded(struct net_device *dev, bool threaded);
 +
  /**
   *    napi_disable - prevent NAPI from scheduling
   *    @n: NAPI context
   */
  void napi_disable(struct napi_struct *n);
  
 -/**
 - *    napi_enable - enable NAPI scheduling
 - *    @n: NAPI context
 - *
 - * Resume NAPI from being scheduled on this context.
 - * Must be paired with napi_disable.
 - */
 -static inline void napi_enable(struct napi_struct *n)
 -{
 -      BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 -      smp_mb__before_atomic();
 -      clear_bit(NAPI_STATE_SCHED, &n->state);
 -      clear_bit(NAPI_STATE_NPSVC, &n->state);
 -}
 +void napi_enable(struct napi_struct *n);
  
  /**
   *    napi_synchronize - wait until NAPI is not running
@@@ -850,7 -858,6 +850,7 @@@ enum tc_setup_type 
        TC_SETUP_QDISC_ETS,
        TC_SETUP_QDISC_TBF,
        TC_SETUP_QDISC_FIFO,
 +      TC_SETUP_QDISC_HTB,
  };
  
  /* These structures hold the attributes of bpf state that are being passed
@@@ -1819,8 -1826,6 +1819,8 @@@ enum netdev_priv_flags 
   *
   *    @wol_enabled:   Wake-on-LAN is enabled
   *
 + *    @threaded:      napi threaded mode is enabled
 + *
   *    @net_notifier_list:     List of per-net netdev notifier block
   *                            that follow this device when it is moved
   *                            to another network namespace.
@@@ -1852,6 -1857,7 +1852,6 @@@ struct net_device 
        unsigned long           mem_end;
        unsigned long           mem_start;
        unsigned long           base_addr;
 -      int                     irq;
  
        /*
         *      Some hardware also needs these fields (state,dev_list,
                struct list_head lower;
        } adj_list;
  
 +      /* Read-mostly cache-line for fast-path access */
 +      unsigned int            flags;
 +      unsigned int            priv_flags;
 +      const struct net_device_ops *netdev_ops;
 +      int                     ifindex;
 +      unsigned short          gflags;
 +      unsigned short          hard_header_len;
 +
 +      /* Note : dev->mtu is often read without holding a lock.
 +       * Writers usually hold RTNL.
 +       * It is recommended to use READ_ONCE() to annotate the reads,
 +       * and to use WRITE_ONCE() to annotate the writes.
 +       */
 +      unsigned int            mtu;
 +      unsigned short          needed_headroom;
 +      unsigned short          needed_tailroom;
 +
        netdev_features_t       features;
        netdev_features_t       hw_features;
        netdev_features_t       wanted_features;
        netdev_features_t       mpls_features;
        netdev_features_t       gso_partial_features;
  
 -      int                     ifindex;
 +      unsigned int            min_mtu;
 +      unsigned int            max_mtu;
 +      unsigned short          type;
 +      unsigned char           min_header_len;
 +      unsigned char           name_assign_type;
 +
        int                     group;
  
 -      struct net_device_stats stats;
 +      struct net_device_stats stats; /* not used by modern drivers */
  
        atomic_long_t           rx_dropped;
        atomic_long_t           tx_dropped;
        const struct iw_handler_def *wireless_handlers;
        struct iw_public_data   *wireless_data;
  #endif
 -      const struct net_device_ops *netdev_ops;
        const struct ethtool_ops *ethtool_ops;
  #ifdef CONFIG_NET_L3_MASTER_DEV
        const struct l3mdev_ops *l3mdev_ops;
  
        const struct header_ops *header_ops;
  
 -      unsigned int            flags;
 -      unsigned int            priv_flags;
 -
 -      unsigned short          gflags;
 -      unsigned short          padded;
 -
        unsigned char           operstate;
        unsigned char           link_mode;
  
        unsigned char           if_port;
        unsigned char           dma;
  
 -      /* Note : dev->mtu is often read without holding a lock.
 -       * Writers usually hold RTNL.
 -       * It is recommended to use READ_ONCE() to annotate the reads,
 -       * and to use WRITE_ONCE() to annotate the writes.
 -       */
 -      unsigned int            mtu;
 -      unsigned int            min_mtu;
 -      unsigned int            max_mtu;
 -      unsigned short          type;
 -      unsigned short          hard_header_len;
 -      unsigned char           min_header_len;
 -      unsigned char           name_assign_type;
 -
 -      unsigned short          needed_headroom;
 -      unsigned short          needed_tailroom;
 -
        /* Interface address info. */
        unsigned char           perm_addr[MAX_ADDR_LEN];
        unsigned char           addr_assign_type;
        unsigned short          neigh_priv_len;
        unsigned short          dev_id;
        unsigned short          dev_port;
 +      unsigned short          padded;
 +
        spinlock_t              addr_list_lock;
 +      int                     irq;
  
        struct netdev_hw_addr_list      uc;
        struct netdev_hw_addr_list      mc;
        struct lock_class_key   *qdisc_running_key;
        bool                    proto_down;
        unsigned                wol_enabled:1;
 +      unsigned                threaded:1;
  
        struct list_head        net_notifier_list;
  
@@@ -3902,9 -3905,6 +3902,9 @@@ int dev_pre_changeaddr_notify(struct ne
                              struct netlink_ext_ack *extack);
  int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                        struct netlink_ext_ack *extack);
 +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 +                           struct netlink_ext_ack *extack);
 +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
  int dev_change_carrier(struct net_device *, bool new_carrier);
  int dev_get_phys_port_id(struct net_device *dev,
                         struct netdev_phys_item_id *ppid);
@@@ -3931,14 -3931,42 +3931,42 @@@ int xdp_umem_query(struct net_device *d
  
  int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
  int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
+ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
  bool is_skb_forwardable(const struct net_device *dev,
                        const struct sk_buff *skb);
  
+ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
+                                                const struct sk_buff *skb,
+                                                const bool check_mtu)
+ {
+       const u32 vlan_hdr_len = 4; /* VLAN_HLEN */
+       unsigned int len;
+       if (!(dev->flags & IFF_UP))
+               return false;
+       if (!check_mtu)
+               return true;
+       len = dev->mtu + dev->hard_header_len + vlan_hdr_len;
+       if (skb->len <= len)
+               return true;
+       /* if TSO is enabled, we don't care about the length as the packet
+        * could be forwarded without being segmented before
+        */
+       if (skb_is_gso(skb))
+               return true;
+       return false;
+ }
  static __always_inline int ____dev_forward_skb(struct net_device *dev,
-                                              struct sk_buff *skb)
+                                              struct sk_buff *skb,
+                                              const bool check_mtu)
  {
        if (skb_orphan_frags(skb, GFP_ATOMIC) ||
-           unlikely(!is_skb_forwardable(dev, skb))) {
+           unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) {
                atomic_long_inc(&dev->rx_dropped);
                kfree_skb(skb);
                return NET_RX_DROP;
@@@ -4339,7 -4367,6 +4367,7 @@@ static inline void netif_tx_disable(str
  
        local_bh_disable();
        cpu = smp_processor_id();
 +      spin_lock(&dev->tx_global_lock);
        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
  
                netif_tx_stop_queue(txq);
                __netif_tx_unlock(txq);
        }
 +      spin_unlock(&dev->tx_global_lock);
        local_bh_enable();
  }
  
diff --combined include/net/sock.h
@@@ -226,7 -226,7 +226,7 @@@ struct sock_common 
                struct hlist_nulls_node skc_nulls_node;
        };
        unsigned short          skc_tx_queue_mapping;
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        unsigned short          skc_rx_queue_mapping;
  #endif
        union {
@@@ -356,7 -356,7 +356,7 @@@ struct sock 
  #define sk_nulls_node         __sk_common.skc_nulls_node
  #define sk_refcnt             __sk_common.skc_refcnt
  #define sk_tx_queue_mapping   __sk_common.skc_tx_queue_mapping
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
  #define sk_rx_queue_mapping   __sk_common.skc_rx_queue_mapping
  #endif
  
@@@ -1174,6 -1174,8 +1174,8 @@@ struct proto 
  
        int                     (*backlog_rcv) (struct sock *sk,
                                                struct sk_buff *skb);
+       bool                    (*bpf_bypass_getsockopt)(int level,
+                                                        int optname);
  
        void            (*release_cb)(struct sock *sk);
  
@@@ -1350,18 -1352,14 +1352,18 @@@ sk_memory_allocated_sub(struct sock *sk
        atomic_long_sub(amt, sk->sk_prot->memory_allocated);
  }
  
 +#define SK_ALLOC_PERCPU_COUNTER_BATCH 16
 +
  static inline void sk_sockets_allocated_dec(struct sock *sk)
  {
 -      percpu_counter_dec(sk->sk_prot->sockets_allocated);
 +      percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1,
 +                               SK_ALLOC_PERCPU_COUNTER_BATCH);
  }
  
  static inline void sk_sockets_allocated_inc(struct sock *sk)
  {
 -      percpu_counter_inc(sk->sk_prot->sockets_allocated);
 +      percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1,
 +                               SK_ALLOC_PERCPU_COUNTER_BATCH);
  }
  
  static inline u64
@@@ -1838,7 -1836,7 +1840,7 @@@ static inline int sk_tx_queue_get(cons
  
  static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
  {
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        if (skb_rx_queue_recorded(skb)) {
                u16 rx_queue = skb_get_rx_queue(skb);
  
  
  static inline void sk_rx_queue_clear(struct sock *sk)
  {
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
  #endif
  }
  
 -#ifdef CONFIG_XPS
  static inline int sk_rx_queue_get(const struct sock *sk)
  {
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
                return sk->sk_rx_queue_mapping;
 +#endif
  
        return -1;
  }
 -#endif
  
  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
  {
diff --combined include/net/tcp.h
@@@ -403,6 -403,7 +403,7 @@@ __poll_t tcp_poll(struct file *file, st
                      struct poll_table_struct *wait);
  int tcp_getsockopt(struct sock *sk, int level, int optname,
                   char __user *optval, int __user *optlen);
+ bool tcp_bpf_bypass_getsockopt(int level, int optname);
  int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                   unsigned int optlen);
  void tcp_set_keepalive(struct sock *sk, int val);
@@@ -630,7 -631,6 +631,7 @@@ static inline void tcp_clear_xmit_timer
  
  unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
  unsigned int tcp_current_mss(struct sock *sk);
 +u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
  
  /* Bound MSS / TSO packet size with the half of the window */
  static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
@@@ -1431,29 -1431,12 +1432,29 @@@ void tcp_cleanup_rbuf(struct sock *sk, 
   */
  static inline bool tcp_rmem_pressure(const struct sock *sk)
  {
 -      int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
 -      int threshold = rcvbuf - (rcvbuf >> 3);
 +      int rcvbuf, threshold;
 +
 +      if (tcp_under_memory_pressure(sk))
 +              return true;
 +
 +      rcvbuf = READ_ONCE(sk->sk_rcvbuf);
 +      threshold = rcvbuf - (rcvbuf >> 3);
  
        return atomic_read(&sk->sk_rmem_alloc) > threshold;
  }
  
 +static inline bool tcp_epollin_ready(const struct sock *sk, int target)
 +{
 +      const struct tcp_sock *tp = tcp_sk(sk);
 +      int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
 +
 +      if (avail <= 0)
 +              return false;
 +
 +      return (avail >= target) || tcp_rmem_pressure(sk) ||
 +             (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss);
 +}
 +
  extern void tcp_openreq_init_rwin(struct request_sock *req,
                                  const struct sock *sk_listener,
                                  const struct dst_entry *dst);
@@@ -2078,7 -2061,7 +2079,7 @@@ void tcp_mark_skb_lost(struct sock *sk
  void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
  extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
                                u32 reo_wnd);
 -extern void tcp_rack_mark_lost(struct sock *sk);
 +extern bool tcp_rack_mark_lost(struct sock *sk);
  extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
                             u64 xmit_time);
  extern void tcp_rack_reo_timeout(struct sock *sk);
diff --combined kernel/bpf/cgroup.c
@@@ -19,7 -19,7 +19,7 @@@
  
  #include "../cgroup/cgroup-internal.h"
  
- DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);
  EXPORT_SYMBOL(cgroup_bpf_enabled_key);
  
  void cgroup_bpf_offline(struct cgroup *cgrp)
@@@ -128,7 -128,7 +128,7 @@@ static void cgroup_bpf_release(struct w
                        if (pl->link)
                                bpf_cgroup_link_auto_detach(pl->link);
                        kfree(pl);
-                       static_branch_dec(&cgroup_bpf_enabled_key);
+                       static_branch_dec(&cgroup_bpf_enabled_key[type]);
                }
                old_array = rcu_dereference_protected(
                                cgrp->bpf.effective[type],
@@@ -499,7 -499,7 +499,7 @@@ int __cgroup_bpf_attach(struct cgroup *
        if (old_prog)
                bpf_prog_put(old_prog);
        else
-               static_branch_inc(&cgroup_bpf_enabled_key);
+               static_branch_inc(&cgroup_bpf_enabled_key[type]);
        bpf_cgroup_storages_link(new_storage, cgrp, type);
        return 0;
  
@@@ -698,7 -698,7 +698,7 @@@ int __cgroup_bpf_detach(struct cgroup *
                cgrp->bpf.flags[type] = 0;
        if (old_prog)
                bpf_prog_put(old_prog);
-       static_branch_dec(&cgroup_bpf_enabled_key);
+       static_branch_dec(&cgroup_bpf_enabled_key[type]);
        return 0;
  
  cleanup:
@@@ -1055,6 -1055,8 +1055,8 @@@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_s
   * @uaddr: sockaddr struct provided by user
   * @type: The type of program to be exectuted
   * @t_ctx: Pointer to attach type specific context
+  * @flags: Pointer to u32 which contains higher bits of BPF program
+  *         return value (OR'ed together).
   *
   * socket is expected to be of type INET or INET6.
   *
  int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
                                      struct sockaddr *uaddr,
                                      enum bpf_attach_type type,
-                                     void *t_ctx)
+                                     void *t_ctx,
+                                     u32 *flags)
  {
        struct bpf_sock_addr_kern ctx = {
                .sk = sk,
        }
  
        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
+                                      BPF_PROG_RUN, flags);
  
        return ret == 1 ? 0 : -EPERM;
  }
@@@ -1298,7 -1302,8 +1302,8 @@@ static bool __cgroup_bpf_prog_array_is_
        return empty;
  }
  
- static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+                            struct bpf_sockopt_buf *buf)
  {
        if (unlikely(max_optlen < 0))
                return -EINVAL;
                max_optlen = PAGE_SIZE;
        }
  
+       if (max_optlen <= sizeof(buf->data)) {
+               /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+                * bytes avoid the cost of kzalloc.
+                */
+               ctx->optval = buf->data;
+               ctx->optval_end = ctx->optval + max_optlen;
+               return max_optlen;
+       }
        ctx->optval = kzalloc(max_optlen, GFP_USER);
        if (!ctx->optval)
                return -ENOMEM;
        return max_optlen;
  }
  
- static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+                            struct bpf_sockopt_buf *buf)
  {
+       if (ctx->optval == buf->data)
+               return;
        kfree(ctx->optval);
  }
  
+ static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+                                 struct bpf_sockopt_buf *buf)
+ {
+       return ctx->optval != buf->data;
+ }
  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
                                       int *optname, char __user *optval,
                                       int *optlen, char **kernel_optval)
  {
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+       struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = *level,
         * attached to the hook so we don't waste time allocating
         * memory and locking the socket.
         */
-       if (!cgroup_bpf_enabled ||
-           __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+       if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
                return 0;
  
        /* Allocate a bit more than the initial user buffer for
         */
        max_optlen = max_t(int, 16, *optlen);
  
-       max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+       max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;
  
                 */
                if (ctx.optlen != 0) {
                        *optlen = ctx.optlen;
-                       *kernel_optval = ctx.optval;
+                       /* We've used bpf_sockopt_kern->buf as an intermediary
+                        * storage, but the BPF program indicates that we need
+                        * to pass this data to the kernel setsockopt handler.
+                        * No way to export on-stack buf, have to allocate a
+                        * new buffer.
+                        */
+                       if (!sockopt_buf_allocated(&ctx, &buf)) {
+                               void *p = kmalloc(ctx.optlen, GFP_USER);
+                               if (!p) {
+                                       ret = -ENOMEM;
+                                       goto out;
+                               }
+                               memcpy(p, ctx.optval, ctx.optlen);
+                               *kernel_optval = p;
+                       } else {
+                               *kernel_optval = ctx.optval;
+                       }
                        /* export and don't free sockopt buf */
                        return 0;
                }
        }
  
  out:
-       sockopt_free_buf(&ctx);
+       sockopt_free_buf(&ctx, &buf);
        return ret;
  }
  
@@@ -1407,6 -1447,7 +1447,7 @@@ int __cgroup_bpf_run_filter_getsockopt(
                                       int retval)
  {
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+       struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = level,
         * attached to the hook so we don't waste time allocating
         * memory and locking the socket.
         */
-       if (!cgroup_bpf_enabled ||
-           __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+       if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
                return retval;
  
        ctx.optlen = max_optlen;
  
-       max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+       max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;
  
                        goto out;
                }
  
 +              if (ctx.optlen < 0) {
 +                      ret = -EFAULT;
 +                      goto out;
 +              }
 +
                if (copy_from_user(ctx.optval, optval,
                                   min(ctx.optlen, max_optlen)) != 0) {
                        ret = -EFAULT;
                goto out;
        }
  
 -      if (ctx.optlen > max_optlen) {
 +      if (ctx.optlen > max_optlen || ctx.optlen < 0) {
                ret = -EFAULT;
                goto out;
        }
        ret = ctx.retval;
  
  out:
-       sockopt_free_buf(&ctx);
+       sockopt_free_buf(&ctx, &buf);
        return ret;
  }
+ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+                                           int optname, void *optval,
+                                           int *optlen, int retval)
+ {
+       struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+       struct bpf_sockopt_kern ctx = {
+               .sk = sk,
+               .level = level,
+               .optname = optname,
+               .retval = retval,
+               .optlen = *optlen,
+               .optval = optval,
+               .optval_end = optval + *optlen,
+       };
+       int ret;
+       /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
+        * user data back into BPF buffer when reval != 0. This is
+        * done as an optimization to avoid extra copy, assuming
+        * kernel won't populate the data in case of an error.
+        * Here we always pass the data and memset() should
+        * be called if that data shouldn't be "exported".
+        */
+       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+                                &ctx, BPF_PROG_RUN);
+       if (!ret)
+               return -EPERM;
+       if (ctx.optlen > *optlen)
+               return -EFAULT;
+       /* BPF programs only allowed to set retval to 0, not some
+        * arbitrary value.
+        */
+       if (ctx.retval != 0 && ctx.retval != retval)
+               return -EFAULT;
+       /* BPF programs can shrink the buffer, export the modifications.
+        */
+       if (ctx.optlen != 0)
+               *optlen = ctx.optlen;
+       return ctx.retval;
+ }
  #endif
  
  static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --combined kernel/bpf/verifier.c
@@@ -228,6 -228,12 +228,12 @@@ static void bpf_map_key_store(struct bp
                             (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
  }
  
+ static bool bpf_pseudo_call(const struct bpf_insn *insn)
+ {
+       return insn->code == (BPF_JMP | BPF_CALL) &&
+              insn->src_reg == BPF_PSEUDO_CALL;
+ }
  struct bpf_call_arg_meta {
        struct bpf_map *map_ptr;
        bool raw_mode;
@@@ -1073,6 -1079,51 +1079,51 @@@ static void mark_reg_known_zero(struct 
        __mark_reg_known_zero(regs + regno);
  }
  
+ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
+ {
+       switch (reg->type) {
+       case PTR_TO_MAP_VALUE_OR_NULL: {
+               const struct bpf_map *map = reg->map_ptr;
+               if (map->inner_map_meta) {
+                       reg->type = CONST_PTR_TO_MAP;
+                       reg->map_ptr = map->inner_map_meta;
+               } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+                       reg->type = PTR_TO_XDP_SOCK;
+               } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+                          map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+                       reg->type = PTR_TO_SOCKET;
+               } else {
+                       reg->type = PTR_TO_MAP_VALUE;
+               }
+               break;
+       }
+       case PTR_TO_SOCKET_OR_NULL:
+               reg->type = PTR_TO_SOCKET;
+               break;
+       case PTR_TO_SOCK_COMMON_OR_NULL:
+               reg->type = PTR_TO_SOCK_COMMON;
+               break;
+       case PTR_TO_TCP_SOCK_OR_NULL:
+               reg->type = PTR_TO_TCP_SOCK;
+               break;
+       case PTR_TO_BTF_ID_OR_NULL:
+               reg->type = PTR_TO_BTF_ID;
+               break;
+       case PTR_TO_MEM_OR_NULL:
+               reg->type = PTR_TO_MEM;
+               break;
+       case PTR_TO_RDONLY_BUF_OR_NULL:
+               reg->type = PTR_TO_RDONLY_BUF;
+               break;
+       case PTR_TO_RDWR_BUF_OR_NULL:
+               reg->type = PTR_TO_RDWR_BUF;
+               break;
+       default:
+               WARN_ON("unknown nullable register type");
+       }
+ }
  static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
  {
        return type_is_pkt_pointer(reg->type);
@@@ -1486,9 -1537,7 +1537,7 @@@ static int check_subprogs(struct bpf_ve
  
        /* determine subprog starts. The end is one before the next starts */
        for (i = 0; i < insn_cnt; i++) {
-               if (insn[i].code != (BPF_JMP | BPF_CALL))
-                       continue;
-               if (insn[i].src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn + i))
                        continue;
                if (!env->bpf_capable) {
                        verbose(env,
@@@ -2271,12 -2320,14 +2320,14 @@@ static void save_register_state(struct 
                state->stack[spi].slot_type[i] = STACK_SPILL;
  }
  
- /* check_stack_read/write functions track spill/fill of registers,
+ /* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
   * stack boundary and alignment are checked in check_mem_access()
   */
- static int check_stack_write(struct bpf_verifier_env *env,
-                            struct bpf_func_state *state, /* func where register points to */
-                            int off, int size, int value_regno, int insn_idx)
+ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
+                                      /* stack frame we're writing to */
+                                      struct bpf_func_state *state,
+                                      int off, int size, int value_regno,
+                                      int insn_idx)
  {
        struct bpf_func_state *cur; /* state of the current function */
        int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
        return 0;
  }
  
- static int check_stack_read(struct bpf_verifier_env *env,
-                           struct bpf_func_state *reg_state /* func where register points to */,
-                           int off, int size, int value_regno)
+ /* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
+  * known to contain a variable offset.
+  * This function checks whether the write is permitted and conservatively
+  * tracks the effects of the write, considering that each stack slot in the
+  * dynamic range is potentially written to.
+  *
+  * 'off' includes 'regno->off'.
+  * 'value_regno' can be -1, meaning that an unknown value is being written to
+  * the stack.
+  *
+  * Spilled pointers in range are not marked as written because we don't know
+  * what's going to be actually written. This means that read propagation for
+  * future reads cannot be terminated by this write.
+  *
+  * For privileged programs, uninitialized stack slots are considered
+  * initialized by this write (even though we don't know exactly what offsets
+  * are going to be written to). The idea is that we don't want the verifier to
+  * reject future reads that access slots written to through variable offsets.
+  */
+ static int check_stack_write_var_off(struct bpf_verifier_env *env,
+                                    /* func where register points to */
+                                    struct bpf_func_state *state,
+                                    int ptr_regno, int off, int size,
+                                    int value_regno, int insn_idx)
+ {
+       struct bpf_func_state *cur; /* state of the current function */
+       int min_off, max_off;
+       int i, err;
+       struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+       bool writing_zero = false;
+       /* set if the fact that we're writing a zero is used to let any
+        * stack slots remain STACK_ZERO
+        */
+       bool zero_used = false;
+       cur = env->cur_state->frame[env->cur_state->curframe];
+       ptr_reg = &cur->regs[ptr_regno];
+       min_off = ptr_reg->smin_value + off;
+       max_off = ptr_reg->smax_value + off + size;
+       if (value_regno >= 0)
+               value_reg = &cur->regs[value_regno];
+       if (value_reg && register_is_null(value_reg))
+               writing_zero = true;
+       err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
+                                state->acquired_refs, true);
+       if (err)
+               return err;
+       /* Variable offset writes destroy any spilled pointers in range. */
+       for (i = min_off; i < max_off; i++) {
+               u8 new_type, *stype;
+               int slot, spi;
+               slot = -i - 1;
+               spi = slot / BPF_REG_SIZE;
+               stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+               if (!env->allow_ptr_leaks
+                               && *stype != NOT_INIT
+                               && *stype != SCALAR_VALUE) {
+                       /* Reject the write if there's are spilled pointers in
+                        * range. If we didn't reject here, the ptr status
+                        * would be erased below (even though not all slots are
+                        * actually overwritten), possibly opening the door to
+                        * leaks.
+                        */
+                       verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
+                               insn_idx, i);
+                       return -EINVAL;
+               }
+               /* Erase all spilled pointers. */
+               state->stack[spi].spilled_ptr.type = NOT_INIT;
+               /* Update the slot type. */
+               new_type = STACK_MISC;
+               if (writing_zero && *stype == STACK_ZERO) {
+                       new_type = STACK_ZERO;
+                       zero_used = true;
+               }
+               /* If the slot is STACK_INVALID, we check whether it's OK to
+                * pretend that it will be initialized by this write. The slot
+                * might not actually be written to, and so if we mark it as
+                * initialized future reads might leak uninitialized memory.
+                * For privileged programs, we will accept such reads to slots
+                * that may or may not be written because, if we're reject
+                * them, the error would be too confusing.
+                */
+               if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+                       verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
+                                       insn_idx, i);
+                       return -EINVAL;
+               }
+               *stype = new_type;
+       }
+       if (zero_used) {
+               /* backtracking doesn't work for STACK_ZERO yet. */
+               err = mark_chain_precision(env, value_regno);
+               if (err)
+                       return err;
+       }
+       return 0;
+ }
+ /* When register 'dst_regno' is assigned some values from stack[min_off,
+  * max_off), we set the register's type according to the types of the
+  * respective stack slots. If all the stack values are known to be zeros, then
+  * so is the destination reg. Otherwise, the register is considered to be
+  * SCALAR. This function does not deal with register filling; the caller must
+  * ensure that all spilled registers in the stack range have been marked as
+  * read.
+  */
+ static void mark_reg_stack_read(struct bpf_verifier_env *env,
+                               /* func where src register points to */
+                               struct bpf_func_state *ptr_state,
+                               int min_off, int max_off, int dst_regno)
+ {
+       struct bpf_verifier_state *vstate = env->cur_state;
+       struct bpf_func_state *state = vstate->frame[vstate->curframe];
+       int i, slot, spi;
+       u8 *stype;
+       int zeros = 0;
+       for (i = min_off; i < max_off; i++) {
+               slot = -i - 1;
+               spi = slot / BPF_REG_SIZE;
+               stype = ptr_state->stack[spi].slot_type;
+               if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
+                       break;
+               zeros++;
+       }
+       if (zeros == max_off - min_off) {
+               /* any access_size read into register is zero extended,
+                * so the whole register == const_zero
+                */
+               __mark_reg_const_zero(&state->regs[dst_regno]);
+               /* backtracking doesn't support STACK_ZERO yet,
+                * so mark it precise here, so that later
+                * backtracking can stop here.
+                * Backtracking may not need this if this register
+                * doesn't participate in pointer adjustment.
+                * Forward propagation of precise flag is not
+                * necessary either. This mark is only to stop
+                * backtracking. Any register that contributed
+                * to const 0 was marked precise before spill.
+                */
+               state->regs[dst_regno].precise = true;
+       } else {
+               /* have read misc data from the stack */
+               mark_reg_unknown(env, state->regs, dst_regno);
+       }
+       state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
+ }
+ /* Read the stack at 'off' and put the results into the register indicated by
+  * 'dst_regno'. It handles reg filling if the addressed stack slot is a
+  * spilled reg.
+  *
+  * 'dst_regno' can be -1, meaning that the read value is not going to a
+  * register.
+  *
+  * The access is assumed to be within the current stack bounds.
+  */
+ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
+                                     /* func where src register points to */
+                                     struct bpf_func_state *reg_state,
+                                     int off, int size, int dst_regno)
  {
        struct bpf_verifier_state *vstate = env->cur_state;
        struct bpf_func_state *state = vstate->frame[vstate->curframe];
        struct bpf_reg_state *reg;
        u8 *stype;
  
-       if (reg_state->allocated_stack <= slot) {
-               verbose(env, "invalid read from stack off %d+0 size %d\n",
-                       off, size);
-               return -EACCES;
-       }
        stype = reg_state->stack[spi].slot_type;
        reg = &reg_state->stack[spi].spilled_ptr;
  
                                verbose(env, "invalid size of register fill\n");
                                return -EACCES;
                        }
-                       if (value_regno >= 0) {
-                               mark_reg_unknown(env, state->regs, value_regno);
-                               state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+                       if (dst_regno >= 0) {
+                               mark_reg_unknown(env, state->regs, dst_regno);
+                               state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
                        }
                        mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
                        return 0;
                        }
                }
  
-               if (value_regno >= 0) {
+               if (dst_regno >= 0) {
                        /* restore register state from stack */
-                       state->regs[value_regno] = *reg;
+                       state->regs[dst_regno] = *reg;
                        /* mark reg as written since spilled pointer state likely
                         * has its liveness marks cleared by is_state_visited()
                         * which resets stack/reg liveness for state transitions
                         */
-                       state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+                       state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
                } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
-                       /* If value_regno==-1, the caller is asking us whether
+                       /* If dst_regno==-1, the caller is asking us whether
                         * it is acceptable to use this value as a SCALAR_VALUE
                         * (e.g. for XADD).
                         * We must not allow unprivileged callers to do that
                }
                mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
        } else {
-               int zeros = 0;
+               u8 type;
  
                for (i = 0; i < size; i++) {
-                       if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+                       type = stype[(slot - i) % BPF_REG_SIZE];
+                       if (type == STACK_MISC)
                                continue;
-                       if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
-                               zeros++;
+                       if (type == STACK_ZERO)
                                continue;
-                       }
                        verbose(env, "invalid read from stack off %d+%d size %d\n",
                                off, i, size);
                        return -EACCES;
                }
                mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
-               if (value_regno >= 0) {
-                       if (zeros == size) {
-                               /* any size read into register is zero extended,
-                                * so the whole register == const_zero
-                                */
-                               __mark_reg_const_zero(&state->regs[value_regno]);
-                               /* backtracking doesn't support STACK_ZERO yet,
-                                * so mark it precise here, so that later
-                                * backtracking can stop here.
-                                * Backtracking may not need this if this register
-                                * doesn't participate in pointer adjustment.
-                                * Forward propagation of precise flag is not
-                                * necessary either. This mark is only to stop
-                                * backtracking. Any register that contributed
-                                * to const 0 was marked precise before spill.
-                                */
-                               state->regs[value_regno].precise = true;
-                       } else {
-                               /* have read misc data from the stack */
-                               mark_reg_unknown(env, state->regs, value_regno);
-                       }
-                       state->regs[value_regno].live |= REG_LIVE_WRITTEN;
-               }
+               if (dst_regno >= 0)
+                       mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
        }
        return 0;
  }
  
- static int check_stack_access(struct bpf_verifier_env *env,
-                             const struct bpf_reg_state *reg,
-                             int off, int size)
+ enum stack_access_src {
+       ACCESS_DIRECT = 1,  /* the access is performed by an instruction */
+       ACCESS_HELPER = 2,  /* the access is performed by a helper */
+ };
+ static int check_stack_range_initialized(struct bpf_verifier_env *env,
+                                        int regno, int off, int access_size,
+                                        bool zero_size_allowed,
+                                        enum stack_access_src type,
+                                        struct bpf_call_arg_meta *meta);
+ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
+ {
+       return cur_regs(env) + regno;
+ }
+ /* Read the stack at 'ptr_regno + off' and put the result into the register
+  * 'dst_regno'.
+  * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
+  * but not its variable offset.
+  * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
+  *
+  * As opposed to check_stack_read_fixed_off, this function doesn't deal with
+  * filling registers (i.e. reads of spilled register cannot be detected when
+  * the offset is not fixed). We conservatively mark 'dst_regno' as containing
+  * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
+  * offset; for a fixed offset check_stack_read_fixed_off should be used
+  * instead.
+  */
+ static int check_stack_read_var_off(struct bpf_verifier_env *env,
+                                   int ptr_regno, int off, int size, int dst_regno)
  {
-       /* Stack accesses must be at a fixed offset, so that we
-        * can determine what type of data were returned. See
-        * check_stack_read().
+       /* The state of the source register. */
+       struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+       struct bpf_func_state *ptr_state = func(env, reg);
+       int err;
+       int min_off, max_off;
+       /* Note that we pass a NULL meta, so raw access will not be permitted.
         */
-       if (!tnum_is_const(reg->var_off)) {
+       err = check_stack_range_initialized(env, ptr_regno, off, size,
+                                           false, ACCESS_DIRECT, NULL);
+       if (err)
+               return err;
+       min_off = reg->smin_value + off;
+       max_off = reg->smax_value + off;
+       mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+       return 0;
+ }
+ /* check_stack_read dispatches to check_stack_read_fixed_off or
+  * check_stack_read_var_off.
+  *
+  * The caller must ensure that the offset falls within the allocated stack
+  * bounds.
+  *
+  * 'dst_regno' is a register which will receive the value from the stack. It
+  * can be -1, meaning that the read value is not going to a register.
+  */
+ static int check_stack_read(struct bpf_verifier_env *env,
+                           int ptr_regno, int off, int size,
+                           int dst_regno)
+ {
+       struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+       struct bpf_func_state *state = func(env, reg);
+       int err;
+       /* Some accesses are only permitted with a static offset. */
+       bool var_off = !tnum_is_const(reg->var_off);
+       /* The offset is required to be static when reads don't go to a
+        * register, in order to not leak pointers (see
+        * check_stack_read_fixed_off).
+        */
+       if (dst_regno < 0 && var_off) {
                char tn_buf[48];
  
                tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-               verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
+               verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
                        tn_buf, off, size);
                return -EACCES;
        }
+       /* Variable offset is prohibited for unprivileged mode for simplicity
+        * since it requires corresponding support in Spectre masking for stack
+        * ALU. See also retrieve_ptr_limit().
+        */
+       if (!env->bypass_spec_v1 && var_off) {
+               char tn_buf[48];
  
-       if (off >= 0 || off < -MAX_BPF_STACK) {
-               verbose(env, "invalid stack off=%d size=%d\n", off, size);
+               tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+               verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+                               ptr_regno, tn_buf);
                return -EACCES;
        }
  
-       return 0;
+       if (!var_off) {
+               off += reg->var_off.value;
+               err = check_stack_read_fixed_off(env, state, off, size,
+                                                dst_regno);
+       } else {
+               /* Variable offset stack reads need more conservative handling
+                * than fixed offset ones. Note that dst_regno >= 0 on this
+                * branch.
+                */
+               err = check_stack_read_var_off(env, ptr_regno, off, size,
+                                              dst_regno);
+       }
+       return err;
+ }
+ /* check_stack_write dispatches to check_stack_write_fixed_off or
+  * check_stack_write_var_off.
+  *
+  * 'ptr_regno' is the register used as a pointer into the stack.
+  * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
+  * 'value_regno' is the register whose value we're writing to the stack. It can
+  * be -1, meaning that we're not writing from a register.
+  *
+  * The caller must ensure that the offset falls within the maximum stack size.
+  */
+ static int check_stack_write(struct bpf_verifier_env *env,
+                            int ptr_regno, int off, int size,
+                            int value_regno, int insn_idx)
+ {
+       struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+       struct bpf_func_state *state = func(env, reg);
+       int err;
+       if (tnum_is_const(reg->var_off)) {
+               off += reg->var_off.value;
+               err = check_stack_write_fixed_off(env, state, off, size,
+                                                 value_regno, insn_idx);
+       } else {
+               /* Variable offset stack reads need more conservative handling
+                * than fixed offset ones.
+                */
+               err = check_stack_write_var_off(env, state,
+                                               ptr_regno, off, size,
+                                               value_regno, insn_idx);
+       }
+       return err;
  }
  
  static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
@@@ -2858,11 -3167,6 +3167,6 @@@ static int check_sock_access(struct bpf
        return -EACCES;
  }
  
- static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
- {
-       return cur_regs(env) + regno;
- }
  static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
  {
        return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
@@@ -2981,8 -3285,8 +3285,8 @@@ static int check_ptr_alignment(struct b
                break;
        case PTR_TO_STACK:
                pointer_desc = "stack ";
-               /* The stack spill tracking logic in check_stack_write()
-                * and check_stack_read() relies on stack accesses being
+               /* The stack spill tracking logic in check_stack_write_fixed_off()
+                * and check_stack_read_fixed_off() relies on stack accesses being
                 * aligned.
                 */
                strict = true;
@@@ -3074,9 -3378,7 +3378,7 @@@ process_func
  continue_func:
        subprog_end = subprog[idx + 1].start;
        for (; i < subprog_end; i++) {
-               if (insn[i].code != (BPF_JMP | BPF_CALL))
-                       continue;
-               if (insn[i].src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn + i))
                        continue;
                /* remember insn and function to return to */
                ret_insn[frame] = i + 1;
@@@ -3400,6 -3702,91 +3702,91 @@@ static int check_ptr_to_map_access(stru
        return 0;
  }
  
+ /* Check that the stack access at the given offset is within bounds. The
+  * maximum valid offset is -1.
+  *
+  * The minimum valid offset is -MAX_BPF_STACK for writes, and
+  * -state->allocated_stack for reads.
+  */
+ static int check_stack_slot_within_bounds(int off,
+                                         struct bpf_func_state *state,
+                                         enum bpf_access_type t)
+ {
+       int min_valid_off;
+       if (t == BPF_WRITE)
+               min_valid_off = -MAX_BPF_STACK;
+       else
+               min_valid_off = -state->allocated_stack;
+       if (off < min_valid_off || off > -1)
+               return -EACCES;
+       return 0;
+ }
+
+ /* Check that the stack access at 'regno + off' falls within the maximum stack
+  * bounds.
+  *
+  * 'off' includes 'regno->off', but not its dynamic part (if any).
+  */
+ static int check_stack_access_within_bounds(
+               struct bpf_verifier_env *env,
+               int regno, int off, int access_size,
+               enum stack_access_src src, enum bpf_access_type type)
+ {
+       struct bpf_reg_state *regs = cur_regs(env);
+       struct bpf_reg_state *reg = regs + regno;
+       struct bpf_func_state *state = func(env, reg);
+       int min_off, max_off;
+       int err;
+       char *err_extra;
+       if (src == ACCESS_HELPER)
+               /* We don't know if helpers are reading or writing (or both). */
+               err_extra = " indirect access to";
+       else if (type == BPF_READ)
+               err_extra = " read from";
+       else
+               err_extra = " write to";
+       if (tnum_is_const(reg->var_off)) {
+               min_off = reg->var_off.value + off;
+               if (access_size > 0)
+                       max_off = min_off + access_size - 1;
+               else
+                       max_off = min_off;
+       } else {
+               if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+                   reg->smin_value <= -BPF_MAX_VAR_OFF) {
+                       verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
+                               err_extra, regno);
+                       return -EACCES;
+               }
+               min_off = reg->smin_value + off;
+               if (access_size > 0)
+                       max_off = reg->smax_value + off + access_size - 1;
+               else
+                       max_off = min_off;
+       }
+       err = check_stack_slot_within_bounds(min_off, state, type);
+       if (!err)
+               err = check_stack_slot_within_bounds(max_off, state, type);
+       if (err) {
+               if (tnum_is_const(reg->var_off)) {
+                       verbose(env, "invalid%s stack R%d off=%d size=%d\n",
+                               err_extra, regno, off, access_size);
+               } else {
+                       char tn_buf[48];
+                       tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+                       verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
+                               err_extra, regno, tn_buf, access_size);
+               }
+       }
+       return err;
+ }
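
  A worked example for the variable-offset branch above (numbers invented for
  illustration): for an 8-byte access at off = 0 through a stack pointer whose
  offset is known to lie in [-64, -16], the code computes

          min_off = reg->smin_value + off;                    /* -64 + 0         = -64 */
          max_off = reg->smax_value + off + access_size - 1;  /* -16 + 0 + 8 - 1 =  -9 */

  and both extremes must then pass check_stack_slot_within_bounds() for the
  access to be allowed.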
  
  /* check whether memory at (regno + off) is accessible for t = (read | write)
   * if t==write, value_regno is a register which value is stored into memory
@@@ -3515,8 -3902,8 +3902,8 @@@ static int check_mem_access(struct bpf_
                }
  
        } else if (reg->type == PTR_TO_STACK) {
-               off += reg->var_off.value;
-               err = check_stack_access(env, reg, off, size);
+               /* Basic bounds checks. */
+               err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
                if (err)
                        return err;
  
                if (err)
                        return err;
  
-               if (t == BPF_WRITE)
-                       err = check_stack_write(env, state, off, size,
-                                               value_regno, insn_idx);
-               else
-                       err = check_stack_read(env, state, off, size,
+               if (t == BPF_READ)
+                       err = check_stack_read(env, regno, off, size,
                                               value_regno);
+               else
+                       err = check_stack_write(env, regno, off, size,
+                                               value_regno, insn_idx);
        } else if (reg_is_pkt_pointer(reg)) {
                if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
                        verbose(env, "cannot write into packet\n");
@@@ -3665,9 -4052,26 +4052,26 @@@ static int check_atomic(struct bpf_veri
                return -EACCES;
        }
  
+       if (insn->imm & BPF_FETCH) {
+               if (insn->imm == BPF_CMPXCHG)
+                       load_reg = BPF_REG_0;
+               else
+                       load_reg = insn->src_reg;
+               /* check and record load of old value */
+               err = check_reg_arg(env, load_reg, DST_OP);
+               if (err)
+                       return err;
+       } else {
+               /* This instruction accesses a memory location but doesn't
+                * actually load it into a register.
+                */
+               load_reg = -1;
+       }
        /* check whether we can read the memory */
        err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
-                              BPF_SIZE(insn->code), BPF_READ, -1, true);
+                              BPF_SIZE(insn->code), BPF_READ, load_reg, true);
        if (err)
                return err;
  
        if (err)
                return err;
  
-       if (!(insn->imm & BPF_FETCH))
-               return 0;
-       if (insn->imm == BPF_CMPXCHG)
-               load_reg = BPF_REG_0;
-       else
-               load_reg = insn->src_reg;
-       /* check and record load of old value */
-       err = check_reg_arg(env, load_reg, DST_OP);
-       if (err)
-               return err;
        return 0;
  }
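
  To make the load_reg selection above concrete, a hedged sketch using the
  BPF_ATOMIC_OP() insn macro (illustrative instructions, not taken from this
  diff):

          /* fetch-and-add: the old value is loaded into src_reg (R1 here) */
          BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8),
          /* compare-and-exchange: the old value always lands in R0 */
          BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -8),
          /* plain atomic add without BPF_FETCH defines no register (load_reg = -1) */
          BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_1, -8),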
  
- static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
-                                 int off, int access_size,
-                                 bool zero_size_allowed)
+ /* When register 'regno' is used to read the stack (either directly or through
+  * a helper function) make sure that it's within stack boundary and, depending
+  * on the access type, that all elements of the stack are initialized.
+  *
+  * 'off' includes 'regno->off', but not its dynamic part (if any).
+  *
+  * All registers that have been spilled on the stack in the slots within the
+  * read offsets are marked as read.
+  */
+ static int check_stack_range_initialized(
+               struct bpf_verifier_env *env, int regno, int off,
+               int access_size, bool zero_size_allowed,
+               enum stack_access_src type, struct bpf_call_arg_meta *meta)
  {
        struct bpf_reg_state *reg = reg_state(env, regno);
+       struct bpf_func_state *state = func(env, reg);
+       int err, min_off, max_off, i, j, slot, spi;
+       char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
+       enum bpf_access_type bounds_check_type;
+       /* Some accesses can write anything into the stack, others are
+        * read-only.
+        */
+       bool clobber = false;
  
-       if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
-           access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
-               if (tnum_is_const(reg->var_off)) {
-                       verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
-                               regno, off, access_size);
-               } else {
-                       char tn_buf[48];
-                       tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
-                               regno, tn_buf, access_size);
-               }
+       if (access_size == 0 && !zero_size_allowed) {
+               verbose(env, "invalid zero-sized read\n");
                return -EACCES;
        }
-       return 0;
- }
  
- /* when register 'regno' is passed into function that will read 'access_size'
-  * bytes from that pointer, make sure that it's within stack boundary
-  * and all elements of stack are initialized.
-  * Unlike most pointer bounds-checking functions, this one doesn't take an
-  * 'off' argument, so it has to add in reg->off itself.
-  */
- static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
-                               int access_size, bool zero_size_allowed,
-                               struct bpf_call_arg_meta *meta)
- {
-       struct bpf_reg_state *reg = reg_state(env, regno);
-       struct bpf_func_state *state = func(env, reg);
-       int err, min_off, max_off, i, j, slot, spi;
+       if (type == ACCESS_HELPER) {
+               /* The bounds checks for writes are more permissive than for
+                * reads. However, if raw_mode is not set, we'll do extra
+                * checks below.
+                */
+               bounds_check_type = BPF_WRITE;
+               clobber = true;
+       } else {
+               bounds_check_type = BPF_READ;
+       }
+       err = check_stack_access_within_bounds(env, regno, off, access_size,
+                                              type, bounds_check_type);
+       if (err)
+               return err;
  
        if (tnum_is_const(reg->var_off)) {
-               min_off = max_off = reg->var_off.value + reg->off;
-               err = __check_stack_boundary(env, regno, min_off, access_size,
-                                            zero_size_allowed);
-               if (err)
-                       return err;
+               min_off = max_off = reg->var_off.value + off;
        } else {
                /* Variable offset is prohibited for unprivileged mode for
                 * simplicity since it requires corresponding support in
                        char tn_buf[48];
  
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
-                               regno, tn_buf);
+                       verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
+                               regno, err_extra, tn_buf);
                        return -EACCES;
                }
                /* Only initialized buffer on stack is allowed to be accessed
                if (meta && meta->raw_mode)
                        meta = NULL;
  
-               if (reg->smax_value >= BPF_MAX_VAR_OFF ||
-                   reg->smax_value <= -BPF_MAX_VAR_OFF) {
-                       verbose(env, "R%d unbounded indirect variable offset stack access\n",
-                               regno);
-                       return -EACCES;
-               }
-               min_off = reg->smin_value + reg->off;
-               max_off = reg->smax_value + reg->off;
-               err = __check_stack_boundary(env, regno, min_off, access_size,
-                                            zero_size_allowed);
-               if (err) {
-                       verbose(env, "R%d min value is outside of stack bound\n",
-                               regno);
-                       return err;
-               }
-               err = __check_stack_boundary(env, regno, max_off, access_size,
-                                            zero_size_allowed);
-               if (err) {
-                       verbose(env, "R%d max value is outside of stack bound\n",
-                               regno);
-                       return err;
-               }
+               min_off = reg->smin_value + off;
+               max_off = reg->smax_value + off;
        }
  
        if (meta && meta->raw_mode) {
                if (*stype == STACK_MISC)
                        goto mark;
                if (*stype == STACK_ZERO) {
-                       /* helper can write anything into the stack */
-                       *stype = STACK_MISC;
+                       if (clobber) {
+                               /* helper can write anything into the stack */
+                               *stype = STACK_MISC;
+                       }
                        goto mark;
                }
  
                if (state->stack[spi].slot_type[0] == STACK_SPILL &&
                    (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
                     env->allow_ptr_leaks)) {
-                       __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
-                       for (j = 0; j < BPF_REG_SIZE; j++)
-                               state->stack[spi].slot_type[j] = STACK_MISC;
+                       if (clobber) {
+                               __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
+                               for (j = 0; j < BPF_REG_SIZE; j++)
+                                       state->stack[spi].slot_type[j] = STACK_MISC;
+                       }
                        goto mark;
                }
  
  err:
                if (tnum_is_const(reg->var_off)) {
-                       verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
-                               min_off, i - min_off, access_size);
+                       verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
+                               err_extra, regno, min_off, i - min_off, access_size);
                } else {
                        char tn_buf[48];
  
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
-                               tn_buf, i - min_off, access_size);
+                       verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
+                               err_extra, regno, tn_buf, i - min_off, access_size);
                }
                return -EACCES;
  mark:
@@@ -3876,8 -4255,10 +4255,10 @@@ static int check_helper_mem_access(stru
                                           "rdwr",
                                           &env->prog->aux->max_rdwr_access);
        case PTR_TO_STACK:
-               return check_stack_boundary(env, regno, access_size,
-                                           zero_size_allowed, meta);
+               return check_stack_range_initialized(
+                               env,
+                               regno, reg->off, access_size,
+                               zero_size_allowed, ACCESS_HELPER, meta);
        default: /* scalar_value or invalid ptr */
                /* Allow zero-byte read from NULL, regardless of pointer type */
                if (zero_size_allowed && access_size == 0 &&
        }
  }
  
+ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+                  u32 regno, u32 mem_size)
+ {
+       if (register_is_null(reg))
+               return 0;
+       if (reg_type_may_be_null(reg->type)) {
+               /* Assuming that the register contains a value, check if the memory
+                * access is safe. Temporarily save and restore the register's state as
+                * the conversion shouldn't be visible to a caller.
+                */
+               const struct bpf_reg_state saved_reg = *reg;
+               int rv;
+               mark_ptr_not_null_reg(reg);
+               rv = check_helper_mem_access(env, regno, mem_size, true, NULL);
+               *reg = saved_reg;
+               return rv;
+       }
+       return check_helper_mem_access(env, regno, mem_size, true, NULL);
+ }
+
  /* Implementation details:
   * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
   * Two bpf_map_lookups (even with the same key) will have different reg->id.
@@@ -4875,8 -5279,9 +5279,9 @@@ static int check_func_call(struct bpf_v
                                        subprog);
                        clear_caller_saved_regs(env, caller->regs);
  
-                       /* All global functions return SCALAR_VALUE */
+                       /* All global functions return a 64-bit SCALAR_VALUE */
                        mark_reg_unknown(env, caller->regs, BPF_REG_0);
+                       caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
  
                        /* continue with next insn after call */
                        return 0;
@@@ -5541,6 -5946,41 +5946,41 @@@ do_sim
        return !ret ? -EFAULT : 0;
  }
  
+ /* check that stack access falls within stack limits and that 'reg' doesn't
+  * have a variable offset.
+  *
+  * Variable offset is prohibited for unprivileged mode for simplicity since it
+  * requires corresponding support in Spectre masking for stack ALU.  See also
+  * retrieve_ptr_limit().
+  *
+  * 'off' includes 'reg->off'.
+  */
+ static int check_stack_access_for_ptr_arithmetic(
+                               struct bpf_verifier_env *env,
+                               int regno,
+                               const struct bpf_reg_state *reg,
+                               int off)
+ {
+       if (!tnum_is_const(reg->var_off)) {
+               char tn_buf[48];
+               tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+               verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
+                       regno, tn_buf, off);
+               return -EACCES;
+       }
+       if (off >= 0 || off < -MAX_BPF_STACK) {
+               verbose(env, "R%d stack pointer arithmetic goes out of range, "
+                       "prohibited for !root; off=%d\n", regno, off);
+               return -EACCES;
+       }
+       return 0;
+ }
+
  /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
   * Caller should also handle BPF_MOV case separately.
   * If we return -EACCES, caller may want to try again treating pointer as a
@@@ -5784,10 -6224,9 +6224,9 @@@ static int adjust_ptr_min_max_vals(stru
                                "prohibited for !root\n", dst);
                        return -EACCES;
                } else if (dst_reg->type == PTR_TO_STACK &&
-                          check_stack_access(env, dst_reg, dst_reg->off +
-                                             dst_reg->var_off.value, 1)) {
-                       verbose(env, "R%d stack pointer arithmetic goes out of range, "
-                               "prohibited for !root\n", dst);
+                          check_stack_access_for_ptr_arithmetic(
+                                  env, dst, dst_reg, dst_reg->off +
+                                  dst_reg->var_off.value)) {
                        return -EACCES;
                }
        }
@@@ -6266,7 -6705,7 +6705,7 @@@ static void scalar32_min_max_rsh(struc
         * 3) the signed bounds cross zero, so they tell us nothing
         *    about the result
         * If the value in dst_reg is known nonnegative, then again the
-        * unsigned bounts capture the signed bounds.
+        * unsigned bounds capture the signed bounds.
         * Thus, in all cases it suffices to blow away our signed bounds
         * and rely on inferring new ones from the unsigned bounds and
         * var_off of the result.
@@@ -6297,7 -6736,7 +6736,7 @@@ static void scalar_min_max_rsh(struct b
         * 3) the signed bounds cross zero, so they tell us nothing
         *    about the result
         * If the value in dst_reg is known nonnegative, then again the
-        * unsigned bounts capture the signed bounds.
+        * unsigned bounds capture the signed bounds.
         * Thus, in all cases it suffices to blow away our signed bounds
         * and rely on inferring new ones from the unsigned bounds and
         * var_off of the result.
@@@ -6918,7 -7357,7 +7357,7 @@@ static int is_branch32_taken(struct bpf
        case BPF_JSGT:
                if (reg->s32_min_value > sval)
                        return 1;
 -              else if (reg->s32_max_value < sval)
 +              else if (reg->s32_max_value <= sval)
                        return 0;
                break;
        case BPF_JLT:
@@@ -6991,7 -7430,7 +7430,7 @@@ static int is_branch64_taken(struct bpf
        case BPF_JSGT:
                if (reg->smin_value > sval)
                        return 1;
 -              else if (reg->smax_value < sval)
 +              else if (reg->smax_value <= sval)
                        return 0;
                break;
        case BPF_JLT:
@@@ -7367,43 -7806,19 +7806,19 @@@ static void mark_ptr_or_null_reg(struc
                }
                if (is_null) {
                        reg->type = SCALAR_VALUE;
-               } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-                       const struct bpf_map *map = reg->map_ptr;
-                       if (map->inner_map_meta) {
-                               reg->type = CONST_PTR_TO_MAP;
-                               reg->map_ptr = map->inner_map_meta;
-                       } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
-                               reg->type = PTR_TO_XDP_SOCK;
-                       } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
-                                  map->map_type == BPF_MAP_TYPE_SOCKHASH) {
-                               reg->type = PTR_TO_SOCKET;
-                       } else {
-                               reg->type = PTR_TO_MAP_VALUE;
-                       }
-               } else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
-                       reg->type = PTR_TO_SOCKET;
-               } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
-                       reg->type = PTR_TO_SOCK_COMMON;
-               } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
-                       reg->type = PTR_TO_TCP_SOCK;
-               } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
-                       reg->type = PTR_TO_BTF_ID;
-               } else if (reg->type == PTR_TO_MEM_OR_NULL) {
-                       reg->type = PTR_TO_MEM;
-               } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) {
-                       reg->type = PTR_TO_RDONLY_BUF;
-               } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) {
-                       reg->type = PTR_TO_RDWR_BUF;
-               }
-               if (is_null) {
                        /* We don't need id and ref_obj_id from this point
                         * onwards anymore, thus we should better reset it,
                         * so that state pruning has chances to take effect.
                         */
                        reg->id = 0;
                        reg->ref_obj_id = 0;
-               } else if (!reg_may_point_to_spin_lock(reg)) {
+                       return;
+               }
+               mark_ptr_not_null_reg(reg);
+               if (!reg_may_point_to_spin_lock(reg)) {
                        /* For not-NULL ptr, reg->ref_obj_id will be reset
                         * in release_reg_references().
                         *
@@@ -7986,6 -8401,9 +8401,9 @@@ static int check_return_code(struct bpf
                    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
                    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
                        range = tnum_range(1, 1);
+               if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
+                       range = tnum_range(0, 3);
                break;
        case BPF_PROG_TYPE_CGROUP_SKB:
                if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
@@@ -8631,11 -9049,7 +9049,11 @@@ static bool range_within(struct bpf_reg
        return old->umin_value <= cur->umin_value &&
               old->umax_value >= cur->umax_value &&
               old->smin_value <= cur->smin_value &&
 -             old->smax_value >= cur->smax_value;
 +             old->smax_value >= cur->smax_value &&
 +             old->u32_min_value <= cur->u32_min_value &&
 +             old->u32_max_value >= cur->u32_max_value &&
 +             old->s32_min_value <= cur->s32_min_value &&
 +             old->s32_max_value >= cur->s32_max_value;
  }
  
  /* Maximum number of register states that can exist at once */
@@@ -10015,15 -10429,22 +10433,22 @@@ static int check_map_prog_compatibility
                case BPF_MAP_TYPE_HASH:
                case BPF_MAP_TYPE_LRU_HASH:
                case BPF_MAP_TYPE_ARRAY:
+               case BPF_MAP_TYPE_PERCPU_HASH:
+               case BPF_MAP_TYPE_PERCPU_ARRAY:
+               case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+               case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+               case BPF_MAP_TYPE_HASH_OF_MAPS:
                        if (!is_preallocated_map(map)) {
                                verbose(env,
-                                       "Sleepable programs can only use preallocated hash maps\n");
+                                       "Sleepable programs can only use preallocated maps\n");
                                return -EINVAL;
                        }
                        break;
+               case BPF_MAP_TYPE_RINGBUF:
+                       break;
                default:
                        verbose(env,
-                               "Sleepable programs can only use array and hash maps\n");
+                               "Sleepable programs can only use array, hash, and ringbuf maps\n");
                        return -EINVAL;
                }
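
  A minimal sketch of what the relaxed map-type check permits (illustrative
  program; the section and attach point are examples, not taken from this
  diff): a sleepable fentry program pushing records into a BPF ring buffer.

          #include <vmlinux.h>
          #include <bpf/bpf_helpers.h>
          #include <bpf/bpf_tracing.h>

          struct {
                  __uint(type, BPF_MAP_TYPE_RINGBUF);
                  __uint(max_entries, 4096);
          } rb SEC(".maps");

          SEC("fentry.s/do_unlinkat")             /* ".s" marks the program sleepable */
          int BPF_PROG(trace_unlink, int dfd, struct filename *name)
          {
                  int *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);

                  if (!e)
                          return 0;
                  *e = dfd;
                  bpf_ringbuf_submit(e, 0);
                  return 0;
          }

          char LICENSE[] SEC("license") = "GPL";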
  
@@@ -10581,6 -11002,7 +11006,7 @@@ static int opt_subreg_zext_lo32_rnd_hi3
        for (i = 0; i < len; i++) {
                int adj_idx = i + delta;
                struct bpf_insn insn;
+               u8 load_reg;
  
                insn = insns[adj_idx];
                if (!aux[adj_idx].zext_dst) {
                if (!bpf_jit_needs_zext())
                        continue;
  
+               /* zext_dst means that we want to zero-extend whatever register
+                * the insn defines, which is dst_reg most of the time, with
+                * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH.
+                */
+               if (BPF_CLASS(insn.code) == BPF_STX &&
+                   BPF_MODE(insn.code) == BPF_ATOMIC) {
+                       /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not
+                        * define any registers, therefore zext_dst cannot be
+                        * set.
+                        */
+                       if (WARN_ON(!(insn.imm & BPF_FETCH)))
+                               return -EINVAL;
+                       load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0
+                                                          : insn.src_reg;
+               } else {
+                       load_reg = insn.dst_reg;
+               }
                zext_patch[0] = insn;
-               zext_patch[1].dst_reg = insn.dst_reg;
-               zext_patch[1].src_reg = insn.dst_reg;
+               zext_patch[1].dst_reg = load_reg;
+               zext_patch[1].src_reg = load_reg;
                patch = zext_patch;
                patch_len = 2;
  apply_patch_buffer:
@@@ -10841,8 -11281,7 +11285,7 @@@ static int jit_subprogs(struct bpf_veri
                return 0;
  
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-               if (insn->code != (BPF_JMP | BPF_CALL) ||
-                   insn->src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn))
                        continue;
                /* Upon error here we cannot fall back to interpreter but
                 * need a hard reject of the program. Thus -EFAULT is
                /* BPF_PROG_RUN doesn't call subprogs directly,
                 * hence main prog stats include the runtime of subprogs.
                 * subprogs don't have IDs and not reachable via prog_get_next_id
-                * func[i]->aux->stats will never be accessed and stays NULL
+                * func[i]->stats will never be accessed and stays NULL
                 */
                func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
                if (!func[i])
        for (i = 0; i < env->subprog_cnt; i++) {
                insn = func[i]->insnsi;
                for (j = 0; j < func[i]->len; j++, insn++) {
-                       if (insn->code != (BPF_JMP | BPF_CALL) ||
-                           insn->src_reg != BPF_PSEUDO_CALL)
+                       if (!bpf_pseudo_call(insn))
                                continue;
                        subprog = insn->off;
                        insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
         * later look the same as if they were interpreted only.
         */
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-               if (insn->code != (BPF_JMP | BPF_CALL) ||
-                   insn->src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = env->insn_aux_data[i].call_imm;
                subprog = find_subprog(env, i + insn->off + 1);
@@@ -11047,8 -11484,7 +11488,7 @@@ out_undo_insn
        /* cleanup main prog to be interpreted */
        prog->jit_requested = 0;
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-               if (insn->code != (BPF_JMP | BPF_CALL) ||
-                   insn->src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = 0;
                insn->imm = env->insn_aux_data[i].call_imm;
@@@ -11083,8 -11519,7 +11523,7 @@@ static int fixup_call_args(struct bpf_v
                return -EINVAL;
        }
        for (i = 0; i < prog->len; i++, insn++) {
-               if (insn->code != (BPF_JMP | BPF_CALL) ||
-                   insn->src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn))
                        continue;
                depth = get_callee_stack_depth(env, insn, i);
                if (depth < 0)
@@@ -11121,28 -11556,30 +11560,28 @@@ static int fixup_bpf_calls(struct bpf_v
                    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
                    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
                        bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
 -                      struct bpf_insn mask_and_div[] = {
 -                              BPF_MOV32_REG(insn->src_reg, insn->src_reg),
 +                      bool isdiv = BPF_OP(insn->code) == BPF_DIV;
 +                      struct bpf_insn *patchlet;
 +                      struct bpf_insn chk_and_div[] = {
                                /* Rx div 0 -> 0 */
 -                              BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
 +                              BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
 +                                           BPF_JNE | BPF_K, insn->src_reg,
 +                                           0, 2, 0),
                                BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
                                BPF_JMP_IMM(BPF_JA, 0, 0, 1),
                                *insn,
                        };
 -                      struct bpf_insn mask_and_mod[] = {
 -                              BPF_MOV32_REG(insn->src_reg, insn->src_reg),
 +                      struct bpf_insn chk_and_mod[] = {
                                /* Rx mod 0 -> Rx */
 -                              BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
 +                              BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
 +                                           BPF_JEQ | BPF_K, insn->src_reg,
 +                                           0, 1, 0),
                                *insn,
                        };
 -                      struct bpf_insn *patchlet;
  
 -                      if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
 -                          insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
 -                              patchlet = mask_and_div + (is64 ? 1 : 0);
 -                              cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
 -                      } else {
 -                              patchlet = mask_and_mod + (is64 ? 1 : 0);
 -                              cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
 -                      }
 +                      patchlet = isdiv ? chk_and_div : chk_and_mod;
 +                      cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
 +                                    ARRAY_SIZE(chk_and_mod);
  
                        new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
                        if (!new_prog)
@@@ -11547,6 -11984,13 +11986,13 @@@ static int do_check_common(struct bpf_v
                                mark_reg_known_zero(env, regs, i);
                        else if (regs[i].type == SCALAR_VALUE)
                                mark_reg_unknown(env, regs, i);
+                       else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
+                               const u32 mem_size = regs[i].mem_size;
+                               mark_reg_known_zero(env, regs, i);
+                               regs[i].mem_size = mem_size;
+                               regs[i].id = ++env->id_gen;
+                       }
                }
        } else {
                /* 1st arg to a function */
@@@ -12125,6 -12569,7 +12571,7 @@@ int bpf_check(struct bpf_prog **prog, u
                env->strict_alignment = false;
  
        env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+       env->allow_uninit_stack = bpf_allow_uninit_stack();
        env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
        env->bypass_spec_v1 = bpf_bypass_spec_v1();
        env->bypass_spec_v4 = bpf_bypass_spec_v4();
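
  The PTR_TO_MEM_OR_NULL handling added to do_check_common() above is what
  backs pointer arguments to global sub-programs: the argument is modelled as
  either NULL or a pointer to mem_size bytes, so the function must null-check
  it before dereferencing. A hedged BPF C sketch (type and function names
  invented for illustration):

          struct dims {
                  int rows;
                  int cols;
          };

          __attribute__((noinline))
          int area(struct dims *d)
          {
                  if (!d)                 /* required: the argument may be NULL */
                          return 0;
                  return d->rows * d->cols;
          }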
diff --combined kernel/trace/bpf_trace.c
@@@ -96,6 -96,9 +96,6 @@@ unsigned int trace_call_bpf(struct trac
  {
        unsigned int ret;
  
 -      if (in_nmi()) /* not supported yet */
 -              return 1;
 -
        cant_sleep();
  
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
@@@ -1188,6 -1191,10 +1188,10 @@@ BTF_SET_END(btf_allowlist_d_path
  
  static bool bpf_d_path_allowed(const struct bpf_prog *prog)
  {
+       if (prog->type == BPF_PROG_TYPE_TRACING &&
+           prog->expected_attach_type == BPF_TRACE_ITER)
+               return true;
        if (prog->type == BPF_PROG_TYPE_LSM)
                return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
  
@@@ -1757,6 -1764,8 +1761,8 @@@ tracing_prog_func_proto(enum bpf_func_i
                return &bpf_sk_storage_delete_tracing_proto;
        case BPF_FUNC_sock_from_file:
                return &bpf_sock_from_file_proto;
+       case BPF_FUNC_get_socket_cookie:
+               return &bpf_get_socket_ptr_cookie_proto;
  #endif
        case BPF_FUNC_seq_printf:
                return prog->expected_attach_type == BPF_TRACE_ITER ?
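
  With bpf_get_socket_ptr_cookie_proto wired up for tracing programs above, a
  fentry/fexit program can read a socket cookie straight from a struct sock
  pointer. A minimal sketch (hook and message are illustrative, not from this
  diff):

          SEC("fexit/inet_shutdown")
          int BPF_PROG(trace_shutdown, struct socket *sock, int how)
          {
                  __u64 cookie = bpf_get_socket_cookie(sock->sk);

                  bpf_printk("shutdown on socket cookie %llu", cookie);
                  return 0;
          }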
diff --combined net/core/dev.c
@@@ -91,7 -91,6 +91,7 @@@
  #include <linux/etherdevice.h>
  #include <linux/ethtool.h>
  #include <linux/skbuff.h>
 +#include <linux/kthread.h>
  #include <linux/bpf.h>
  #include <linux/bpf_trace.h>
  #include <net/net_namespace.h>
  #include <net/dsa.h>
  #include <net/dst.h>
  #include <net/dst_metadata.h>
 +#include <net/gro.h>
  #include <net/pkt_sched.h>
  #include <net/pkt_cls.h>
  #include <net/checksum.h>
@@@ -1495,27 -1493,6 +1495,27 @@@ void netdev_notify_peers(struct net_dev
  }
  EXPORT_SYMBOL(netdev_notify_peers);
  
 +static int napi_threaded_poll(void *data);
 +
 +static int napi_kthread_create(struct napi_struct *n)
 +{
 +      int err = 0;
 +
 +      /* Create and wake up the kthread once to put it in
 +       * TASK_INTERRUPTIBLE mode to avoid the blocked task
 +       * warning and work with loadavg.
 +       */
 +      n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
 +                              n->dev->name, n->napi_id);
 +      if (IS_ERR(n->thread)) {
 +              err = PTR_ERR(n->thread);
 +              pr_err("kthread_run failed with err %d\n", err);
 +              n->thread = NULL;
 +      }
 +
 +      return err;
 +}
 +
  static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
  {
        const struct net_device_ops *ops = dev->netdev_ops;
@@@ -2217,28 -2194,14 +2217,14 @@@ static inline void net_timestamp_set(st
  
  bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
  {
-       unsigned int len;
-       if (!(dev->flags & IFF_UP))
-               return false;
-       len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
-       if (skb->len <= len)
-               return true;
-       /* if TSO is enabled, we don't care about the length as the packet
-        * could be forwarded without being segmented before
-        */
-       if (skb_is_gso(skb))
-               return true;
-       return false;
+       return __is_skb_forwardable(dev, skb, true);
  }
  EXPORT_SYMBOL_GPL(is_skb_forwardable);
  
- int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+ static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
+                             bool check_mtu)
  {
-       int ret = ____dev_forward_skb(dev, skb);
+       int ret = ____dev_forward_skb(dev, skb, check_mtu);
  
        if (likely(!ret)) {
                skb->protocol = eth_type_trans(skb, dev);
  
        return ret;
  }
+ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+ {
+       return __dev_forward_skb2(dev, skb, true);
+ }
  EXPORT_SYMBOL_GPL(__dev_forward_skb);
  
  /**
@@@ -2273,6 -2241,11 +2264,11 @@@ int dev_forward_skb(struct net_device *
  }
  EXPORT_SYMBOL_GPL(dev_forward_skb);
  
+ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
+ {
+       return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
+ }
  static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
@@@ -3644,18 -3617,7 +3640,18 @@@ int skb_csum_hwoffload_help(struct sk_b
                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
                        skb_crc32c_csum_help(skb);
  
 -      return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 +      if (features & NETIF_F_HW_CSUM)
 +              return 0;
 +
 +      if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
 +              switch (skb->csum_offset) {
 +              case offsetof(struct tcphdr, check):
 +              case offsetof(struct udphdr, check):
 +                      return 0;
 +              }
 +      }
 +
 +      return skb_checksum_help(skb);
  }
  EXPORT_SYMBOL(skb_csum_hwoffload_help);
  
@@@ -3912,7 -3874,6 +3908,7 @@@ sch_handle_egress(struct sk_buff *skb, 
  
        /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
        qdisc_skb_cb(skb)->mru = 0;
 +      qdisc_skb_cb(skb)->post_ct = false;
        mini_qdisc_bstats_cpu_update(miniq, skb);
  
        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
@@@ -4118,7 -4079,7 +4114,7 @@@ static int __dev_queue_xmit(struct sk_b
        skb_reset_mac_header(skb);
  
        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 -              __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
 +              __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
  
        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
@@@ -4287,22 -4248,6 +4283,22 @@@ int gro_normal_batch __read_mostly = 8
  static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
  {
 +      struct task_struct *thread;
 +
 +      if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 +              /* Paired with smp_mb__before_atomic() in
 +               * napi_enable()/dev_set_threaded().
 +               * Use READ_ONCE() to guarantee a complete
 +               * read on napi->thread. Only call
 +               * wake_up_process() when it's not NULL.
 +               */
 +              thread = READ_ONCE(napi->thread);
 +              if (thread) {
 +                      wake_up_process(thread);
 +                      return;
 +              }
 +      }
 +
        list_add_tail(&napi->poll_list, &sd->poll_list);
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
  }
@@@ -4944,6 -4889,8 +4940,6 @@@ static __latent_entropy void net_tx_act
                        else
                                __kfree_skb_defer(skb);
                }
 -
 -              __kfree_skb_flush();
        }
  
        if (sd->output_queue) {
@@@ -5009,7 -4956,6 +5005,7 @@@ sch_handle_ingress(struct sk_buff *skb
  
        qdisc_skb_cb(skb)->pkt_len = skb->len;
        qdisc_skb_cb(skb)->mru = 0;
 +      qdisc_skb_cb(skb)->post_ct = false;
        skb->tc_at_ingress = 1;
        mini_qdisc_bstats_cpu_update(miniq, skb);
  
@@@ -5759,7 -5705,7 +5755,7 @@@ static void flush_all_backlogs(void
        }
  
        /* we can have in flight packet[s] on the cpus we are not flushing,
 -       * synchronize_net() in rollback_registered_many() will take care of
 +       * synchronize_net() in unregister_netdevice_many() will take care of
         * them
         */
        for_each_cpu(cpu, &flush_cpus)
@@@ -5781,14 -5727,15 +5777,14 @@@ static void gro_normal_list(struct napi
  /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
   * pass the whole batch up to the stack.
   */
 -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
  {
        list_add_tail(&skb->list, &napi->rx_list);
 -      if (++napi->rx_count >= gro_normal_batch)
 +      napi->rx_count += segs;
 +      if (napi->rx_count >= gro_normal_batch)
                gro_normal_list(napi);
  }
  
 -INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 -INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
  static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
  {
        struct packet_offload *ptype;
        }
  
  out:
 -      gro_normal_one(napi, skb);
 +      gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
        return NET_RX_SUCCESS;
  }
  
@@@ -5957,6 -5904,10 +5953,6 @@@ static void gro_flush_oldest(struct nap
        napi_gro_complete(napi, oldest);
  }
  
 -INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 -                                                         struct sk_buff *));
 -INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 -                                                         struct sk_buff *));
  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
  {
        u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
@@@ -6095,20 -6046,27 +6091,20 @@@ struct packet_offload *gro_find_complet
  }
  EXPORT_SYMBOL(gro_find_complete_by_type);
  
 -static void napi_skb_free_stolen_head(struct sk_buff *skb)
 -{
 -      skb_dst_drop(skb);
 -      skb_ext_put(skb);
 -      kmem_cache_free(skbuff_head_cache, skb);
 -}
 -
  static gro_result_t napi_skb_finish(struct napi_struct *napi,
                                    struct sk_buff *skb,
                                    gro_result_t ret)
  {
        switch (ret) {
        case GRO_NORMAL:
 -              gro_normal_one(napi, skb);
 +              gro_normal_one(napi, skb, 1);
                break;
  
        case GRO_MERGED_FREE:
                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
                        napi_skb_free_stolen_head(skb);
                else
 -                      __kfree_skb(skb);
 +                      __kfree_skb_defer(skb);
                break;
  
        case GRO_HELD:
@@@ -6185,7 -6143,7 +6181,7 @@@ static gro_result_t napi_frags_finish(s
                __skb_push(skb, ETH_HLEN);
                skb->protocol = eth_type_trans(skb, skb->dev);
                if (ret == GRO_NORMAL)
 -                      gro_normal_one(napi, skb);
 +                      gro_normal_one(napi, skb, 1);
                break;
  
        case GRO_MERGED_FREE:
@@@ -6731,49 -6689,6 +6727,49 @@@ static void init_gro_hash(struct napi_s
        napi->gro_bitmask = 0;
  }
  
 +int dev_set_threaded(struct net_device *dev, bool threaded)
 +{
 +      struct napi_struct *napi;
 +      int err = 0;
 +
 +      if (dev->threaded == threaded)
 +              return 0;
 +
 +      if (threaded) {
 +              list_for_each_entry(napi, &dev->napi_list, dev_list) {
 +                      if (!napi->thread) {
 +                              err = napi_kthread_create(napi);
 +                              if (err) {
 +                                      threaded = false;
 +                                      break;
 +                              }
 +                      }
 +              }
 +      }
 +
 +      dev->threaded = threaded;
 +
 +      /* Make sure kthread is created before THREADED bit
 +       * is set.
 +       */
 +      smp_mb__before_atomic();
 +
 +      /* Setting/unsetting threaded mode on a napi might not immediately
 +       * take effect, if the current napi instance is actively being
 +       * polled. In this case, the switch between threaded mode and
 +       * softirq mode will happen in the next round of napi_schedule().
 +       * This should not cause hiccups/stalls to the live traffic.
 +       */
 +      list_for_each_entry(napi, &dev->napi_list, dev_list) {
 +              if (threaded)
 +                      set_bit(NAPI_STATE_THREADED, &napi->state);
 +              else
 +                      clear_bit(NAPI_STATE_THREADED, &napi->state);
 +      }
 +
 +      return err;
 +}
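
  For driver authors, dev_set_threaded() above is the opt-in point for
  kthread-based polling. A hedged sketch of how a driver might use it (the
  driver name, private struct and poll stub are invented for illustration):

          struct mydrv_priv {
                  struct napi_struct napi;
          };

          static int mydrv_poll(struct napi_struct *napi, int budget)
          {
                  /* ... RX processing elided ... */
                  napi_complete_done(napi, 0);
                  return 0;
          }

          static int mydrv_open(struct net_device *dev)
          {
                  struct mydrv_priv *priv = netdev_priv(dev);
                  int err;

                  netif_napi_add(dev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);

                  /* spawns one "napi/<dev>-<id>" kthread per NAPI instance */
                  err = dev_set_threaded(dev, true);
                  if (err)
                          netdev_warn(dev, "threaded NAPI unavailable, staying in softirq mode\n");

                  napi_enable(&priv->napi);
                  return 0;
          }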
 +
  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
  {
        set_bit(NAPI_STATE_NPSVC, &napi->state);
        list_add_rcu(&napi->dev_list, &dev->napi_list);
        napi_hash_add(napi);
 +      /* Create kthread for this napi if dev->threaded is set.
 +       * Clear dev->threaded if kthread creation failed so that
 +       * threaded mode will not be enabled in napi_enable().
 +       */
 +      if (dev->threaded && napi_kthread_create(napi))
 +              dev->threaded = 0;
  }
  EXPORT_SYMBOL(netif_napi_add);
  
@@@ -6824,28 -6733,9 +6820,28 @@@ void napi_disable(struct napi_struct *n
  
        clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
        clear_bit(NAPI_STATE_DISABLE, &n->state);
 +      clear_bit(NAPI_STATE_THREADED, &n->state);
  }
  EXPORT_SYMBOL(napi_disable);
  
 +/**
 + *    napi_enable - enable NAPI scheduling
 + *    @n: NAPI context
 + *
 + * Resume NAPI from being scheduled on this context.
 + * Must be paired with napi_disable.
 + */
 +void napi_enable(struct napi_struct *n)
 +{
 +      BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 +      smp_mb__before_atomic();
 +      clear_bit(NAPI_STATE_SCHED, &n->state);
 +      clear_bit(NAPI_STATE_NPSVC, &n->state);
 +      if (n->dev->threaded && n->thread)
 +              set_bit(NAPI_STATE_THREADED, &n->state);
 +}
 +EXPORT_SYMBOL(napi_enable);
 +
  static void flush_gro_hash(struct napi_struct *napi)
  {
        int i;
@@@ -6871,18 -6761,18 +6867,18 @@@ void __netif_napi_del(struct napi_struc
  
        flush_gro_hash(napi);
        napi->gro_bitmask = 0;
 +
 +      if (napi->thread) {
 +              kthread_stop(napi->thread);
 +              napi->thread = NULL;
 +      }
  }
  EXPORT_SYMBOL(__netif_napi_del);
  
 -static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 +static int __napi_poll(struct napi_struct *n, bool *repoll)
  {
 -      void *have;
        int work, weight;
  
 -      list_del_init(&n->poll_list);
 -
 -      have = netpoll_poll_lock(n);
 -
        weight = n->weight;
  
        /* This NAPI_STATE_SCHED test is for avoiding a race
                            n->poll, work, weight);
  
        if (likely(work < weight))
 -              goto out_unlock;
 +              return work;
  
        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         */
        if (unlikely(napi_disable_pending(n))) {
                napi_complete(n);
 -              goto out_unlock;
 +              return work;
        }
  
        /* The NAPI context has more processing work, but busy-polling
                         */
                        napi_schedule(n);
                }
 -              goto out_unlock;
 +              return work;
        }
  
        if (n->gro_bitmask) {
        if (unlikely(!list_empty(&n->poll_list))) {
                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                             n->dev ? n->dev->name : "backlog");
 -              goto out_unlock;
 +              return work;
        }
  
 -      list_add_tail(&n->poll_list, repoll);
 +      *repoll = true;
 +
 +      return work;
 +}
 +
 +static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 +{
 +      bool do_repoll = false;
 +      void *have;
 +      int work;
 +
 +      list_del_init(&n->poll_list);
 +
 +      have = netpoll_poll_lock(n);
 +
 +      work = __napi_poll(n, &do_repoll);
 +
 +      if (do_repoll)
 +              list_add_tail(&n->poll_list, repoll);
  
 -out_unlock:
        netpoll_poll_unlock(have);
  
        return work;
  }
  
 +static int napi_thread_wait(struct napi_struct *napi)
 +{
 +      set_current_state(TASK_INTERRUPTIBLE);
 +
 +      while (!kthread_should_stop() && !napi_disable_pending(napi)) {
 +              if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
 +                      WARN_ON(!list_empty(&napi->poll_list));
 +                      __set_current_state(TASK_RUNNING);
 +                      return 0;
 +              }
 +
 +              schedule();
 +              set_current_state(TASK_INTERRUPTIBLE);
 +      }
 +      __set_current_state(TASK_RUNNING);
 +      return -1;
 +}
 +
 +static int napi_threaded_poll(void *data)
 +{
 +      struct napi_struct *napi = data;
 +      void *have;
 +
 +      while (!napi_thread_wait(napi)) {
 +              for (;;) {
 +                      bool repoll = false;
 +
 +                      local_bh_disable();
 +
 +                      have = netpoll_poll_lock(napi);
 +                      __napi_poll(napi, &repoll);
 +                      netpoll_poll_unlock(have);
 +
 +                      local_bh_enable();
 +
 +                      if (!repoll)
 +                              break;
 +
 +                      cond_resched();
 +              }
 +      }
 +      return 0;
 +}
 +
  static __latent_entropy void net_rx_action(struct softirq_action *h)
  {
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
  
                if (list_empty(&list)) {
                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 -                              goto out;
 +                              return;
                        break;
                }
  
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
  
        net_rps_action_and_irq_enable(sd);
 -out:
 -      __kfree_skb_flush();
  }
  
  struct netdev_adjacent {
@@@ -8925,48 -8756,6 +8921,48 @@@ int dev_set_mac_address(struct net_devi
  }
  EXPORT_SYMBOL(dev_set_mac_address);
  
 +static DECLARE_RWSEM(dev_addr_sem);
 +
 +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 +                           struct netlink_ext_ack *extack)
 +{
 +      int ret;
 +
 +      down_write(&dev_addr_sem);
 +      ret = dev_set_mac_address(dev, sa, extack);
 +      up_write(&dev_addr_sem);
 +      return ret;
 +}
 +EXPORT_SYMBOL(dev_set_mac_address_user);
 +
 +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 +{
 +      size_t size = sizeof(sa->sa_data);
 +      struct net_device *dev;
 +      int ret = 0;
 +
 +      down_read(&dev_addr_sem);
 +      rcu_read_lock();
 +
 +      dev = dev_get_by_name_rcu(net, dev_name);
 +      if (!dev) {
 +              ret = -ENODEV;
 +              goto unlock;
 +      }
 +      if (!dev->addr_len)
 +              memset(sa->sa_data, 0, size);
 +      else
 +              memcpy(sa->sa_data, dev->dev_addr,
 +                     min_t(size_t, size, dev->addr_len));
 +      sa->sa_family = dev->type;
 +
 +unlock:
 +      rcu_read_unlock();
 +      up_read(&dev_addr_sem);
 +      return ret;
 +}
 +EXPORT_SYMBOL(dev_get_mac_address);
 +
  /**
   *    dev_change_carrier - Change device carrier
   *    @dev: device
@@@ -9666,6 -9455,106 +9662,6 @@@ static void net_set_todo(struct net_dev
        dev_net(dev)->dev_unreg_count++;
  }
  
 -static void rollback_registered_many(struct list_head *head)
 -{
 -      struct net_device *dev, *tmp;
 -      LIST_HEAD(close_head);
 -
 -      BUG_ON(dev_boot_phase);
 -      ASSERT_RTNL();
 -
 -      list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 -              /* Some devices call without registering
 -               * for initialization unwind. Remove those
 -               * devices and proceed with the remaining.
 -               */
 -              if (dev->reg_state == NETREG_UNINITIALIZED) {
 -                      pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 -                               dev->name, dev);
 -
 -                      WARN_ON(1);
 -                      list_del(&dev->unreg_list);
 -                      continue;
 -              }
 -              dev->dismantle = true;
 -              BUG_ON(dev->reg_state != NETREG_REGISTERED);
 -      }
 -
 -      /* If device is running, close it first. */
 -      list_for_each_entry(dev, head, unreg_list)
 -              list_add_tail(&dev->close_list, &close_head);
 -      dev_close_many(&close_head, true);
 -
 -      list_for_each_entry(dev, head, unreg_list) {
 -              /* And unlink it from device chain. */
 -              unlist_netdevice(dev);
 -
 -              dev->reg_state = NETREG_UNREGISTERING;
 -      }
 -      flush_all_backlogs();
 -
 -      synchronize_net();
 -
 -      list_for_each_entry(dev, head, unreg_list) {
 -              struct sk_buff *skb = NULL;
 -
 -              /* Shutdown queueing discipline. */
 -              dev_shutdown(dev);
 -
 -              dev_xdp_uninstall(dev);
 -
 -              /* Notify protocols, that we are about to destroy
 -               * this device. They should clean all the things.
 -               */
 -              call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 -
 -              if (!dev->rtnl_link_ops ||
 -                  dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 -                      skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 -                                                   GFP_KERNEL, NULL, 0);
 -
 -              /*
 -               *      Flush the unicast and multicast chains
 -               */
 -              dev_uc_flush(dev);
 -              dev_mc_flush(dev);
 -
 -              netdev_name_node_alt_flush(dev);
 -              netdev_name_node_free(dev->name_node);
 -
 -              if (dev->netdev_ops->ndo_uninit)
 -                      dev->netdev_ops->ndo_uninit(dev);
 -
 -              if (skb)
 -                      rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 -
 -              /* Notifier chain MUST detach us all upper devices. */
 -              WARN_ON(netdev_has_any_upper_dev(dev));
 -              WARN_ON(netdev_has_any_lower_dev(dev));
 -
 -              /* Remove entries from kobject tree */
 -              netdev_unregister_kobject(dev);
 -#ifdef CONFIG_XPS
 -              /* Remove XPS queueing entries */
 -              netif_reset_xps_queues_gt(dev, 0);
 -#endif
 -      }
 -
 -      synchronize_net();
 -
 -      list_for_each_entry(dev, head, unreg_list)
 -              dev_put(dev);
 -}
 -
 -static void rollback_registered(struct net_device *dev)
 -{
 -      LIST_HEAD(single);
 -
 -      list_add(&dev->unreg_list, &single);
 -      rollback_registered_many(&single);
 -      list_del(&single);
 -}
 -
  static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
        struct net_device *upper, netdev_features_t features)
  {
@@@ -10215,7 -10104,8 +10211,7 @@@ int register_netdevice(struct net_devic
        if (ret) {
                /* Expect explicit free_netdev() on failure */
                dev->needs_free_netdev = false;
 -              rollback_registered(dev);
 -              net_set_todo(dev);
 +              unregister_netdevice_queue(dev, NULL);
                goto out;
        }
        /*
@@@ -10837,10 -10727,9 +10833,10 @@@ void unregister_netdevice_queue(struct 
        if (head) {
                list_move_tail(&dev->unreg_list, head);
        } else {
 -              rollback_registered(dev);
 -              /* Finish processing unregister after unlock */
 -              net_set_todo(dev);
 +              LIST_HEAD(single);
 +
 +              list_add(&dev->unreg_list, &single);
 +              unregister_netdevice_many(&single);
        }
  }
  EXPORT_SYMBOL(unregister_netdevice_queue);
   */
  void unregister_netdevice_many(struct list_head *head)
  {
 -      struct net_device *dev;
 +      struct net_device *dev, *tmp;
 +      LIST_HEAD(close_head);
 +
 +      BUG_ON(dev_boot_phase);
 +      ASSERT_RTNL();
 +
 +      if (list_empty(head))
 +              return;
  
 -      if (!list_empty(head)) {
 -              rollback_registered_many(head);
 -              list_for_each_entry(dev, head, unreg_list)
 -                      net_set_todo(dev);
 -              list_del(head);
 +      list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 +              /* Some devices call without registering
 +               * for initialization unwind. Remove those
 +               * devices and proceed with the remaining.
 +               */
 +              if (dev->reg_state == NETREG_UNINITIALIZED) {
 +                      pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 +                               dev->name, dev);
 +
 +                      WARN_ON(1);
 +                      list_del(&dev->unreg_list);
 +                      continue;
 +              }
 +              dev->dismantle = true;
 +              BUG_ON(dev->reg_state != NETREG_REGISTERED);
 +      }
 +
 +      /* If device is running, close it first. */
 +      list_for_each_entry(dev, head, unreg_list)
 +              list_add_tail(&dev->close_list, &close_head);
 +      dev_close_many(&close_head, true);
 +
 +      list_for_each_entry(dev, head, unreg_list) {
 +              /* And unlink it from device chain. */
 +              unlist_netdevice(dev);
 +
 +              dev->reg_state = NETREG_UNREGISTERING;
 +      }
 +      flush_all_backlogs();
 +
 +      synchronize_net();
 +
 +      list_for_each_entry(dev, head, unreg_list) {
 +              struct sk_buff *skb = NULL;
 +
 +              /* Shutdown queueing discipline. */
 +              dev_shutdown(dev);
 +
 +              dev_xdp_uninstall(dev);
 +
 +              /* Notify protocols, that we are about to destroy
 +               * this device. They should clean all the things.
 +               */
 +              call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 +
 +              if (!dev->rtnl_link_ops ||
 +                  dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 +                      skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 +                                                   GFP_KERNEL, NULL, 0);
 +
 +              /*
 +               *      Flush the unicast and multicast chains
 +               */
 +              dev_uc_flush(dev);
 +              dev_mc_flush(dev);
 +
 +              netdev_name_node_alt_flush(dev);
 +              netdev_name_node_free(dev->name_node);
 +
 +              if (dev->netdev_ops->ndo_uninit)
 +                      dev->netdev_ops->ndo_uninit(dev);
 +
 +              if (skb)
 +                      rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 +
 +              /* Notifier chain MUST detach us all upper devices. */
 +              WARN_ON(netdev_has_any_upper_dev(dev));
 +              WARN_ON(netdev_has_any_lower_dev(dev));
 +
 +              /* Remove entries from kobject tree */
 +              netdev_unregister_kobject(dev);
 +#ifdef CONFIG_XPS
 +              /* Remove XPS queueing entries */
 +              netif_reset_xps_queues_gt(dev, 0);
 +#endif
 +      }
 +
 +      synchronize_net();
 +
 +      list_for_each_entry(dev, head, unreg_list) {
 +              dev_put(dev);
 +              net_set_todo(dev);
        }
 +
 +      list_del(head);
  }
  EXPORT_SYMBOL(unregister_netdevice_many);
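
With rollback_registered() and rollback_registered_many() folded away, unregister_netdevice_many() is now the single list-based teardown path and also queues the net_todo work itself. As a rough illustration (a hypothetical driver helper, not code from this patch), batching several devices onto one list keeps the expensive synchronize_net()/flush_all_backlogs() rounds to one per batch:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Hypothetical driver helper: tear down N netdevs with one RTNL-held
 * batch instead of N separate unregister_netdevice() calls.
 */
static void example_destroy_all(struct net_device **devs, int n)
{
	LIST_HEAD(unreg_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &unreg_list);
	/* Runs the close/unlist/notifier sequence above once for the whole
	 * batch and schedules the todo work for each device.
	 */
	unregister_netdevice_many(&unreg_list);
	rtnl_unlock();
}

unregister_netdevice() and unregister_netdevice_queue(dev, NULL) keep their semantics; per the hunks above they simply funnel into this same list-based path now.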
  
diff --combined net/core/filter.c
@@@ -2083,13 -2083,13 +2083,13 @@@ static const struct bpf_func_proto bpf_
  
  static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
  {
-       return dev_forward_skb(dev, skb);
+       return dev_forward_skb_nomtu(dev, skb);
  }
  
  static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
                                      struct sk_buff *skb)
  {
-       int ret = ____dev_forward_skb(dev, skb);
+       int ret = ____dev_forward_skb(dev, skb, false);
  
        if (likely(!ret)) {
                skb->dev = dev;
@@@ -2480,7 -2480,7 +2480,7 @@@ int skb_do_redirect(struct sk_buff *skb
                        goto out_drop;
                dev = ops->ndo_get_peer_dev(dev);
                if (unlikely(!dev ||
-                            !is_skb_forwardable(dev, skb) ||
+                            !(dev->flags & IFF_UP) ||
                             net_eq(net, dev_net(dev))))
                        goto out_drop;
                skb->dev = dev;
@@@ -3552,11 -3552,7 +3552,7 @@@ static int bpf_skb_net_shrink(struct sk
        return 0;
  }
  
- static u32 __bpf_skb_max_len(const struct sk_buff *skb)
- {
-       return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
-                         SKB_MAX_ALLOC;
- }
+ #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
  
  BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
@@@ -3605,7 -3601,7 +3601,7 @@@ BPF_CALL_4(bpf_skb_adjust_room, struct 
  {
        u32 len_cur, len_diff_abs = abs(len_diff);
        u32 len_min = bpf_skb_net_base_len(skb);
-       u32 len_max = __bpf_skb_max_len(skb);
+       u32 len_max = BPF_SKB_MAX_LEN;
        __be16 proto = skb->protocol;
        bool shrink = len_diff < 0;
        u32 off;
@@@ -3688,7 -3684,7 +3684,7 @@@ static int bpf_skb_trim_rcsum(struct sk
  static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
                                        u64 flags)
  {
-       u32 max_len = __bpf_skb_max_len(skb);
+       u32 max_len = BPF_SKB_MAX_LEN;
        u32 min_len = __bpf_skb_min_len(skb);
        int ret;
  
@@@ -3764,7 -3760,7 +3760,7 @@@ static const struct bpf_func_proto sk_s
  static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
                                        u64 flags)
  {
-       u32 max_len = __bpf_skb_max_len(skb);
+       u32 max_len = BPF_SKB_MAX_LEN;
        u32 new_len = skb->len + head_room;
        int ret;
  
@@@ -4631,6 -4627,18 +4627,18 @@@ static const struct bpf_func_proto bpf_
        .arg1_type      = ARG_PTR_TO_CTX,
  };
  
+ BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
+ {
+       return sk ? sock_gen_cookie(sk) : 0;
+ }
+
+ const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
+       .func           = bpf_get_socket_ptr_cookie,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ };
+
  BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
  {
        return __sock_gen_cookie(ctx->sk);
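
The bpf_get_socket_ptr_cookie proto above is what lets tracing programs call bpf_get_socket_cookie() on a BTF-typed struct sock * argument. A hedged sketch of such a caller; the fentry attach point (tcp_close) and the vmlinux.h/libbpf skeleton are illustrative assumptions, not part of this patch:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fentry/tcp_close")
int BPF_PROG(trace_tcp_close, struct sock *sk)
{
	/* sk is a BTF pointer compatible with ARG_PTR_TO_BTF_ID_SOCK_COMMON;
	 * the helper returns 0 when sk is NULL (see the BPF_CALL_1 above).
	 */
	__u64 cookie = bpf_get_socket_cookie(sk);

	bpf_printk("tcp_close: socket cookie %llu", cookie);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";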
@@@ -4645,9 -4653,11 +4653,9 @@@ static const struct bpf_func_proto bpf_
  
  static u64 __bpf_get_netns_cookie(struct sock *sk)
  {
 -#ifdef CONFIG_NET_NS
 -      return __net_gen_cookie(sk ? sk->sk_net.net : &init_net);
 -#else
 -      return 0;
 -#endif
 +      const struct net *net = sk ? sock_net(sk) : &init_net;
 +
 +      return net->net_cookie;
  }
  
  BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
@@@ -5291,12 -5301,14 +5299,14 @@@ static const struct bpf_func_proto bpf_
  #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
  static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
                                  const struct neighbour *neigh,
-                                 const struct net_device *dev)
+                                 const struct net_device *dev, u32 mtu)
  {
        memcpy(params->dmac, neigh->ha, ETH_ALEN);
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);
        params->h_vlan_TCI = 0;
        params->h_vlan_proto = 0;
+       if (mtu)
+               params->mtu_result = mtu; /* union with tot_len */
  
        return 0;
  }
@@@ -5312,8 -5324,8 +5322,8 @@@ static int bpf_ipv4_fib_lookup(struct n
        struct net_device *dev;
        struct fib_result res;
        struct flowi4 fl4;
+       u32 mtu = 0;
        int err;
-       u32 mtu;
  
        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
  
        if (check_mtu) {
                mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
-               if (params->tot_len > mtu)
+               if (params->tot_len > mtu) {
+                       params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+               }
        }
  
        nhc = res.nhc;
        if (!neigh)
                return BPF_FIB_LKUP_RET_NO_NEIGH;
  
-       return bpf_fib_set_fwd_params(params, neigh, dev);
+       return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
  }
  #endif
  
@@@ -5432,7 -5446,7 +5444,7 @@@ static int bpf_ipv6_fib_lookup(struct n
        struct flowi6 fl6;
        int strict = 0;
        int oif, err;
-       u32 mtu;
+       u32 mtu = 0;
  
        /* link local addresses are never forwarded */
        if (rt6_need_strict(dst) || rt6_need_strict(src))
  
        if (check_mtu) {
                mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
-               if (params->tot_len > mtu)
+               if (params->tot_len > mtu) {
+                       params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+               }
        }
  
        if (res.nh->fib_nh_lws)
        if (!neigh)
                return BPF_FIB_LKUP_RET_NO_NEIGH;
  
-       return bpf_fib_set_fwd_params(params, neigh, dev);
+       return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
  }
  #endif
  
@@@ -5571,6 -5587,7 +5585,7 @@@ BPF_CALL_4(bpf_skb_fib_lookup, struct s
  {
        struct net *net = dev_net(skb->dev);
        int rc = -EAFNOSUPPORT;
+       bool check_mtu = false;
  
        if (plen < sizeof(*params))
                return -EINVAL;
        if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
                return -EINVAL;
  
+       if (params->tot_len)
+               check_mtu = true;
        switch (params->family) {
  #if IS_ENABLED(CONFIG_INET)
        case AF_INET:
-               rc = bpf_ipv4_fib_lookup(net, params, flags, false);
+               rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
                break;
  #endif
  #if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
-               rc = bpf_ipv6_fib_lookup(net, params, flags, false);
+               rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
                break;
  #endif
        }
  
-       if (!rc) {
+       if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
                struct net_device *dev;
  
+               /* When tot_len isn't provided by user, check skb
+                * against MTU of FIB lookup resulting net_device
+                */
                dev = dev_get_by_index_rcu(net, params->ifindex);
                if (!is_skb_forwardable(dev, skb))
                        rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
+               params->mtu_result = dev->mtu; /* union with tot_len */
        }
  
        return rc;
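
For the skb flavour just above, a non-zero tot_len turns the MTU check on, and on BPF_FIB_LKUP_RET_FRAG_NEEDED the same union field comes back as mtu_result. A hedged tc-side sketch (the rest of the lookup key is omitted; AF_INET is defined locally because the BPF include set here is minimal):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("classifier")
int fib_mtu_example(struct __sk_buff *skb)
{
	struct bpf_fib_lookup params = {};
	int rc;

	params.family  = AF_INET;
	params.ifindex = skb->ingress_ifindex;
	params.tot_len = skb->len;	/* non-zero: ask for the MTU check */
	/* ... fill ipv4_src/ipv4_dst etc. as for any fib_lookup call ... */

	rc = bpf_fib_lookup(skb, &params, sizeof(params), 0);
	if (rc == BPF_FIB_LKUP_RET_FRAG_NEEDED) {
		/* tot_len and mtu_result share storage; the kernel wrote the
		 * route/device MTU here before returning FRAG_NEEDED.
		 */
		bpf_printk("frag needed, mtu %u", params.mtu_result);
	}

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";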
@@@ -5612,6 -5637,116 +5635,116 @@@ static const struct bpf_func_proto bpf_
        .arg4_type      = ARG_ANYTHING,
  };
  
+ static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
+                                           u32 ifindex)
+ {
+       struct net *netns = dev_net(dev_curr);
+       /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
+       if (ifindex == 0)
+               return dev_curr;
+       return dev_get_by_index_rcu(netns, ifindex);
+ }
+
+ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
+          u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+ {
+       int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+       struct net_device *dev = skb->dev;
+       int skb_len, dev_len;
+       int mtu;
+       if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
+               return -EINVAL;
+       if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+               return -EINVAL;
+       dev = __dev_via_ifindex(dev, ifindex);
+       if (unlikely(!dev))
+               return -ENODEV;
+       mtu = READ_ONCE(dev->mtu);
+       dev_len = mtu + dev->hard_header_len;
+       skb_len = skb->len + len_diff; /* minus result pass check */
+       if (skb_len <= dev_len) {
+               ret = BPF_MTU_CHK_RET_SUCCESS;
+               goto out;
+       }
+       /* At this point, skb->len exceeds the MTU, but as it includes the
+        * length of all GSO segments, each segment can still fit the MTU.
+        * The SKB can possibly get re-segmented in the transmit path (see
+        * validate_xmit_skb).  Thus, the user must choose whether segs are
+        * to be MTU checked.
+        */
+       if (skb_is_gso(skb)) {
+               ret = BPF_MTU_CHK_RET_SUCCESS;
+               if (flags & BPF_MTU_CHK_SEGS &&
+                   !skb_gso_validate_network_len(skb, mtu))
+                       ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
+       }
+ out:
+       /* BPF verifier guarantees valid pointer */
+       *mtu_len = mtu;
+       return ret;
+ }
+
+ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
+          u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+ {
+       struct net_device *dev = xdp->rxq->dev;
+       int xdp_len = xdp->data_end - xdp->data;
+       int ret = BPF_MTU_CHK_RET_SUCCESS;
+       int mtu, dev_len;
+       /* XDP variant doesn't support multi-buffer segment check (yet) */
+       if (unlikely(flags))
+               return -EINVAL;
+       dev = __dev_via_ifindex(dev, ifindex);
+       if (unlikely(!dev))
+               return -ENODEV;
+       mtu = READ_ONCE(dev->mtu);
+       /* Add L2-header as dev MTU is L3 size */
+       dev_len = mtu + dev->hard_header_len;
+       xdp_len += len_diff; /* minus result pass check */
+       if (xdp_len > dev_len)
+               ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+       /* BPF verifier guarantees valid pointer */
+       *mtu_len = mtu;
+       return ret;
+ }
+
+ static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
+       .func           = bpf_skb_check_mtu,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_PTR_TO_INT,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+ };
+
+ static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
+       .func           = bpf_xdp_check_mtu,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_PTR_TO_INT,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+ };
+
  #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
  static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
  {
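
A typical user of the new helper is a tc program that wants to know, before growing the packet, whether the grown length would still fit the egress device. A hedged sketch; the encapsulation size and the program skeleton are illustrative, while the helper signature and the BPF_MTU_CHK_* names follow the UAPI added by this series:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define ENCAP_BYTES 20	/* illustrative: headroom we plan to add */

SEC("classifier")
int mtu_guard(struct __sk_buff *skb)
{
	__u32 mtu_len = 0;
	int ret;

	/* ifindex 0 means "check against skb->dev"; len_diff is the planned
	 * growth.  BPF_MTU_CHK_SEGS would also validate GSO segments, but is
	 * only accepted with len_diff == 0.
	 */
	ret = bpf_check_mtu(skb, 0, &mtu_len, ENCAP_BYTES, 0);
	if (ret != BPF_MTU_CHK_RET_SUCCESS) {
		bpf_printk("len + %d exceeds mtu %u", ENCAP_BYTES, mtu_len);
		return TC_ACT_SHOT;
	}

	/* safe to bpf_skb_adjust_room(skb, ENCAP_BYTES, ...) from here */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";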
@@@ -7021,6 -7156,14 +7154,14 @@@ sock_addr_func_proto(enum bpf_func_id f
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_UDP4_RECVMSG:
+               case BPF_CGROUP_UDP6_RECVMSG:
+               case BPF_CGROUP_UDP4_SENDMSG:
+               case BPF_CGROUP_UDP6_SENDMSG:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                        return &bpf_sock_addr_setsockopt_proto;
                default:
                        return NULL;
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_UDP4_RECVMSG:
+               case BPF_CGROUP_UDP6_RECVMSG:
+               case BPF_CGROUP_UDP4_SENDMSG:
+               case BPF_CGROUP_UDP6_SENDMSG:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                        return &bpf_sock_addr_getsockopt_proto;
                default:
                        return NULL;
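
With these cases added, the bpf_{s,g}etsockopt() helpers that were previously reachable only from the bind/connect hooks now also work from the sendmsg/recvmsg/getpeername/getsockname sock_addr hooks. A minimal hedged sketch of a sendmsg4 hook tagging outgoing UDP sockets; the SO_MARK value is arbitrary and the SOL_SOCKET/SO_MARK defines are local fallbacks:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1
#endif
#ifndef SO_MARK
#define SO_MARK 36
#endif

SEC("cgroup/sendmsg4")
int mark_udp_tx(struct bpf_sock_addr *ctx)
{
	int mark = 0x2a;	/* arbitrary example mark */

	/* ctx is the sock_addr context; the helper operates on its socket */
	bpf_setsockopt(ctx, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
	return 1;		/* allow the sendmsg */
}

char _license[] SEC("license") = "GPL";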
@@@ -7181,6 -7332,8 +7330,8 @@@ tc_cls_act_func_proto(enum bpf_func_id 
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_skb_fib_lookup_proto;
+       case BPF_FUNC_check_mtu:
+               return &bpf_skb_check_mtu_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
@@@ -7250,6 -7403,8 +7401,8 @@@ xdp_func_proto(enum bpf_func_id func_id
                return &bpf_xdp_adjust_tail_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_xdp_fib_lookup_proto;
+       case BPF_FUNC_check_mtu:
+               return &bpf_xdp_check_mtu_proto;
  #ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_xdp_sk_lookup_udp_proto;
@@@ -8814,7 -8969,7 +8967,7 @@@ u32 bpf_sock_convert_ctx_access(enum bp
                                       target_size));
                break;
        case offsetof(struct bpf_sock, rx_queue_mapping):
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
                        si->dst_reg, si->src_reg,
diff --combined net/ipv4/af_inet.c
@@@ -438,6 -438,7 +438,7 @@@ EXPORT_SYMBOL(inet_release)
  int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
  {
        struct sock *sk = sock->sk;
+       u32 flags = BIND_WITH_LOCK;
        int err;
  
        /* If the socket has its own bind function then use it. (RAW) */
        /* BPF prog is run before any checks are done so that if the prog
         * changes context in a wrong way it will be caught.
         */
-       err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr);
+       err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+                                                BPF_CGROUP_INET4_BIND, &flags);
        if (err)
                return err;
  
-       return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
+       return __inet_bind(sk, uaddr, addr_len, flags);
  }
  EXPORT_SYMBOL(inet_bind);
  
@@@ -499,7 -501,8 +501,8 @@@ int __inet_bind(struct sock *sk, struc
  
        snum = ntohs(addr->sin_port);
        err = -EACCES;
-       if (snum && inet_port_requires_bind_service(net, snum) &&
+       if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+           snum && inet_port_requires_bind_service(net, snum) &&
            !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                goto out;
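
The BIND_NO_CAP_NET_BIND_SERVICE flag means that, once a cgroup bind4/bind6 program has opted a socket out of the check, an unprivileged process in that cgroup can bind below ip_unprivileged_port_start. A hedged userspace sketch under that assumption (the BPF program itself and its attachment are not shown and are assumed to already be in place):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(443),	/* below ip_unprivileged_port_start */
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");		/* EACCES without the BPF opt-out */
	else
		puts("bound to 443 without CAP_NET_BIND_SERVICE");

	close(fd);
	return 0;
}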
  
@@@ -777,18 -780,19 +780,19 @@@ int inet_getname(struct socket *sock, s
                        return -ENOTCONN;
                sin->sin_port = inet->inet_dport;
                sin->sin_addr.s_addr = inet->inet_daddr;
+               BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+                                           BPF_CGROUP_INET4_GETPEERNAME,
+                                           NULL);
        } else {
                __be32 addr = inet->inet_rcv_saddr;
                if (!addr)
                        addr = inet->inet_saddr;
                sin->sin_port = inet->inet_sport;
                sin->sin_addr.s_addr = addr;
-       }
-       if (cgroup_bpf_enabled)
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                                           peer ? BPF_CGROUP_INET4_GETPEERNAME :
-                                                  BPF_CGROUP_INET4_GETSOCKNAME,
+                                           BPF_CGROUP_INET4_GETSOCKNAME,
                                            NULL);
+       }
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
        return sizeof(*sin);
  }
@@@ -1419,6 -1423,7 +1423,6 @@@ struct sk_buff *inet_gso_segment(struc
  out:
        return segs;
  }
 -EXPORT_SYMBOL(inet_gso_segment);
  
  static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
                                        netdev_features_t features)
@@@ -1549,6 -1554,7 +1553,6 @@@ out
  
        return pp;
  }
 -EXPORT_SYMBOL(inet_gro_receive);
  
  static struct sk_buff *ipip_gro_receive(struct list_head *head,
                                        struct sk_buff *skb)
@@@ -1634,6 -1640,7 +1638,6 @@@ out_unlock
  
        return err;
  }
 -EXPORT_SYMBOL(inet_gro_complete);
  
  static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
  {
@@@ -1868,8 -1875,6 +1872,8 @@@ static __net_init int inet_init_net(str
        net->ipv4.sysctl_igmp_llm_reports = 1;
        net->ipv4.sysctl_igmp_qrv = 2;
  
 +      net->ipv4.sysctl_fib_notify_on_flag_change = 0;
 +
        return 0;
  }
  
diff --combined net/ipv4/tcp.c
  #include <asm/ioctls.h>
  #include <net/busy_poll.h>
  
 +/* Track pending CMSGs. */
 +enum {
 +      TCP_CMSG_INQ = 1,
 +      TCP_CMSG_TS = 2
 +};
 +
  struct percpu_counter tcp_orphan_count;
  EXPORT_SYMBOL_GPL(tcp_orphan_count);
  
@@@ -481,11 -475,19 +481,11 @@@ static void tcp_tx_timestamp(struct soc
        }
  }
  
 -static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
 -                                        int target, struct sock *sk)
 +static bool tcp_stream_is_readable(struct sock *sk, int target)
  {
 -      int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
 -
 -      if (avail > 0) {
 -              if (avail >= target)
 -                      return true;
 -              if (tcp_rmem_pressure(sk))
 -                      return true;
 -              if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
 -                      return true;
 -      }
 +      if (tcp_epollin_ready(sk, target))
 +              return true;
 +
        if (sk->sk_prot->stream_memory_read)
                return sk->sk_prot->stream_memory_read(sk);
        return false;
@@@ -560,7 -562,7 +560,7 @@@ __poll_t tcp_poll(struct file *file, st
                    tp->urg_data)
                        target++;
  
 -              if (tcp_stream_is_readable(tp, target, sk))
 +              if (tcp_stream_is_readable(sk, target))
                        mask |= EPOLLIN | EPOLLRDNORM;
  
                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
@@@ -1737,20 -1739,6 +1737,20 @@@ int tcp_set_rcvlowat(struct sock *sk, i
  }
  EXPORT_SYMBOL(tcp_set_rcvlowat);
  
 +static void tcp_update_recv_tstamps(struct sk_buff *skb,
 +                                  struct scm_timestamping_internal *tss)
 +{
 +      if (skb->tstamp)
 +              tss->ts[0] = ktime_to_timespec64(skb->tstamp);
 +      else
 +              tss->ts[0] = (struct timespec64) {0};
 +
 +      if (skb_hwtstamps(skb)->hwtstamp)
 +              tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
 +      else
 +              tss->ts[2] = (struct timespec64) {0};
 +}
 +
  #ifdef CONFIG_MMU
  static const struct vm_operations_struct tcp_vm_ops = {
  };
@@@ -1854,13 -1842,13 +1854,13 @@@ static int tcp_recvmsg_locked(struct so
                              struct scm_timestamping_internal *tss,
                              int *cmsg_flags);
  static int receive_fallback_to_copy(struct sock *sk,
 -                                  struct tcp_zerocopy_receive *zc, int inq)
 +                                  struct tcp_zerocopy_receive *zc, int inq,
 +                                  struct scm_timestamping_internal *tss)
  {
        unsigned long copy_address = (unsigned long)zc->copybuf_address;
 -      struct scm_timestamping_internal tss_unused;
 -      int err, cmsg_flags_unused;
        struct msghdr msg = {};
        struct iovec iov;
 +      int err;
  
        zc->length = 0;
        zc->recv_skip_hint = 0;
                return err;
  
        err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
 -                               &tss_unused, &cmsg_flags_unused);
 +                               tss, &zc->msg_flags);
        if (err < 0)
                return err;
  
@@@ -1915,27 -1903,21 +1915,27 @@@ static int tcp_copy_straggler_data(stru
        return (__s32)copylen;
  }
  
 -static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
 -                                           struct sock *sk,
 -                                           struct sk_buff *skb,
 -                                           u32 *seq,
 -                                           s32 copybuf_len)
 +static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
 +                                struct sock *sk,
 +                                struct sk_buff *skb,
 +                                u32 *seq,
 +                                s32 copybuf_len,
 +                                struct scm_timestamping_internal *tss)
  {
        u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
  
        if (!copylen)
                return 0;
        /* skb is null if inq < PAGE_SIZE. */
 -      if (skb)
 +      if (skb) {
                offset = *seq - TCP_SKB_CB(skb)->seq;
 -      else
 +      } else {
                skb = tcp_recv_skb(sk, *seq, &offset);
 +              if (TCP_SKB_CB(skb)->has_rxtstamp) {
 +                      tcp_update_recv_tstamps(skb, tss);
 +                      zc->msg_flags |= TCP_CMSG_TS;
 +              }
 +      }
  
        zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
                                                  seq);
@@@ -2022,38 -2004,9 +2022,38 @@@ static int tcp_zerocopy_vm_insert_batch
                err);
  }
  
 +#define TCP_VALID_ZC_MSG_FLAGS   (TCP_CMSG_TS)
 +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 +                             struct scm_timestamping_internal *tss);
 +static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
 +                                    struct tcp_zerocopy_receive *zc,
 +                                    struct scm_timestamping_internal *tss)
 +{
 +      unsigned long msg_control_addr;
 +      struct msghdr cmsg_dummy;
 +
 +      msg_control_addr = (unsigned long)zc->msg_control;
 +      cmsg_dummy.msg_control = (void *)msg_control_addr;
 +      cmsg_dummy.msg_controllen =
 +              (__kernel_size_t)zc->msg_controllen;
 +      cmsg_dummy.msg_flags = in_compat_syscall()
 +              ? MSG_CMSG_COMPAT : 0;
 +      zc->msg_flags = 0;
 +      if (zc->msg_control == msg_control_addr &&
 +          zc->msg_controllen == cmsg_dummy.msg_controllen) {
 +              tcp_recv_timestamp(&cmsg_dummy, sk, tss);
 +              zc->msg_control = (__u64)
 +                      ((uintptr_t)cmsg_dummy.msg_control);
 +              zc->msg_controllen =
 +                      (__u64)cmsg_dummy.msg_controllen;
 +              zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
 +      }
 +}
 +
  #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
  static int tcp_zerocopy_receive(struct sock *sk,
 -                              struct tcp_zerocopy_receive *zc)
 +                              struct tcp_zerocopy_receive *zc,
 +                              struct scm_timestamping_internal *tss)
  {
        u32 length = 0, offset, vma_len, avail_len, copylen = 0;
        unsigned long address = (unsigned long)zc->address;
        int ret;
  
        zc->copybuf_len = 0;
 +      zc->msg_flags = 0;
  
        if (address & (PAGE_SIZE - 1) || address != zc->address)
                return -EINVAL;
        sock_rps_record_flow(sk);
  
        if (inq && inq <= copybuf_len)
 -              return receive_fallback_to_copy(sk, zc, inq);
 +              return receive_fallback_to_copy(sk, zc, inq, tss);
  
        if (inq < PAGE_SIZE) {
                zc->length = 0;
                        } else {
                                skb = tcp_recv_skb(sk, seq, &offset);
                        }
 +
 +                      if (TCP_SKB_CB(skb)->has_rxtstamp) {
 +                              tcp_update_recv_tstamps(skb, tss);
 +                              zc->msg_flags |= TCP_CMSG_TS;
 +                      }
                        zc->recv_skip_hint = skb->len - offset;
                        frags = skb_advance_to_frag(skb, offset, &offset_frag);
                        if (!frags || offset_frag)
@@@ -2173,7 -2120,8 +2173,7 @@@ out
        mmap_read_unlock(current->mm);
        /* Try to copy straggler data. */
        if (!ret)
 -              copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
 -                                                          copybuf_len);
 +              copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
  
        if (length + copylen) {
                WRITE_ONCE(tp->copied_seq, seq);
  }
  #endif
  
 -static void tcp_update_recv_tstamps(struct sk_buff *skb,
 -                                  struct scm_timestamping_internal *tss)
 -{
 -      if (skb->tstamp)
 -              tss->ts[0] = ktime_to_timespec64(skb->tstamp);
 -      else
 -              tss->ts[0] = (struct timespec64) {0};
 -
 -      if (skb_hwtstamps(skb)->hwtstamp)
 -              tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
 -      else
 -              tss->ts[2] = (struct timespec64) {0};
 -}
 -
  /* Similar to __sock_recv_timestamp, but does not require an skb */
  static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
                               struct scm_timestamping_internal *tss)
@@@ -2310,7 -2272,7 +2310,7 @@@ static int tcp_recvmsg_locked(struct so
                goto out;
  
        if (tp->recvmsg_inq)
 -              *cmsg_flags = 1;
 +              *cmsg_flags = TCP_CMSG_INQ;
        timeo = sock_rcvtimeo(sk, nonblock);
  
        /* Urgent data needs to be handled specially. */
@@@ -2491,7 -2453,7 +2491,7 @@@ skip_copy
  
                if (TCP_SKB_CB(skb)->has_rxtstamp) {
                        tcp_update_recv_tstamps(skb, tss);
 -                      *cmsg_flags |= 2;
 +                      *cmsg_flags |= TCP_CMSG_TS;
                }
  
                if (used + offset < skb->len)
@@@ -2551,9 -2513,9 +2551,9 @@@ int tcp_recvmsg(struct sock *sk, struc
        release_sock(sk);
  
        if (cmsg_flags && ret >= 0) {
 -              if (cmsg_flags & 2)
 +              if (cmsg_flags & TCP_CMSG_TS)
                        tcp_recv_timestamp(msg, sk, &tss);
 -              if (cmsg_flags & 1) {
 +              if (cmsg_flags & TCP_CMSG_INQ) {
                        inq = tcp_inq_hint(sk);
                        put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
                }
@@@ -3805,24 -3767,11 +3805,24 @@@ static size_t tcp_opt_stats_get_size(vo
                nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
                nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
                nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
 +              nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
                0;
  }
  
 +/* Returns TTL or hop limit of an incoming packet from skb. */
 +static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
 +{
 +      if (skb->protocol == htons(ETH_P_IP))
 +              return ip_hdr(skb)->ttl;
 +      else if (skb->protocol == htons(ETH_P_IPV6))
 +              return ipv6_hdr(skb)->hop_limit;
 +      else
 +              return 0;
 +}
 +
  struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 -                                             const struct sk_buff *orig_skb)
 +                                             const struct sk_buff *orig_skb,
 +                                             const struct sk_buff *ack_skb)
  {
        const struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *stats;
                    max_t(int, 0, tp->write_seq - tp->snd_nxt));
        nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
                          TCP_NLA_PAD);
 +      if (ack_skb)
 +              nla_put_u8(stats, TCP_NLA_TTL,
 +                         tcp_skb_ttl_or_hop_limit(ack_skb));
  
        return stats;
  }
@@@ -4137,7 -4083,6 +4137,7 @@@ static int do_tcp_getsockopt(struct soc
        }
  #ifdef CONFIG_MMU
        case TCP_ZEROCOPY_RECEIVE: {
 +              struct scm_timestamping_internal tss;
                struct tcp_zerocopy_receive zc = {};
                int err;
  
                        return -EFAULT;
                if (len < offsetofend(struct tcp_zerocopy_receive, length))
                        return -EINVAL;
 -              if (len > sizeof(zc)) {
 +              if (unlikely(len > sizeof(zc))) {
 +                      err = check_zeroed_user(optval + sizeof(zc),
 +                                              len - sizeof(zc));
 +                      if (err < 1)
 +                              return err == 0 ? -EINVAL : err;
                        len = sizeof(zc);
                        if (put_user(len, optlen))
                                return -EFAULT;
                }
                if (copy_from_user(&zc, optval, len))
                        return -EFAULT;
 +              if (zc.reserved)
 +                      return -EINVAL;
 +              if (zc.msg_flags &  ~(TCP_VALID_ZC_MSG_FLAGS))
 +                      return -EINVAL;
                lock_sock(sk);
 -              err = tcp_zerocopy_receive(sk, &zc);
 +              err = tcp_zerocopy_receive(sk, &zc, &tss);
+               err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+                                                         &zc, &len, err);
                release_sock(sk);
 -              if (len >= offsetofend(struct tcp_zerocopy_receive, err))
 -                      goto zerocopy_rcv_sk_err;
 +              if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
 +                      goto zerocopy_rcv_cmsg;
                switch (len) {
 +              case offsetofend(struct tcp_zerocopy_receive, msg_flags):
 +                      goto zerocopy_rcv_cmsg;
 +              case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
 +              case offsetofend(struct tcp_zerocopy_receive, msg_control):
 +              case offsetofend(struct tcp_zerocopy_receive, flags):
 +              case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
 +              case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
                case offsetofend(struct tcp_zerocopy_receive, err):
                        goto zerocopy_rcv_sk_err;
                case offsetofend(struct tcp_zerocopy_receive, inq):
                default:
                        goto zerocopy_rcv_out;
                }
 +zerocopy_rcv_cmsg:
 +              if (zc.msg_flags & TCP_CMSG_TS)
 +                      tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
 +              else
 +                      zc.msg_flags = 0;
  zerocopy_rcv_sk_err:
                if (!err)
                        zc.err = sock_error(sk);
@@@ -4208,6 -4135,18 +4210,18 @@@ zerocopy_rcv_out
        return 0;
  }
  
+ bool tcp_bpf_bypass_getsockopt(int level, int optname)
+ {
+       /* TCP do_tcp_getsockopt has optimized getsockopt implementation
+        * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
+        */
+       if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+               return true;
+       return false;
+ }
+ EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+
  int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
                   int __user *optlen)
  {
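
On the userspace side, the new msg_control/msg_controllen/msg_flags members let the TCP_ZEROCOPY_RECEIVE getsockopt() hand back the same SCM_TIMESTAMPING control messages a recvmsg() would. A hedged sketch against the uapi struct layout in this series (error handling and the mmap()/SO_TIMESTAMPING setup are assumed to exist elsewhere):

#include <linux/errqueue.h>	/* struct scm_timestamping */
#include <linux/tcp.h>		/* struct tcp_zerocopy_receive, TCP_ZEROCOPY_RECEIVE */
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <string.h>
#include <sys/socket.h>

/* fd: connected TCP socket with SO_TIMESTAMPING enabled;
 * map/map_len: page-aligned area previously set up for zerocopy receive.
 */
static long zc_receive_with_tstamps(int fd, void *map, unsigned long map_len)
{
	char cmsg_buf[CMSG_SPACE(sizeof(struct scm_timestamping))];
	struct tcp_zerocopy_receive zc;
	socklen_t len = sizeof(zc);

	memset(&zc, 0, sizeof(zc));
	zc.address        = (__u64)(unsigned long)map;
	zc.length         = map_len;
	zc.msg_control    = (__u64)(unsigned long)cmsg_buf;
	zc.msg_controllen = sizeof(cmsg_buf);

	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &len))
		return -1;

	/* On return, zc.msg_controllen/zc.msg_flags describe any cmsg data
	 * (receive timestamps) the kernel wrote into cmsg_buf.
	 */
	return zc.length;
}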
diff --combined net/ipv4/tcp_ipv4.c
@@@ -1649,8 -1649,6 +1649,8 @@@ u16 tcp_v4_get_syncookie(struct sock *s
        return mss;
  }
  
 +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 +                                                         u32));
  /* The socket must have it's spinlock held when we get
   * here, unless it is a TCP_LISTEN socket.
   *
@@@ -1670,8 -1668,7 +1670,8 @@@ int tcp_v4_do_rcv(struct sock *sk, stru
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 -                          !dst->ops->check(dst, 0)) {
 +                          !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
 +                                           dst, 0)) {
                                dst_release(dst);
                                sk->sk_rx_dst = NULL;
                        }
@@@ -2796,6 -2793,7 +2796,7 @@@ struct proto tcp_prot = 
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
+       .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
        .keepalive              = tcp_set_keepalive,
        .recvmsg                = tcp_recvmsg,
        .sendmsg                = tcp_sendmsg,
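
The INDIRECT_CALL_1() wrappers used in tcp_v4_do_rcv()/tcp_v6_do_rcv() above turn the dst->ops->check() indirect call into a compare plus direct call when the target is the expected ipv4_dst_check()/ip6_dst_check(), avoiding a retpoline on the hot path. Roughly (a simplified rendering of include/linux/indirect_call_wrapper.h, not the exact macro text):

/* Simplified sketch of the wrapper's behaviour */
#ifdef CONFIG_RETPOLINE
#define INDIRECT_CALL_1(f, f1, ...) \
	(likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__))
#else
#define INDIRECT_CALL_1(f, f1, ...) f(__VA_ARGS__)
#endif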
diff --combined net/ipv4/udp.c
@@@ -596,12 -596,6 +596,12 @@@ void udp_encap_enable(void
  }
  EXPORT_SYMBOL(udp_encap_enable);
  
 +void udp_encap_disable(void)
 +{
 +      static_branch_dec(&udp_encap_needed_key);
 +}
 +EXPORT_SYMBOL(udp_encap_disable);
 +
  /* Handler for tunnels with arbitrary destination ports: no socket lookup, go
   * through error handlers in encapsulations looking for a match.
   */
@@@ -1130,7 -1124,7 +1130,7 @@@ int udp_sendmsg(struct sock *sk, struc
                rcu_read_unlock();
        }
  
-       if (cgroup_bpf_enabled && !connected) {
+       if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) {
                err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
                                            (struct sockaddr *)usin, &ipc.addr);
                if (err)
@@@ -1864,9 -1858,8 +1864,8 @@@ try_again
                memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
                *addr_len = sizeof(*sin);
  
-               if (cgroup_bpf_enabled)
-                       BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
-                                                       (struct sockaddr *)sin);
+               BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+                                                     (struct sockaddr *)sin);
        }
  
        if (udp_sk(sk)->gro_enabled)
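
The open-coded cgroup_bpf_enabled checks around the recvmsg hooks could be dropped, and the sendmsg checks became typed, because the guard is now per attach type and folded into the BPF_CGROUP_RUN_PROG_*() macros. Roughly, and simplified from the bpf-cgroup header changes in this series (names here are a sketch of that change, not quoted from this diff):

/* Before: one static key gates every cgroup-bpf hook */
#define cgroup_bpf_enabled \
	static_branch_unlikely(&cgroup_bpf_enabled_key)

/* After: one static key per attach type */
#define cgroup_bpf_enabled(atype) \
	static_branch_unlikely(&cgroup_bpf_enabled_key[atype])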
diff --combined net/ipv6/af_inet6.c
@@@ -295,7 -295,8 +295,8 @@@ static int __inet6_bind(struct sock *sk
                return -EINVAL;
  
        snum = ntohs(addr->sin6_port);
-       if (snum && inet_port_requires_bind_service(net, snum) &&
+       if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+           snum && inet_port_requires_bind_service(net, snum) &&
            !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                return -EACCES;
  
@@@ -439,6 -440,7 +440,7 @@@ out_unlock
  int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
  {
        struct sock *sk = sock->sk;
+       u32 flags = BIND_WITH_LOCK;
        int err = 0;
  
        /* If the socket has its own bind function then use it. */
        /* BPF prog is run before any checks are done so that if the prog
         * changes context in a wrong way it will be caught.
         */
-       err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr);
+       err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+                                                BPF_CGROUP_INET6_BIND, &flags);
        if (err)
                return err;
  
-       return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
+       return __inet6_bind(sk, uaddr, addr_len, flags);
  }
  EXPORT_SYMBOL(inet6_bind);
  
@@@ -527,18 -530,19 +530,19 @@@ int inet6_getname(struct socket *sock, 
                sin->sin6_addr = sk->sk_v6_daddr;
                if (np->sndflow)
                        sin->sin6_flowinfo = np->flow_label;
+               BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+                                           BPF_CGROUP_INET6_GETPEERNAME,
+                                           NULL);
        } else {
                if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
                        sin->sin6_addr = np->saddr;
                else
                        sin->sin6_addr = sk->sk_v6_rcv_saddr;
                sin->sin6_port = inet->inet_sport;
-       }
-       if (cgroup_bpf_enabled)
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                                           peer ? BPF_CGROUP_INET6_GETPEERNAME :
-                                                  BPF_CGROUP_INET6_GETSOCKNAME,
+                                           BPF_CGROUP_INET6_GETSOCKNAME,
                                            NULL);
+       }
        sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
                                                 sk->sk_bound_dev_if);
        return sizeof(*sin);
@@@ -954,7 -958,6 +958,7 @@@ static int __net_init inet6_net_init(st
        net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT;
        net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN;
        net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN;
 +      net->ipv6.sysctl.fib_notify_on_flag_change = 0;
        atomic_set(&net->ipv6.fib6_sernum, 1);
  
        err = ipv6_init_mibs(net);
diff --combined net/ipv6/tcp_ipv6.c
@@@ -1420,8 -1420,6 +1420,8 @@@ out
        return NULL;
  }
  
 +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 +                                                         u32));
  /* The socket must have it's spinlock held when we get
   * here, unless it is a TCP_LISTEN socket.
   *
@@@ -1475,8 -1473,7 +1475,8 @@@ static int tcp_v6_do_rcv(struct sock *s
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 -                          dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
 +                          INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
 +                                          dst, np->rx_dst_cookie) == NULL) {
                                dst_release(dst);
                                sk->sk_rx_dst = NULL;
                        }
@@@ -2124,6 -2121,7 +2124,7 @@@ struct proto tcpv6_prot = 
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
+       .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
        .keepalive              = tcp_set_keepalive,
        .recvmsg                = tcp_recvmsg,
        .sendmsg                = tcp_sendmsg,
diff --combined net/ipv6/udp.c
@@@ -409,9 -409,8 +409,8 @@@ try_again
                }
                *addr_len = sizeof(*sin6);
  
-               if (cgroup_bpf_enabled)
-                       BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
-                                               (struct sockaddr *)sin6);
+               BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+                                                     (struct sockaddr *)sin6);
        }
  
        if (udp_sk(sk)->gro_enabled)
@@@ -1462,7 -1461,7 +1461,7 @@@ do_udp_sendmsg
                fl6.saddr = np->saddr;
        fl6.fl6_sport = inet->inet_sport;
  
-       if (cgroup_bpf_enabled && !connected) {
+       if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) {
                err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
                                           (struct sockaddr *)sin6, &fl6.saddr);
                if (err)
@@@ -1608,10 -1607,8 +1607,10 @@@ void udpv6_destroy_sock(struct sock *sk
                        if (encap_destroy)
                                encap_destroy(sk);
                }
 -              if (up->encap_enabled)
 +              if (up->encap_enabled) {
                        static_branch_dec(&udpv6_encap_needed_key);
 +                      udp_encap_disable();
 +              }
        }
  
        inet6_destroy_sock(sk);