Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
author     David S. Miller <davem@davemloft.net>   Tue, 16 Feb 2021 21:14:06 +0000 (13:14 -0800)
committer  David S. Miller <davem@davemloft.net>   Tue, 16 Feb 2021 21:14:06 +0000 (13:14 -0800)
Daniel Borkmann says:

====================
pull-request: bpf-next 2021-02-16

The following pull-request contains BPF updates for your *net-next* tree.

There's a small merge conflict between 7eeba1706eba ("tcp: Add receive timestamp
support for receive zerocopy.") from the net-next tree and 9cacf81f8161 ("bpf: Remove
extra lock_sock for TCP_ZEROCOPY_RECEIVE") from the bpf-next tree. Resolve as follows:

  [...]
                lock_sock(sk);
                err = tcp_zerocopy_receive(sk, &zc, &tss);
                err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
                                                          &zc, &len, err);
                release_sock(sk);
  [...]

We've added 116 non-merge commits during the last 27 day(s) which contain
a total of 156 files changed, 5662 insertions(+), 1489 deletions(-).

The main changes are:

1) Add support for pointers to types with known size among global function
   args to overcome the limit on the max number of allowed args, from Dmitrii Banshchikov.

2) Add bpf_iter for task_vma, which can be used to generate information similar
   to /proc/pid/maps, from Song Liu.
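
   For illustration only, a minimal task_vma iterator could look roughly like
   the sketch below. The SEC() name and the bpf_iter__task_vma context layout
   come from this series (a vmlinux.h generated from such a kernel is assumed);
   the program name and output format are hypothetical:

      #include "vmlinux.h"
      #include <bpf/bpf_helpers.h>
      #include <bpf/bpf_tracing.h>   /* BPF_SEQ_PRINTF() */

      char LICENSE[] SEC("license") = "GPL";

      SEC("iter/task_vma")
      int dump_vmas(struct bpf_iter__task_vma *ctx)
      {
              struct vm_area_struct *vma = ctx->vma;
              struct task_struct *task = ctx->task;
              struct seq_file *seq = ctx->meta->seq;

              if (!vma || !task)
                      return 0;

              /* one line per VMA, loosely following /proc/pid/maps */
              BPF_SEQ_PRINTF(seq, "%d %08llx-%08llx\n", task->tgid,
                             vma->vm_start, vma->vm_end);
              return 0;
      }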

3) Enable bpf_{g,s}etsockopt() from all sock_addr related program hooks. Allow
   rewriting bind user ports from the BPF side to values below the
   ip_unprivileged_port_start range, both from Stanislav Fomichev.
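
   As a rough sketch of what this enables (the program below is illustrative;
   it assumes the return-flag semantics added in this series, where setting
   bit 1 of the return value asks the kernel to skip the privileged-port
   capability check, and the #defines mirror the asm-generic uapi values):

      #include <linux/bpf.h>
      #include <bpf/bpf_helpers.h>
      #include <bpf/bpf_endian.h>

      #ifndef SOL_SOCKET
      #define SOL_SOCKET 1    /* asm-generic uapi value */
      #endif
      #ifndef SO_MARK
      #define SO_MARK    36   /* asm-generic uapi value */
      #endif

      char LICENSE[] SEC("license") = "GPL";

      SEC("cgroup/bind4")
      int bind4_prog(struct bpf_sock_addr *ctx)
      {
              int mark = 42;  /* illustrative mark value */

              /* bpf_{g,s}etsockopt() is now callable from sock_addr hooks */
              bpf_setsockopt(ctx, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));

              /* rewrite the bind port to one below ip_unprivileged_port_start */
              ctx->user_port = bpf_htons(111);

              /* bit 0: allow the bind, bit 1: skip the privileged-port check */
              return 3;
      }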

4) Prevent recursion on fentry/fexit & sleepable programs and allow map-in-map
   as well as per-cpu maps for the latter, from Alexei Starovoitov.

5) Add a selftest script to run BPF CI locally. Also enable the BPF ring buffer
   for sleepable programs, both from KP Singh.

6) Extend verifier to enable variable offset read/write access to the BPF
   program stack, from Andrei Matei.
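
   A toy example of a pattern the verifier can now accept for privileged
   programs (previously rejected as a variable stack access); the program
   type, section name and helper choice below are arbitrary:

      #include <linux/bpf.h>
      #include <bpf/bpf_helpers.h>

      char LICENSE[] SEC("license") = "GPL";

      SEC("tracepoint/syscalls/sys_enter_nanosleep")
      int var_off_stack(void *ctx)
      {
              char buf[16] = {};
              /* runtime-dependent index -> variable offset into the stack */
              __u32 idx = bpf_get_prandom_u32() & 0xf;

              return buf[idx];
      }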

7) Improve tc & XDP MTU handling and add a new bpf_check_mtu() helper to
   query device MTU from programs, from Jesper Dangaard Brouer.
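
   A minimal sketch of calling the new helper from XDP, assuming headers and
   bpf_helper_defs.h generated from a kernel that carries this series; the
   ifindex and the drop policy are illustrative:

      #include <linux/bpf.h>
      #include <bpf/bpf_helpers.h>

      char LICENSE[] SEC("license") = "GPL";

      SEC("xdp")
      int xdp_mtu_guard(struct xdp_md *ctx)
      {
              __u32 mtu_len = 0;   /* receives the looked-up MTU */
              __u32 ifindex = 2;   /* illustrative egress ifindex */

              /* non-zero return: packet would exceed the device MTU */
              if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0))
                      return XDP_DROP;

              return XDP_PASS;
      }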

8) Allow the bpf_get_socket_cookie() helper to also be called from [sleepable] BPF
   tracing programs, from Florent Revest.

9) Extend the x86 JIT to pad JMPs with NOPs to help the image converge when
   otherwise too many passes would be required, from Gary Lin.

10) Verifier fixes on atomics with BPF_FETCH as well as function-by-function
    verification both related to zero-extension handling, from Ilya Leoshkevich.

11) Better kernel build integration of the resolve_btfids tool, from Jiri Olsa.

12) Batch of AF_XDP selftest cleanups and a small performance improvement
    for libbpf's xsk map redirect on newer kernels, from Björn Töpel.

13) Follow-up BPF doc and verifier improvements around atomics with
    BPF_FETCH, from Brendan Jackman.

14) Permit zero-sized data sections, e.g. if an ELF .rodata section contains
    read-only data from local variables, from Yonghong Song.

15) veth driver skb bulk-allocation for ndo_xdp_xmit, from Lorenzo Bianconi.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
17 files changed:
Makefile
include/linux/indirect_call_wrapper.h
include/linux/netdevice.h
include/net/sock.h
include/net/tcp.h
kernel/bpf/cgroup.c
kernel/bpf/verifier.c
kernel/trace/bpf_trace.c
net/core/dev.c
net/core/filter.c
net/ipv4/af_inet.c
net/ipv4/tcp.c
net/ipv4/tcp_ipv4.c
net/ipv4/udp.c
net/ipv6/af_inet6.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c

diff --combined Makefile
+++ b/Makefile
@@@ -2,7 -2,7 +2,7 @@@
  VERSION = 5
  PATCHLEVEL = 11
  SUBLEVEL = 0
 -EXTRAVERSION = -rc4
 +EXTRAVERSION = -rc7
  NAME = Kleptomaniac Octopus
  
  # *DOCUMENTATION*
@@@ -452,6 -452,7 +452,6 @@@ AWK                = aw
  INSTALLKERNEL  := installkernel
  DEPMOD                = depmod
  PERL          = perl
 -PYTHON                = python
  PYTHON3               = python3
  CHECK         = sparse
  BASH          = bash
@@@ -507,7 -508,7 +507,7 @@@ CLANG_FLAGS :
  
  export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC
  export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL
 -export PERL PYTHON PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
 +export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
  export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ ZSTD
  export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE
  
@@@ -648,8 -649,7 +648,8 @@@ ifeq ($(KBUILD_EXTMOD),
  core-y                := init/ usr/
  drivers-y     := drivers/ sound/
  drivers-$(CONFIG_SAMPLES) += samples/
 -drivers-y     += net/ virt/
 +drivers-$(CONFIG_NET) += net/
 +drivers-y     += virt/
  libs-y                := lib/
  endif # KBUILD_EXTMOD
  
@@@ -812,12 -812,10 +812,12 @@@ KBUILD_CFLAGS   += -ftrivial-auto-var-ini
  KBUILD_CFLAGS += -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang
  endif
  
 +DEBUG_CFLAGS  :=
 +
  # Workaround for GCC versions < 5.0
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61801
  ifdef CONFIG_CC_IS_GCC
 -DEBUG_CFLAGS  := $(call cc-ifversion, -lt, 0500, $(call cc-option, -fno-var-tracking-assignments))
 +DEBUG_CFLAGS  += $(call cc-ifversion, -lt, 0500, $(call cc-option, -fno-var-tracking-assignments))
  endif
  
  ifdef CONFIG_DEBUG_INFO
@@@ -950,6 -948,12 +950,6 @@@ KBUILD_CFLAGS   += $(call cc-option,-We
  # change __FILE__ to the relative path from the srctree
  KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
  
 -# ensure -fcf-protection is disabled when using retpoline as it is
 -# incompatible with -mindirect-branch=thunk-extern
 -ifdef CONFIG_RETPOLINE
 -KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
 -endif
 -
  # include additional Makefiles when needed
  include-y                     := scripts/Makefile.extrawarn
  include-$(CONFIG_KASAN)               += scripts/Makefile.kasan
@@@ -1082,6 -1086,17 +1082,17 @@@ ifdef CONFIG_STACK_VALIDATIO
    endif
  endif
  
+ PHONY += resolve_btfids_clean
+ resolve_btfids_O = $(abspath $(objtree))/tools/bpf/resolve_btfids
+ # tools/bpf/resolve_btfids directory might not exist
+ # in output directory, skip its clean in that case
+ resolve_btfids_clean:
+ ifneq ($(wildcard $(resolve_btfids_O)),)
+       $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean
+ endif
  ifdef CONFIG_BPF
  ifdef CONFIG_DEBUG_INFO_BTF
    ifeq ($(has_libelf),1)
@@@ -1491,7 -1506,7 +1502,7 @@@ vmlinuxclean
        $(Q)$(CONFIG_SHELL) $(srctree)/scripts/link-vmlinux.sh clean
        $(Q)$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) clean)
  
- clean: archclean vmlinuxclean
+ clean: archclean vmlinuxclean resolve_btfids_clean
  
  # mrproper - Delete all generated files, including .config
  #
diff --combined include/linux/indirect_call_wrapper.h
@@@ -36,7 -36,6 +36,7 @@@
  
  #define INDIRECT_CALLABLE_DECLARE(f)  f
  #define INDIRECT_CALLABLE_SCOPE
 +#define EXPORT_INDIRECT_CALLABLE(f)   EXPORT_SYMBOL(f)
  
  #else
  #define INDIRECT_CALL_1(f, f1, ...) f(__VA_ARGS__)
@@@ -45,7 -44,6 +45,7 @@@
  #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) f(__VA_ARGS__)
  #define INDIRECT_CALLABLE_DECLARE(f)
  #define INDIRECT_CALLABLE_SCOPE               static
 +#define EXPORT_INDIRECT_CALLABLE(f)
  #endif
  
  /*
  #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
  #endif
  
+ #if IS_ENABLED(CONFIG_INET)
+ #define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
+ #else
+ #define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__)
+ #endif
  #endif
diff --combined include/linux/netdevice.h
@@@ -347,7 -347,6 +347,7 @@@ struct napi_struct 
        struct list_head        dev_list;
        struct hlist_node       napi_hash_node;
        unsigned int            napi_id;
 +      struct task_struct      *thread;
  };
  
  enum {
        NAPI_STATE_NO_BUSY_POLL,        /* Do not add in napi_hash, no busy polling */
        NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_PREFER_BUSY_POLL,    /* prefer busy-polling over softirq processing*/
 +      NAPI_STATE_THREADED,            /* The poll is performed inside its own thread*/
  };
  
  enum {
        NAPIF_STATE_NO_BUSY_POLL        = BIT(NAPI_STATE_NO_BUSY_POLL),
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL    = BIT(NAPI_STATE_PREFER_BUSY_POLL),
 +      NAPIF_STATE_THREADED            = BIT(NAPI_STATE_THREADED),
  };
  
  enum gro_result {
@@@ -497,8 -494,6 +497,8 @@@ static inline bool napi_complete(struc
        return napi_complete_done(n, 0);
  }
  
 +int dev_set_threaded(struct net_device *dev, bool threaded);
 +
  /**
   *    napi_disable - prevent NAPI from scheduling
   *    @n: NAPI context
   */
  void napi_disable(struct napi_struct *n);
  
 -/**
 - *    napi_enable - enable NAPI scheduling
 - *    @n: NAPI context
 - *
 - * Resume NAPI from being scheduled on this context.
 - * Must be paired with napi_disable.
 - */
 -static inline void napi_enable(struct napi_struct *n)
 -{
 -      BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 -      smp_mb__before_atomic();
 -      clear_bit(NAPI_STATE_SCHED, &n->state);
 -      clear_bit(NAPI_STATE_NPSVC, &n->state);
 -}
 +void napi_enable(struct napi_struct *n);
  
  /**
   *    napi_synchronize - wait until NAPI is not running
@@@ -850,7 -858,6 +850,7 @@@ enum tc_setup_type 
        TC_SETUP_QDISC_ETS,
        TC_SETUP_QDISC_TBF,
        TC_SETUP_QDISC_FIFO,
 +      TC_SETUP_QDISC_HTB,
  };
  
  /* These structures hold the attributes of bpf state that are being passed
@@@ -1819,8 -1826,6 +1819,8 @@@ enum netdev_priv_flags 
   *
   *    @wol_enabled:   Wake-on-LAN is enabled
   *
 + *    @threaded:      napi threaded mode is enabled
 + *
   *    @net_notifier_list:     List of per-net netdev notifier block
   *                            that follow this device when it is moved
   *                            to another network namespace.
@@@ -1852,6 -1857,7 +1852,6 @@@ struct net_device 
        unsigned long           mem_end;
        unsigned long           mem_start;
        unsigned long           base_addr;
 -      int                     irq;
  
        /*
         *      Some hardware also needs these fields (state,dev_list,
                struct list_head lower;
        } adj_list;
  
 +      /* Read-mostly cache-line for fast-path access */
 +      unsigned int            flags;
 +      unsigned int            priv_flags;
 +      const struct net_device_ops *netdev_ops;
 +      int                     ifindex;
 +      unsigned short          gflags;
 +      unsigned short          hard_header_len;
 +
 +      /* Note : dev->mtu is often read without holding a lock.
 +       * Writers usually hold RTNL.
 +       * It is recommended to use READ_ONCE() to annotate the reads,
 +       * and to use WRITE_ONCE() to annotate the writes.
 +       */
 +      unsigned int            mtu;
 +      unsigned short          needed_headroom;
 +      unsigned short          needed_tailroom;
 +
        netdev_features_t       features;
        netdev_features_t       hw_features;
        netdev_features_t       wanted_features;
        netdev_features_t       mpls_features;
        netdev_features_t       gso_partial_features;
  
 -      int                     ifindex;
 +      unsigned int            min_mtu;
 +      unsigned int            max_mtu;
 +      unsigned short          type;
 +      unsigned char           min_header_len;
 +      unsigned char           name_assign_type;
 +
        int                     group;
  
 -      struct net_device_stats stats;
 +      struct net_device_stats stats; /* not used by modern drivers */
  
        atomic_long_t           rx_dropped;
        atomic_long_t           tx_dropped;
        const struct iw_handler_def *wireless_handlers;
        struct iw_public_data   *wireless_data;
  #endif
 -      const struct net_device_ops *netdev_ops;
        const struct ethtool_ops *ethtool_ops;
  #ifdef CONFIG_NET_L3_MASTER_DEV
        const struct l3mdev_ops *l3mdev_ops;
  
        const struct header_ops *header_ops;
  
 -      unsigned int            flags;
 -      unsigned int            priv_flags;
 -
 -      unsigned short          gflags;
 -      unsigned short          padded;
 -
        unsigned char           operstate;
        unsigned char           link_mode;
  
        unsigned char           if_port;
        unsigned char           dma;
  
 -      /* Note : dev->mtu is often read without holding a lock.
 -       * Writers usually hold RTNL.
 -       * It is recommended to use READ_ONCE() to annotate the reads,
 -       * and to use WRITE_ONCE() to annotate the writes.
 -       */
 -      unsigned int            mtu;
 -      unsigned int            min_mtu;
 -      unsigned int            max_mtu;
 -      unsigned short          type;
 -      unsigned short          hard_header_len;
 -      unsigned char           min_header_len;
 -      unsigned char           name_assign_type;
 -
 -      unsigned short          needed_headroom;
 -      unsigned short          needed_tailroom;
 -
        /* Interface address info. */
        unsigned char           perm_addr[MAX_ADDR_LEN];
        unsigned char           addr_assign_type;
        unsigned short          neigh_priv_len;
        unsigned short          dev_id;
        unsigned short          dev_port;
 +      unsigned short          padded;
 +
        spinlock_t              addr_list_lock;
 +      int                     irq;
  
        struct netdev_hw_addr_list      uc;
        struct netdev_hw_addr_list      mc;
        struct lock_class_key   *qdisc_running_key;
        bool                    proto_down;
        unsigned                wol_enabled:1;
 +      unsigned                threaded:1;
  
        struct list_head        net_notifier_list;
  
@@@ -3902,9 -3905,6 +3902,9 @@@ int dev_pre_changeaddr_notify(struct ne
                              struct netlink_ext_ack *extack);
  int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                        struct netlink_ext_ack *extack);
 +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 +                           struct netlink_ext_ack *extack);
 +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
  int dev_change_carrier(struct net_device *, bool new_carrier);
  int dev_get_phys_port_id(struct net_device *dev,
                         struct netdev_phys_item_id *ppid);
@@@ -3931,14 -3931,42 +3931,42 @@@ int xdp_umem_query(struct net_device *d
  
  int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
  int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
+ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
  bool is_skb_forwardable(const struct net_device *dev,
                        const struct sk_buff *skb);
  
+ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
+                                                const struct sk_buff *skb,
+                                                const bool check_mtu)
+ {
+       const u32 vlan_hdr_len = 4; /* VLAN_HLEN */
+       unsigned int len;
+       if (!(dev->flags & IFF_UP))
+               return false;
+       if (!check_mtu)
+               return true;
+       len = dev->mtu + dev->hard_header_len + vlan_hdr_len;
+       if (skb->len <= len)
+               return true;
+       /* if TSO is enabled, we don't care about the length as the packet
+        * could be forwarded without being segmented before
+        */
+       if (skb_is_gso(skb))
+               return true;
+       return false;
+ }
  static __always_inline int ____dev_forward_skb(struct net_device *dev,
-                                              struct sk_buff *skb)
+                                              struct sk_buff *skb,
+                                              const bool check_mtu)
  {
        if (skb_orphan_frags(skb, GFP_ATOMIC) ||
-           unlikely(!is_skb_forwardable(dev, skb))) {
+           unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) {
                atomic_long_inc(&dev->rx_dropped);
                kfree_skb(skb);
                return NET_RX_DROP;
@@@ -4339,7 -4367,6 +4367,7 @@@ static inline void netif_tx_disable(str
  
        local_bh_disable();
        cpu = smp_processor_id();
 +      spin_lock(&dev->tx_global_lock);
        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
  
                netif_tx_stop_queue(txq);
                __netif_tx_unlock(txq);
        }
 +      spin_unlock(&dev->tx_global_lock);
        local_bh_enable();
  }
  
diff --combined include/net/sock.h
@@@ -226,7 -226,7 +226,7 @@@ struct sock_common 
                struct hlist_nulls_node skc_nulls_node;
        };
        unsigned short          skc_tx_queue_mapping;
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        unsigned short          skc_rx_queue_mapping;
  #endif
        union {
@@@ -356,7 -356,7 +356,7 @@@ struct sock 
  #define sk_nulls_node         __sk_common.skc_nulls_node
  #define sk_refcnt             __sk_common.skc_refcnt
  #define sk_tx_queue_mapping   __sk_common.skc_tx_queue_mapping
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
  #define sk_rx_queue_mapping   __sk_common.skc_rx_queue_mapping
  #endif
  
@@@ -1174,6 -1174,8 +1174,8 @@@ struct proto 
  
        int                     (*backlog_rcv) (struct sock *sk,
                                                struct sk_buff *skb);
+       bool                    (*bpf_bypass_getsockopt)(int level,
+                                                        int optname);
  
        void            (*release_cb)(struct sock *sk);
  
@@@ -1350,18 -1352,14 +1352,18 @@@ sk_memory_allocated_sub(struct sock *sk
        atomic_long_sub(amt, sk->sk_prot->memory_allocated);
  }
  
 +#define SK_ALLOC_PERCPU_COUNTER_BATCH 16
 +
  static inline void sk_sockets_allocated_dec(struct sock *sk)
  {
 -      percpu_counter_dec(sk->sk_prot->sockets_allocated);
 +      percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1,
 +                               SK_ALLOC_PERCPU_COUNTER_BATCH);
  }
  
  static inline void sk_sockets_allocated_inc(struct sock *sk)
  {
 -      percpu_counter_inc(sk->sk_prot->sockets_allocated);
 +      percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1,
 +                               SK_ALLOC_PERCPU_COUNTER_BATCH);
  }
  
  static inline u64
@@@ -1838,7 -1836,7 +1840,7 @@@ static inline int sk_tx_queue_get(cons
  
  static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
  {
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        if (skb_rx_queue_recorded(skb)) {
                u16 rx_queue = skb_get_rx_queue(skb);
  
  
  static inline void sk_rx_queue_clear(struct sock *sk)
  {
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
  #endif
  }
  
 -#ifdef CONFIG_XPS
  static inline int sk_rx_queue_get(const struct sock *sk)
  {
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
                return sk->sk_rx_queue_mapping;
 +#endif
  
        return -1;
  }
 -#endif
  
  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
  {
diff --combined include/net/tcp.h
@@@ -403,6 -403,7 +403,7 @@@ __poll_t tcp_poll(struct file *file, st
                      struct poll_table_struct *wait);
  int tcp_getsockopt(struct sock *sk, int level, int optname,
                   char __user *optval, int __user *optlen);
+ bool tcp_bpf_bypass_getsockopt(int level, int optname);
  int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                   unsigned int optlen);
  void tcp_set_keepalive(struct sock *sk, int val);
@@@ -630,7 -631,6 +631,7 @@@ static inline void tcp_clear_xmit_timer
  
  unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
  unsigned int tcp_current_mss(struct sock *sk);
 +u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
  
  /* Bound MSS / TSO packet size with the half of the window */
  static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
@@@ -1431,29 -1431,12 +1432,29 @@@ void tcp_cleanup_rbuf(struct sock *sk, 
   */
  static inline bool tcp_rmem_pressure(const struct sock *sk)
  {
 -      int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
 -      int threshold = rcvbuf - (rcvbuf >> 3);
 +      int rcvbuf, threshold;
 +
 +      if (tcp_under_memory_pressure(sk))
 +              return true;
 +
 +      rcvbuf = READ_ONCE(sk->sk_rcvbuf);
 +      threshold = rcvbuf - (rcvbuf >> 3);
  
        return atomic_read(&sk->sk_rmem_alloc) > threshold;
  }
  
 +static inline bool tcp_epollin_ready(const struct sock *sk, int target)
 +{
 +      const struct tcp_sock *tp = tcp_sk(sk);
 +      int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
 +
 +      if (avail <= 0)
 +              return false;
 +
 +      return (avail >= target) || tcp_rmem_pressure(sk) ||
 +             (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss);
 +}
 +
  extern void tcp_openreq_init_rwin(struct request_sock *req,
                                  const struct sock *sk_listener,
                                  const struct dst_entry *dst);
@@@ -2078,7 -2061,7 +2079,7 @@@ void tcp_mark_skb_lost(struct sock *sk
  void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
  extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
                                u32 reo_wnd);
 -extern void tcp_rack_mark_lost(struct sock *sk);
 +extern bool tcp_rack_mark_lost(struct sock *sk);
  extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
                             u64 xmit_time);
  extern void tcp_rack_reo_timeout(struct sock *sk);
diff --combined kernel/bpf/cgroup.c
@@@ -19,7 -19,7 +19,7 @@@
  
  #include "../cgroup/cgroup-internal.h"
  
- DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);
  EXPORT_SYMBOL(cgroup_bpf_enabled_key);
  
  void cgroup_bpf_offline(struct cgroup *cgrp)
@@@ -128,7 -128,7 +128,7 @@@ static void cgroup_bpf_release(struct w
                        if (pl->link)
                                bpf_cgroup_link_auto_detach(pl->link);
                        kfree(pl);
-                       static_branch_dec(&cgroup_bpf_enabled_key);
+                       static_branch_dec(&cgroup_bpf_enabled_key[type]);
                }
                old_array = rcu_dereference_protected(
                                cgrp->bpf.effective[type],
@@@ -499,7 -499,7 +499,7 @@@ int __cgroup_bpf_attach(struct cgroup *
        if (old_prog)
                bpf_prog_put(old_prog);
        else
-               static_branch_inc(&cgroup_bpf_enabled_key);
+               static_branch_inc(&cgroup_bpf_enabled_key[type]);
        bpf_cgroup_storages_link(new_storage, cgrp, type);
        return 0;
  
@@@ -698,7 -698,7 +698,7 @@@ int __cgroup_bpf_detach(struct cgroup *
                cgrp->bpf.flags[type] = 0;
        if (old_prog)
                bpf_prog_put(old_prog);
-       static_branch_dec(&cgroup_bpf_enabled_key);
+       static_branch_dec(&cgroup_bpf_enabled_key[type]);
        return 0;
  
  cleanup:
@@@ -1055,6 -1055,8 +1055,8 @@@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_s
   * @uaddr: sockaddr struct provided by user
   * @type: The type of program to be exectuted
   * @t_ctx: Pointer to attach type specific context
+  * @flags: Pointer to u32 which contains higher bits of BPF program
+  *         return value (OR'ed together).
   *
   * socket is expected to be of type INET or INET6.
   *
  int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
                                      struct sockaddr *uaddr,
                                      enum bpf_attach_type type,
-                                     void *t_ctx)
+                                     void *t_ctx,
+                                     u32 *flags)
  {
        struct bpf_sock_addr_kern ctx = {
                .sk = sk,
        }
  
        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+       ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
+                                      BPF_PROG_RUN, flags);
  
        return ret == 1 ? 0 : -EPERM;
  }
@@@ -1298,7 -1302,8 +1302,8 @@@ static bool __cgroup_bpf_prog_array_is_
        return empty;
  }
  
- static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+                            struct bpf_sockopt_buf *buf)
  {
        if (unlikely(max_optlen < 0))
                return -EINVAL;
                max_optlen = PAGE_SIZE;
        }
  
+       if (max_optlen <= sizeof(buf->data)) {
+               /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+                * bytes avoid the cost of kzalloc.
+                */
+               ctx->optval = buf->data;
+               ctx->optval_end = ctx->optval + max_optlen;
+               return max_optlen;
+       }
        ctx->optval = kzalloc(max_optlen, GFP_USER);
        if (!ctx->optval)
                return -ENOMEM;
        return max_optlen;
  }
  
- static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+                            struct bpf_sockopt_buf *buf)
  {
+       if (ctx->optval == buf->data)
+               return;
        kfree(ctx->optval);
  }
  
+ static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+                                 struct bpf_sockopt_buf *buf)
+ {
+       return ctx->optval != buf->data;
+ }
  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
                                       int *optname, char __user *optval,
                                       int *optlen, char **kernel_optval)
  {
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+       struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = *level,
         * attached to the hook so we don't waste time allocating
         * memory and locking the socket.
         */
-       if (!cgroup_bpf_enabled ||
-           __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+       if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
                return 0;
  
        /* Allocate a bit more than the initial user buffer for
         */
        max_optlen = max_t(int, 16, *optlen);
  
-       max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+       max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;
  
                 */
                if (ctx.optlen != 0) {
                        *optlen = ctx.optlen;
-                       *kernel_optval = ctx.optval;
+                       /* We've used bpf_sockopt_kern->buf as an intermediary
+                        * storage, but the BPF program indicates that we need
+                        * to pass this data to the kernel setsockopt handler.
+                        * No way to export on-stack buf, have to allocate a
+                        * new buffer.
+                        */
+                       if (!sockopt_buf_allocated(&ctx, &buf)) {
+                               void *p = kmalloc(ctx.optlen, GFP_USER);
+                               if (!p) {
+                                       ret = -ENOMEM;
+                                       goto out;
+                               }
+                               memcpy(p, ctx.optval, ctx.optlen);
+                               *kernel_optval = p;
+                       } else {
+                               *kernel_optval = ctx.optval;
+                       }
                        /* export and don't free sockopt buf */
                        return 0;
                }
        }
  
  out:
-       sockopt_free_buf(&ctx);
+       sockopt_free_buf(&ctx, &buf);
        return ret;
  }
  
@@@ -1407,6 -1447,7 +1447,7 @@@ int __cgroup_bpf_run_filter_getsockopt(
                                       int retval)
  {
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+       struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = level,
         * attached to the hook so we don't waste time allocating
         * memory and locking the socket.
         */
-       if (!cgroup_bpf_enabled ||
-           __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+       if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
                return retval;
  
        ctx.optlen = max_optlen;
  
-       max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+       max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;
  
                        goto out;
                }
  
 +              if (ctx.optlen < 0) {
 +                      ret = -EFAULT;
 +                      goto out;
 +              }
 +
                if (copy_from_user(ctx.optval, optval,
                                   min(ctx.optlen, max_optlen)) != 0) {
                        ret = -EFAULT;
                goto out;
        }
  
 -      if (ctx.optlen > max_optlen) {
 +      if (ctx.optlen > max_optlen || ctx.optlen < 0) {
                ret = -EFAULT;
                goto out;
        }
        ret = ctx.retval;
  
  out:
-       sockopt_free_buf(&ctx);
+       sockopt_free_buf(&ctx, &buf);
        return ret;
  }
+ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+                                           int optname, void *optval,
+                                           int *optlen, int retval)
+ {
+       struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+       struct bpf_sockopt_kern ctx = {
+               .sk = sk,
+               .level = level,
+               .optname = optname,
+               .retval = retval,
+               .optlen = *optlen,
+               .optval = optval,
+               .optval_end = optval + *optlen,
+       };
+       int ret;
+       /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
+        * user data back into BPF buffer when reval != 0. This is
+        * done as an optimization to avoid extra copy, assuming
+        * kernel won't populate the data in case of an error.
+        * Here we always pass the data and memset() should
+        * be called if that data shouldn't be "exported".
+        */
+       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+                                &ctx, BPF_PROG_RUN);
+       if (!ret)
+               return -EPERM;
+       if (ctx.optlen > *optlen)
+               return -EFAULT;
+       /* BPF programs only allowed to set retval to 0, not some
+        * arbitrary value.
+        */
+       if (ctx.retval != 0 && ctx.retval != retval)
+               return -EFAULT;
+       /* BPF programs can shrink the buffer, export the modifications.
+        */
+       if (ctx.optlen != 0)
+               *optlen = ctx.optlen;
+       return ctx.retval;
+ }
  #endif
  
  static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --combined kernel/bpf/verifier.c
@@@ -228,6 -228,12 +228,12 @@@ static void bpf_map_key_store(struct bp
                             (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
  }
  
+ static bool bpf_pseudo_call(const struct bpf_insn *insn)
+ {
+       return insn->code == (BPF_JMP | BPF_CALL) &&
+              insn->src_reg == BPF_PSEUDO_CALL;
+ }
  struct bpf_call_arg_meta {
        struct bpf_map *map_ptr;
        bool raw_mode;
@@@ -1073,6 -1079,51 +1079,51 @@@ static void mark_reg_known_zero(struct 
        __mark_reg_known_zero(regs + regno);
  }
  
+ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
+ {
+       switch (reg->type) {
+       case PTR_TO_MAP_VALUE_OR_NULL: {
+               const struct bpf_map *map = reg->map_ptr;
+               if (map->inner_map_meta) {
+                       reg->type = CONST_PTR_TO_MAP;
+                       reg->map_ptr = map->inner_map_meta;
+               } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+                       reg->type = PTR_TO_XDP_SOCK;
+               } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+                          map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+                       reg->type = PTR_TO_SOCKET;
+               } else {
+                       reg->type = PTR_TO_MAP_VALUE;
+               }
+               break;
+       }
+       case PTR_TO_SOCKET_OR_NULL:
+               reg->type = PTR_TO_SOCKET;
+               break;
+       case PTR_TO_SOCK_COMMON_OR_NULL:
+               reg->type = PTR_TO_SOCK_COMMON;
+               break;
+       case PTR_TO_TCP_SOCK_OR_NULL:
+               reg->type = PTR_TO_TCP_SOCK;
+               break;
+       case PTR_TO_BTF_ID_OR_NULL:
+               reg->type = PTR_TO_BTF_ID;
+               break;
+       case PTR_TO_MEM_OR_NULL:
+               reg->type = PTR_TO_MEM;
+               break;
+       case PTR_TO_RDONLY_BUF_OR_NULL:
+               reg->type = PTR_TO_RDONLY_BUF;
+               break;
+       case PTR_TO_RDWR_BUF_OR_NULL:
+               reg->type = PTR_TO_RDWR_BUF;
+               break;
+       default:
+               WARN_ON("unknown nullable register type");
+       }
+ }
  static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
  {
        return type_is_pkt_pointer(reg->type);
@@@ -1486,9 -1537,7 +1537,7 @@@ static int check_subprogs(struct bpf_ve
  
        /* determine subprog starts. The end is one before the next starts */
        for (i = 0; i < insn_cnt; i++) {
-               if (insn[i].code != (BPF_JMP | BPF_CALL))
-                       continue;
-               if (insn[i].src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn + i))
                        continue;
                if (!env->bpf_capable) {
                        verbose(env,
@@@ -2271,12 -2320,14 +2320,14 @@@ static void save_register_state(struct 
                state->stack[spi].slot_type[i] = STACK_SPILL;
  }
  
- /* check_stack_read/write functions track spill/fill of registers,
+ /* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
   * stack boundary and alignment are checked in check_mem_access()
   */
- static int check_stack_write(struct bpf_verifier_env *env,
-                            struct bpf_func_state *state, /* func where register points to */
-                            int off, int size, int value_regno, int insn_idx)
+ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
+                                      /* stack frame we're writing to */
+                                      struct bpf_func_state *state,
+                                      int off, int size, int value_regno,
+                                      int insn_idx)
  {
        struct bpf_func_state *cur; /* state of the current function */
        int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
        return 0;
  }
  
- static int check_stack_read(struct bpf_verifier_env *env,
-                           struct bpf_func_state *reg_state /* func where register points to */,
-                           int off, int size, int value_regno)
+ /* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
+  * known to contain a variable offset.
+  * This function checks whether the write is permitted and conservatively
+  * tracks the effects of the write, considering that each stack slot in the
+  * dynamic range is potentially written to.
+  *
+  * 'off' includes 'regno->off'.
+  * 'value_regno' can be -1, meaning that an unknown value is being written to
+  * the stack.
+  *
+  * Spilled pointers in range are not marked as written because we don't know
+  * what's going to be actually written. This means that read propagation for
+  * future reads cannot be terminated by this write.
+  *
+  * For privileged programs, uninitialized stack slots are considered
+  * initialized by this write (even though we don't know exactly what offsets
+  * are going to be written to). The idea is that we don't want the verifier to
+  * reject future reads that access slots written to through variable offsets.
+  */
+ static int check_stack_write_var_off(struct bpf_verifier_env *env,
+                                    /* func where register points to */
+                                    struct bpf_func_state *state,
+                                    int ptr_regno, int off, int size,
+                                    int value_regno, int insn_idx)
+ {
+       struct bpf_func_state *cur; /* state of the current function */
+       int min_off, max_off;
+       int i, err;
+       struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+       bool writing_zero = false;
+       /* set if the fact that we're writing a zero is used to let any
+        * stack slots remain STACK_ZERO
+        */
+       bool zero_used = false;
+       cur = env->cur_state->frame[env->cur_state->curframe];
+       ptr_reg = &cur->regs[ptr_regno];
+       min_off = ptr_reg->smin_value + off;
+       max_off = ptr_reg->smax_value + off + size;
+       if (value_regno >= 0)
+               value_reg = &cur->regs[value_regno];
+       if (value_reg && register_is_null(value_reg))
+               writing_zero = true;
+       err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
+                                state->acquired_refs, true);
+       if (err)
+               return err;
+       /* Variable offset writes destroy any spilled pointers in range. */
+       for (i = min_off; i < max_off; i++) {
+               u8 new_type, *stype;
+               int slot, spi;
+               slot = -i - 1;
+               spi = slot / BPF_REG_SIZE;
+               stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+               if (!env->allow_ptr_leaks
+                               && *stype != NOT_INIT
+                               && *stype != SCALAR_VALUE) {
+                       /* Reject the write if there's are spilled pointers in
+                        * range. If we didn't reject here, the ptr status
+                        * would be erased below (even though not all slots are
+                        * actually overwritten), possibly opening the door to
+                        * leaks.
+                        */
+                       verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
+                               insn_idx, i);
+                       return -EINVAL;
+               }
+               /* Erase all spilled pointers. */
+               state->stack[spi].spilled_ptr.type = NOT_INIT;
+               /* Update the slot type. */
+               new_type = STACK_MISC;
+               if (writing_zero && *stype == STACK_ZERO) {
+                       new_type = STACK_ZERO;
+                       zero_used = true;
+               }
+               /* If the slot is STACK_INVALID, we check whether it's OK to
+                * pretend that it will be initialized by this write. The slot
+                * might not actually be written to, and so if we mark it as
+                * initialized future reads might leak uninitialized memory.
+                * For privileged programs, we will accept such reads to slots
+                * that may or may not be written because, if we're reject
+                * them, the error would be too confusing.
+                */
+               if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+                       verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
+                                       insn_idx, i);
+                       return -EINVAL;
+               }
+               *stype = new_type;
+       }
+       if (zero_used) {
+               /* backtracking doesn't work for STACK_ZERO yet. */
+               err = mark_chain_precision(env, value_regno);
+               if (err)
+                       return err;
+       }
+       return 0;
+ }
+ /* When register 'dst_regno' is assigned some values from stack[min_off,
+  * max_off), we set the register's type according to the types of the
+  * respective stack slots. If all the stack values are known to be zeros, then
+  * so is the destination reg. Otherwise, the register is considered to be
+  * SCALAR. This function does not deal with register filling; the caller must
+  * ensure that all spilled registers in the stack range have been marked as
+  * read.
+  */
+ static void mark_reg_stack_read(struct bpf_verifier_env *env,
+                               /* func where src register points to */
+                               struct bpf_func_state *ptr_state,
+                               int min_off, int max_off, int dst_regno)
+ {
+       struct bpf_verifier_state *vstate = env->cur_state;
+       struct bpf_func_state *state = vstate->frame[vstate->curframe];
+       int i, slot, spi;
+       u8 *stype;
+       int zeros = 0;
+       for (i = min_off; i < max_off; i++) {
+               slot = -i - 1;
+               spi = slot / BPF_REG_SIZE;
+               stype = ptr_state->stack[spi].slot_type;
+               if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
+                       break;
+               zeros++;
+       }
+       if (zeros == max_off - min_off) {
+               /* any access_size read into register is zero extended,
+                * so the whole register == const_zero
+                */
+               __mark_reg_const_zero(&state->regs[dst_regno]);
+               /* backtracking doesn't support STACK_ZERO yet,
+                * so mark it precise here, so that later
+                * backtracking can stop here.
+                * Backtracking may not need this if this register
+                * doesn't participate in pointer adjustment.
+                * Forward propagation of precise flag is not
+                * necessary either. This mark is only to stop
+                * backtracking. Any register that contributed
+                * to const 0 was marked precise before spill.
+                */
+               state->regs[dst_regno].precise = true;
+       } else {
+               /* have read misc data from the stack */
+               mark_reg_unknown(env, state->regs, dst_regno);
+       }
+       state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
+ }
+ /* Read the stack at 'off' and put the results into the register indicated by
+  * 'dst_regno'. It handles reg filling if the addressed stack slot is a
+  * spilled reg.
+  *
+  * 'dst_regno' can be -1, meaning that the read value is not going to a
+  * register.
+  *
+  * The access is assumed to be within the current stack bounds.
+  */
+ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
+                                     /* func where src register points to */
+                                     struct bpf_func_state *reg_state,
+                                     int off, int size, int dst_regno)
  {
        struct bpf_verifier_state *vstate = env->cur_state;
        struct bpf_func_state *state = vstate->frame[vstate->curframe];
        struct bpf_reg_state *reg;
        u8 *stype;
  
-       if (reg_state->allocated_stack <= slot) {
-               verbose(env, "invalid read from stack off %d+0 size %d\n",
-                       off, size);
-               return -EACCES;
-       }
        stype = reg_state->stack[spi].slot_type;
        reg = &reg_state->stack[spi].spilled_ptr;
  
                                verbose(env, "invalid size of register fill\n");
                                return -EACCES;
                        }
-                       if (value_regno >= 0) {
-                               mark_reg_unknown(env, state->regs, value_regno);
-                               state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+                       if (dst_regno >= 0) {
+                               mark_reg_unknown(env, state->regs, dst_regno);
+                               state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
                        }
                        mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
                        return 0;
                        }
                }
  
-               if (value_regno >= 0) {
+               if (dst_regno >= 0) {
                        /* restore register state from stack */
-                       state->regs[value_regno] = *reg;
+                       state->regs[dst_regno] = *reg;
                        /* mark reg as written since spilled pointer state likely
                         * has its liveness marks cleared by is_state_visited()
                         * which resets stack/reg liveness for state transitions
                         */
-                       state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+                       state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
                } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
-                       /* If value_regno==-1, the caller is asking us whether
+                       /* If dst_regno==-1, the caller is asking us whether
                         * it is acceptable to use this value as a SCALAR_VALUE
                         * (e.g. for XADD).
                         * We must not allow unprivileged callers to do that
                }
                mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
        } else {
-               int zeros = 0;
+               u8 type;
  
                for (i = 0; i < size; i++) {
-                       if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+                       type = stype[(slot - i) % BPF_REG_SIZE];
+                       if (type == STACK_MISC)
                                continue;
-                       if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
-                               zeros++;
+                       if (type == STACK_ZERO)
                                continue;
-                       }
                        verbose(env, "invalid read from stack off %d+%d size %d\n",
                                off, i, size);
                        return -EACCES;
                }
                mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
-               if (value_regno >= 0) {
-                       if (zeros == size) {
-                               /* any size read into register is zero extended,
-                                * so the whole register == const_zero
-                                */
-                               __mark_reg_const_zero(&state->regs[value_regno]);
-                               /* backtracking doesn't support STACK_ZERO yet,
-                                * so mark it precise here, so that later
-                                * backtracking can stop here.
-                                * Backtracking may not need this if this register
-                                * doesn't participate in pointer adjustment.
-                                * Forward propagation of precise flag is not
-                                * necessary either. This mark is only to stop
-                                * backtracking. Any register that contributed
-                                * to const 0 was marked precise before spill.
-                                */
-                               state->regs[value_regno].precise = true;
-                       } else {
-                               /* have read misc data from the stack */
-                               mark_reg_unknown(env, state->regs, value_regno);
-                       }
-                       state->regs[value_regno].live |= REG_LIVE_WRITTEN;
-               }
+               if (dst_regno >= 0)
+                       mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
        }
        return 0;
  }
  
- static int check_stack_access(struct bpf_verifier_env *env,
-                             const struct bpf_reg_state *reg,
-                             int off, int size)
+ enum stack_access_src {
+       ACCESS_DIRECT = 1,  /* the access is performed by an instruction */
+       ACCESS_HELPER = 2,  /* the access is performed by a helper */
+ };
+ static int check_stack_range_initialized(struct bpf_verifier_env *env,
+                                        int regno, int off, int access_size,
+                                        bool zero_size_allowed,
+                                        enum stack_access_src type,
+                                        struct bpf_call_arg_meta *meta);
+ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
+ {
+       return cur_regs(env) + regno;
+ }
+ /* Read the stack at 'ptr_regno + off' and put the result into the register
+  * 'dst_regno'.
+  * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
+  * but not its variable offset.
+  * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
+  *
+  * As opposed to check_stack_read_fixed_off, this function doesn't deal with
+  * filling registers (i.e. reads of spilled register cannot be detected when
+  * the offset is not fixed). We conservatively mark 'dst_regno' as containing
+  * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
+  * offset; for a fixed offset check_stack_read_fixed_off should be used
+  * instead.
+  */
+ static int check_stack_read_var_off(struct bpf_verifier_env *env,
+                                   int ptr_regno, int off, int size, int dst_regno)
  {
-       /* Stack accesses must be at a fixed offset, so that we
-        * can determine what type of data were returned. See
-        * check_stack_read().
+       /* The state of the source register. */
+       struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+       struct bpf_func_state *ptr_state = func(env, reg);
+       int err;
+       int min_off, max_off;
+       /* Note that we pass a NULL meta, so raw access will not be permitted.
         */
-       if (!tnum_is_const(reg->var_off)) {
+       err = check_stack_range_initialized(env, ptr_regno, off, size,
+                                           false, ACCESS_DIRECT, NULL);
+       if (err)
+               return err;
+       min_off = reg->smin_value + off;
+       max_off = reg->smax_value + off;
+       mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+       return 0;
+ }
+ /* check_stack_read dispatches to check_stack_read_fixed_off or
+  * check_stack_read_var_off.
+  *
+  * The caller must ensure that the offset falls within the allocated stack
+  * bounds.
+  *
+  * 'dst_regno' is a register which will receive the value from the stack. It
+  * can be -1, meaning that the read value is not going to a register.
+  */
+ static int check_stack_read(struct bpf_verifier_env *env,
+                           int ptr_regno, int off, int size,
+                           int dst_regno)
+ {
+       struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+       struct bpf_func_state *state = func(env, reg);
+       int err;
+       /* Some accesses are only permitted with a static offset. */
+       bool var_off = !tnum_is_const(reg->var_off);
+       /* The offset is required to be static when reads don't go to a
+        * register, in order to not leak pointers (see
+        * check_stack_read_fixed_off).
+        */
+       if (dst_regno < 0 && var_off) {
                char tn_buf[48];
  
                tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-               verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
+               verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
                        tn_buf, off, size);
                return -EACCES;
        }
+       /* Variable offset is prohibited for unprivileged mode for simplicity
+        * since it requires corresponding support in Spectre masking for stack
+        * ALU. See also retrieve_ptr_limit().
+        */
+       if (!env->bypass_spec_v1 && var_off) {
+               char tn_buf[48];
  
-       if (off >= 0 || off < -MAX_BPF_STACK) {
-               verbose(env, "invalid stack off=%d size=%d\n", off, size);
+               tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+               verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+                               ptr_regno, tn_buf);
                return -EACCES;
        }
  
-       return 0;
+       if (!var_off) {
+               off += reg->var_off.value;
+               err = check_stack_read_fixed_off(env, state, off, size,
+                                                dst_regno);
+       } else {
+               /* Variable offset stack reads need more conservative handling
+                * than fixed offset ones. Note that dst_regno >= 0 on this
+                * branch.
+                */
+               err = check_stack_read_var_off(env, ptr_regno, off, size,
+                                              dst_regno);
+       }
+       return err;
+ }
+ /* check_stack_write dispatches to check_stack_write_fixed_off or
+  * check_stack_write_var_off.
+  *
+  * 'ptr_regno' is the register used as a pointer into the stack.
+  * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
+  * 'value_regno' is the register whose value we're writing to the stack. It can
+  * be -1, meaning that we're not writing from a register.
+  *
+  * The caller must ensure that the offset falls within the maximum stack size.
+  */
+ static int check_stack_write(struct bpf_verifier_env *env,
+                            int ptr_regno, int off, int size,
+                            int value_regno, int insn_idx)
+ {
+       struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+       struct bpf_func_state *state = func(env, reg);
+       int err;
+       if (tnum_is_const(reg->var_off)) {
+               off += reg->var_off.value;
+               err = check_stack_write_fixed_off(env, state, off, size,
+                                                 value_regno, insn_idx);
+       } else {
+               /* Variable offset stack reads need more conservative handling
+                * than fixed offset ones.
+                */
+               err = check_stack_write_var_off(env, state,
+                                               ptr_regno, off, size,
+                                               value_regno, insn_idx);
+       }
+       return err;
  }
  
  static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
@@@ -2858,11 -3167,6 +3167,6 @@@ static int check_sock_access(struct bpf
        return -EACCES;
  }
  
- static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
- {
-       return cur_regs(env) + regno;
- }
  static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
  {
        return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
@@@ -2981,8 -3285,8 +3285,8 @@@ static int check_ptr_alignment(struct b
                break;
        case PTR_TO_STACK:
                pointer_desc = "stack ";
-               /* The stack spill tracking logic in check_stack_write()
-                * and check_stack_read() relies on stack accesses being
+               /* The stack spill tracking logic in check_stack_write_fixed_off()
+                * and check_stack_read_fixed_off() relies on stack accesses being
                 * aligned.
                 */
                strict = true;
@@@ -3074,9 -3378,7 +3378,7 @@@ process_func
  continue_func:
        subprog_end = subprog[idx + 1].start;
        for (; i < subprog_end; i++) {
-               if (insn[i].code != (BPF_JMP | BPF_CALL))
-                       continue;
-               if (insn[i].src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn + i))
                        continue;
                /* remember insn and function to return to */
                ret_insn[frame] = i + 1;
@@@ -3400,6 -3702,91 +3702,91 @@@ static int check_ptr_to_map_access(stru
        return 0;
  }
  
+ /* Check that the stack access at the given offset is within bounds. The
+  * maximum valid offset is -1.
+  *
+  * The minimum valid offset is -MAX_BPF_STACK for writes, and
+  * -state->allocated_stack for reads.
+  */
+ static int check_stack_slot_within_bounds(int off,
+                                         struct bpf_func_state *state,
+                                         enum bpf_access_type t)
+ {
+       int min_valid_off;
+       if (t == BPF_WRITE)
+               min_valid_off = -MAX_BPF_STACK;
+       else
+               min_valid_off = -state->allocated_stack;
+       if (off < min_valid_off || off > -1)
+               return -EACCES;
+       return 0;
+ }
+
+ /* Check that the stack access at 'regno + off' falls within the maximum stack
+  * bounds.
+  *
+  * 'off' includes 'regno->off', but not its dynamic part (if any).
+  */
+ static int check_stack_access_within_bounds(
+               struct bpf_verifier_env *env,
+               int regno, int off, int access_size,
+               enum stack_access_src src, enum bpf_access_type type)
+ {
+       struct bpf_reg_state *regs = cur_regs(env);
+       struct bpf_reg_state *reg = regs + regno;
+       struct bpf_func_state *state = func(env, reg);
+       int min_off, max_off;
+       int err;
+       char *err_extra;
+       if (src == ACCESS_HELPER)
+               /* We don't know if helpers are reading or writing (or both). */
+               err_extra = " indirect access to";
+       else if (type == BPF_READ)
+               err_extra = " read from";
+       else
+               err_extra = " write to";
+       if (tnum_is_const(reg->var_off)) {
+               min_off = reg->var_off.value + off;
+               if (access_size > 0)
+                       max_off = min_off + access_size - 1;
+               else
+                       max_off = min_off;
+       } else {
+               if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+                   reg->smin_value <= -BPF_MAX_VAR_OFF) {
+                       verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
+                               err_extra, regno);
+                       return -EACCES;
+               }
+               min_off = reg->smin_value + off;
+               if (access_size > 0)
+                       max_off = reg->smax_value + off + access_size - 1;
+               else
+                       max_off = min_off;
+       }
+       err = check_stack_slot_within_bounds(min_off, state, type);
+       if (!err)
+               err = check_stack_slot_within_bounds(max_off, state, type);
+       if (err) {
+               if (tnum_is_const(reg->var_off)) {
+                       verbose(env, "invalid%s stack R%d off=%d size=%d\n",
+                               err_extra, regno, off, access_size);
+               } else {
+                       char tn_buf[48];
+                       tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+                       verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
+                               err_extra, regno, tn_buf, access_size);
+               }
+       }
+       return err;
+ }
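
  A worked example for the variable-offset branch above (numbers invented for
  illustration): for an 8-byte access at off = 0 through a stack pointer whose
  offset is known to lie in [-64, -16], the code computes

          min_off = reg->smin_value + off;                    /* -64 + 0         = -64 */
          max_off = reg->smax_value + off + access_size - 1;  /* -16 + 0 + 8 - 1 =  -9 */

  and both extremes must then pass check_stack_slot_within_bounds() for the
  access to be allowed.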
  
  /* check whether memory at (regno + off) is accessible for t = (read | write)
   * if t==write, value_regno is a register which value is stored into memory
@@@ -3515,8 -3902,8 +3902,8 @@@ static int check_mem_access(struct bpf_
                }
  
        } else if (reg->type == PTR_TO_STACK) {
-               off += reg->var_off.value;
-               err = check_stack_access(env, reg, off, size);
+               /* Basic bounds checks. */
+               err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
                if (err)
                        return err;
  
                if (err)
                        return err;
  
-               if (t == BPF_WRITE)
-                       err = check_stack_write(env, state, off, size,
-                                               value_regno, insn_idx);
-               else
-                       err = check_stack_read(env, state, off, size,
+               if (t == BPF_READ)
+                       err = check_stack_read(env, regno, off, size,
                                               value_regno);
+               else
+                       err = check_stack_write(env, regno, off, size,
+                                               value_regno, insn_idx);
        } else if (reg_is_pkt_pointer(reg)) {
                if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
                        verbose(env, "cannot write into packet\n");
@@@ -3665,9 -4052,26 +4052,26 @@@ static int check_atomic(struct bpf_veri
                return -EACCES;
        }
  
+       if (insn->imm & BPF_FETCH) {
+               if (insn->imm == BPF_CMPXCHG)
+                       load_reg = BPF_REG_0;
+               else
+                       load_reg = insn->src_reg;
+               /* check and record load of old value */
+               err = check_reg_arg(env, load_reg, DST_OP);
+               if (err)
+                       return err;
+       } else {
+               /* This instruction accesses a memory location but doesn't
+                * actually load it into a register.
+                */
+               load_reg = -1;
+       }
        /* check whether we can read the memory */
        err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
-                              BPF_SIZE(insn->code), BPF_READ, -1, true);
+                              BPF_SIZE(insn->code), BPF_READ, load_reg, true);
        if (err)
                return err;
  
        if (err)
                return err;
  
-       if (!(insn->imm & BPF_FETCH))
-               return 0;
-       if (insn->imm == BPF_CMPXCHG)
-               load_reg = BPF_REG_0;
-       else
-               load_reg = insn->src_reg;
-       /* check and record load of old value */
-       err = check_reg_arg(env, load_reg, DST_OP);
-       if (err)
-               return err;
        return 0;
  }
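
  To make the load_reg selection above concrete, a hedged sketch using the
  BPF_ATOMIC_OP() insn macro (illustrative instructions, not taken from this
  diff):

          /* fetch-and-add: the old value is loaded into src_reg (R1 here) */
          BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8),
          /* compare-and-exchange: the old value always lands in R0 */
          BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -8),
          /* plain atomic add without BPF_FETCH defines no register (load_reg = -1) */
          BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_1, -8),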
  
- static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
-                                 int off, int access_size,
-                                 bool zero_size_allowed)
+ /* When register 'regno' is used to read the stack (either directly or through
+  * a helper function) make sure that it's within stack boundary and, depending
+  * on the access type, that all elements of the stack are initialized.
+  *
+  * 'off' includes 'regno->off', but not its dynamic part (if any).
+  *
+  * All registers that have been spilled on the stack in the slots within the
+  * read offsets are marked as read.
+  */
+ static int check_stack_range_initialized(
+               struct bpf_verifier_env *env, int regno, int off,
+               int access_size, bool zero_size_allowed,
+               enum stack_access_src type, struct bpf_call_arg_meta *meta)
  {
        struct bpf_reg_state *reg = reg_state(env, regno);
+       struct bpf_func_state *state = func(env, reg);
+       int err, min_off, max_off, i, j, slot, spi;
+       char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
+       enum bpf_access_type bounds_check_type;
+       /* Some accesses can write anything into the stack, others are
+        * read-only.
+        */
+       bool clobber = false;
  
-       if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
-           access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
-               if (tnum_is_const(reg->var_off)) {
-                       verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
-                               regno, off, access_size);
-               } else {
-                       char tn_buf[48];
-                       tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
-                               regno, tn_buf, access_size);
-               }
+       if (access_size == 0 && !zero_size_allowed) {
+               verbose(env, "invalid zero-sized read\n");
                return -EACCES;
        }
-       return 0;
- }
  
- /* when register 'regno' is passed into function that will read 'access_size'
-  * bytes from that pointer, make sure that it's within stack boundary
-  * and all elements of stack are initialized.
-  * Unlike most pointer bounds-checking functions, this one doesn't take an
-  * 'off' argument, so it has to add in reg->off itself.
-  */
- static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
-                               int access_size, bool zero_size_allowed,
-                               struct bpf_call_arg_meta *meta)
- {
-       struct bpf_reg_state *reg = reg_state(env, regno);
-       struct bpf_func_state *state = func(env, reg);
-       int err, min_off, max_off, i, j, slot, spi;
+       if (type == ACCESS_HELPER) {
+               /* The bounds checks for writes are more permissive than for
+                * reads. However, if raw_mode is not set, we'll do extra
+                * checks below.
+                */
+               bounds_check_type = BPF_WRITE;
+               clobber = true;
+       } else {
+               bounds_check_type = BPF_READ;
+       }
+       err = check_stack_access_within_bounds(env, regno, off, access_size,
+                                              type, bounds_check_type);
+       if (err)
+               return err;
  
        if (tnum_is_const(reg->var_off)) {
-               min_off = max_off = reg->var_off.value + reg->off;
-               err = __check_stack_boundary(env, regno, min_off, access_size,
-                                            zero_size_allowed);
-               if (err)
-                       return err;
+               min_off = max_off = reg->var_off.value + off;
        } else {
                /* Variable offset is prohibited for unprivileged mode for
                 * simplicity since it requires corresponding support in
                        char tn_buf[48];
  
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
-                               regno, tn_buf);
+                       verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
+                               regno, err_extra, tn_buf);
                        return -EACCES;
                }
                /* Only initialized buffer on stack is allowed to be accessed
                if (meta && meta->raw_mode)
                        meta = NULL;
  
-               if (reg->smax_value >= BPF_MAX_VAR_OFF ||
-                   reg->smax_value <= -BPF_MAX_VAR_OFF) {
-                       verbose(env, "R%d unbounded indirect variable offset stack access\n",
-                               regno);
-                       return -EACCES;
-               }
-               min_off = reg->smin_value + reg->off;
-               max_off = reg->smax_value + reg->off;
-               err = __check_stack_boundary(env, regno, min_off, access_size,
-                                            zero_size_allowed);
-               if (err) {
-                       verbose(env, "R%d min value is outside of stack bound\n",
-                               regno);
-                       return err;
-               }
-               err = __check_stack_boundary(env, regno, max_off, access_size,
-                                            zero_size_allowed);
-               if (err) {
-                       verbose(env, "R%d max value is outside of stack bound\n",
-                               regno);
-                       return err;
-               }
+               min_off = reg->smin_value + off;
+               max_off = reg->smax_value + off;
        }
  
        if (meta && meta->raw_mode) {
                if (*stype == STACK_MISC)
                        goto mark;
                if (*stype == STACK_ZERO) {
-                       /* helper can write anything into the stack */
-                       *stype = STACK_MISC;
+                       if (clobber) {
+                               /* helper can write anything into the stack */
+                               *stype = STACK_MISC;
+                       }
                        goto mark;
                }
  
                if (state->stack[spi].slot_type[0] == STACK_SPILL &&
                    (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
                     env->allow_ptr_leaks)) {
-                       __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
-                       for (j = 0; j < BPF_REG_SIZE; j++)
-                               state->stack[spi].slot_type[j] = STACK_MISC;
+                       if (clobber) {
+                               __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
+                               for (j = 0; j < BPF_REG_SIZE; j++)
+                                       state->stack[spi].slot_type[j] = STACK_MISC;
+                       }
                        goto mark;
                }
  
  err:
                if (tnum_is_const(reg->var_off)) {
-                       verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
-                               min_off, i - min_off, access_size);
+                       verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
+                               err_extra, regno, min_off, i - min_off, access_size);
                } else {
                        char tn_buf[48];
  
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
-                               tn_buf, i - min_off, access_size);
+                       verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
+                               err_extra, regno, tn_buf, i - min_off, access_size);
                }
                return -EACCES;
  mark:
@@@ -3876,8 -4255,10 +4255,10 @@@ static int check_helper_mem_access(stru
                                           "rdwr",
                                           &env->prog->aux->max_rdwr_access);
        case PTR_TO_STACK:
-               return check_stack_boundary(env, regno, access_size,
-                                           zero_size_allowed, meta);
+               return check_stack_range_initialized(
+                               env,
+                               regno, reg->off, access_size,
+                               zero_size_allowed, ACCESS_HELPER, meta);
        default: /* scalar_value or invalid ptr */
                /* Allow zero-byte read from NULL, regardless of pointer type */
                if (zero_size_allowed && access_size == 0 &&
        }
  }
  
+ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+                  u32 regno, u32 mem_size)
+ {
+       if (register_is_null(reg))
+               return 0;
+       if (reg_type_may_be_null(reg->type)) {
+               /* Assuming that the register contains a value, check if the memory
+                * access is safe. Temporarily save and restore the register's state as
+                * the conversion shouldn't be visible to a caller.
+                */
+               const struct bpf_reg_state saved_reg = *reg;
+               int rv;
+               mark_ptr_not_null_reg(reg);
+               rv = check_helper_mem_access(env, regno, mem_size, true, NULL);
+               *reg = saved_reg;
+               return rv;
+       }
+       return check_helper_mem_access(env, regno, mem_size, true, NULL);
+ }
+
  /* Implementation details:
   * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
   * Two bpf_map_lookups (even with the same key) will have different reg->id.
@@@ -4875,8 -5279,9 +5279,9 @@@ static int check_func_call(struct bpf_v
                                        subprog);
                        clear_caller_saved_regs(env, caller->regs);
  
-                       /* All global functions return SCALAR_VALUE */
+                       /* All global functions return a 64-bit SCALAR_VALUE */
                        mark_reg_unknown(env, caller->regs, BPF_REG_0);
+                       caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
  
                        /* continue with next insn after call */
                        return 0;
@@@ -5541,6 -5946,41 +5946,41 @@@ do_sim
        return !ret ? -EFAULT : 0;
  }
  
+ /* check that stack access falls within stack limits and that 'reg' doesn't
+  * have a variable offset.
+  *
+  * Variable offset is prohibited for unprivileged mode for simplicity since it
+  * requires corresponding support in Spectre masking for stack ALU.  See also
+  * retrieve_ptr_limit().
+  *
+  * 'off' includes 'reg->off'.
+  */
+ static int check_stack_access_for_ptr_arithmetic(
+                               struct bpf_verifier_env *env,
+                               int regno,
+                               const struct bpf_reg_state *reg,
+                               int off)
+ {
+       if (!tnum_is_const(reg->var_off)) {
+               char tn_buf[48];
+               tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+               verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
+                       regno, tn_buf, off);
+               return -EACCES;
+       }
+       if (off >= 0 || off < -MAX_BPF_STACK) {
+               verbose(env, "R%d stack pointer arithmetic goes out of range, "
+                       "prohibited for !root; off=%d\n", regno, off);
+               return -EACCES;
+       }
+       return 0;
+ }
+
  /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
   * Caller should also handle BPF_MOV case separately.
   * If we return -EACCES, caller may want to try again treating pointer as a
@@@ -5784,10 -6224,9 +6224,9 @@@ static int adjust_ptr_min_max_vals(stru
                                "prohibited for !root\n", dst);
                        return -EACCES;
                } else if (dst_reg->type == PTR_TO_STACK &&
-                          check_stack_access(env, dst_reg, dst_reg->off +
-                                             dst_reg->var_off.value, 1)) {
-                       verbose(env, "R%d stack pointer arithmetic goes out of range, "
-                               "prohibited for !root\n", dst);
+                          check_stack_access_for_ptr_arithmetic(
+                                  env, dst, dst_reg, dst_reg->off +
+                                  dst_reg->var_off.value)) {
                        return -EACCES;
                }
        }
@@@ -6266,7 -6705,7 +6705,7 @@@ static void scalar32_min_max_rsh(struc
         * 3) the signed bounds cross zero, so they tell us nothing
         *    about the result
         * If the value in dst_reg is known nonnegative, then again the
-        * unsigned bounts capture the signed bounds.
+        * unsigned bounds capture the signed bounds.
         * Thus, in all cases it suffices to blow away our signed bounds
         * and rely on inferring new ones from the unsigned bounds and
         * var_off of the result.
@@@ -6297,7 -6736,7 +6736,7 @@@ static void scalar_min_max_rsh(struct b
         * 3) the signed bounds cross zero, so they tell us nothing
         *    about the result
         * If the value in dst_reg is known nonnegative, then again the
-        * unsigned bounts capture the signed bounds.
+        * unsigned bounds capture the signed bounds.
         * Thus, in all cases it suffices to blow away our signed bounds
         * and rely on inferring new ones from the unsigned bounds and
         * var_off of the result.
@@@ -6918,7 -7357,7 +7357,7 @@@ static int is_branch32_taken(struct bpf
        case BPF_JSGT:
                if (reg->s32_min_value > sval)
                        return 1;
 -              else if (reg->s32_max_value < sval)
 +              else if (reg->s32_max_value <= sval)
                        return 0;
                break;
        case BPF_JLT:
@@@ -6991,7 -7430,7 +7430,7 @@@ static int is_branch64_taken(struct bpf
        case BPF_JSGT:
                if (reg->smin_value > sval)
                        return 1;
 -              else if (reg->smax_value < sval)
 +              else if (reg->smax_value <= sval)
                        return 0;
                break;
        case BPF_JLT:
@@@ -7367,43 -7806,19 +7806,19 @@@ static void mark_ptr_or_null_reg(struc
                }
                if (is_null) {
                        reg->type = SCALAR_VALUE;
-               } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-                       const struct bpf_map *map = reg->map_ptr;
-                       if (map->inner_map_meta) {
-                               reg->type = CONST_PTR_TO_MAP;
-                               reg->map_ptr = map->inner_map_meta;
-                       } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
-                               reg->type = PTR_TO_XDP_SOCK;
-                       } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
-                                  map->map_type == BPF_MAP_TYPE_SOCKHASH) {
-                               reg->type = PTR_TO_SOCKET;
-                       } else {
-                               reg->type = PTR_TO_MAP_VALUE;
-                       }
-               } else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
-                       reg->type = PTR_TO_SOCKET;
-               } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
-                       reg->type = PTR_TO_SOCK_COMMON;
-               } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
-                       reg->type = PTR_TO_TCP_SOCK;
-               } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
-                       reg->type = PTR_TO_BTF_ID;
-               } else if (reg->type == PTR_TO_MEM_OR_NULL) {
-                       reg->type = PTR_TO_MEM;
-               } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) {
-                       reg->type = PTR_TO_RDONLY_BUF;
-               } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) {
-                       reg->type = PTR_TO_RDWR_BUF;
-               }
-               if (is_null) {
                        /* We don't need id and ref_obj_id from this point
                         * onwards anymore, thus we should better reset it,
                         * so that state pruning has chances to take effect.
                         */
                        reg->id = 0;
                        reg->ref_obj_id = 0;
-               } else if (!reg_may_point_to_spin_lock(reg)) {
+                       return;
+               }
+               mark_ptr_not_null_reg(reg);
+               if (!reg_may_point_to_spin_lock(reg)) {
                        /* For not-NULL ptr, reg->ref_obj_id will be reset
                         * in release_reg_references().
                         *
@@@ -7986,6 -8401,9 +8401,9 @@@ static int check_return_code(struct bpf
                    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
                    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
                        range = tnum_range(1, 1);
+               if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
+                       range = tnum_range(0, 3);
                break;
        case BPF_PROG_TYPE_CGROUP_SKB:
                if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
@@@ -8631,11 -9049,7 +9049,11 @@@ static bool range_within(struct bpf_reg
        return old->umin_value <= cur->umin_value &&
               old->umax_value >= cur->umax_value &&
               old->smin_value <= cur->smin_value &&
 -             old->smax_value >= cur->smax_value;
 +             old->smax_value >= cur->smax_value &&
 +             old->u32_min_value <= cur->u32_min_value &&
 +             old->u32_max_value >= cur->u32_max_value &&
 +             old->s32_min_value <= cur->s32_min_value &&
 +             old->s32_max_value >= cur->s32_max_value;
  }
  
  /* Maximum number of register states that can exist at once */
@@@ -10015,15 -10429,22 +10433,22 @@@ static int check_map_prog_compatibility
                case BPF_MAP_TYPE_HASH:
                case BPF_MAP_TYPE_LRU_HASH:
                case BPF_MAP_TYPE_ARRAY:
+               case BPF_MAP_TYPE_PERCPU_HASH:
+               case BPF_MAP_TYPE_PERCPU_ARRAY:
+               case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+               case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+               case BPF_MAP_TYPE_HASH_OF_MAPS:
                        if (!is_preallocated_map(map)) {
                                verbose(env,
-                                       "Sleepable programs can only use preallocated hash maps\n");
+                                       "Sleepable programs can only use preallocated maps\n");
                                return -EINVAL;
                        }
                        break;
+               case BPF_MAP_TYPE_RINGBUF:
+                       break;
                default:
                        verbose(env,
-                               "Sleepable programs can only use array and hash maps\n");
+                               "Sleepable programs can only use array, hash, and ringbuf maps\n");
                        return -EINVAL;
                }
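
  A minimal sketch of what the relaxed map-type check permits (illustrative
  program; the section and attach point are examples, not taken from this
  diff): a sleepable fentry program pushing records into a BPF ring buffer.

          #include <vmlinux.h>
          #include <bpf/bpf_helpers.h>
          #include <bpf/bpf_tracing.h>

          struct {
                  __uint(type, BPF_MAP_TYPE_RINGBUF);
                  __uint(max_entries, 4096);
          } rb SEC(".maps");

          SEC("fentry.s/do_unlinkat")             /* ".s" marks the program sleepable */
          int BPF_PROG(trace_unlink, int dfd, struct filename *name)
          {
                  int *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);

                  if (!e)
                          return 0;
                  *e = dfd;
                  bpf_ringbuf_submit(e, 0);
                  return 0;
          }

          char LICENSE[] SEC("license") = "GPL";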
  
@@@ -10581,6 -11002,7 +11006,7 @@@ static int opt_subreg_zext_lo32_rnd_hi3
        for (i = 0; i < len; i++) {
                int adj_idx = i + delta;
                struct bpf_insn insn;
+               u8 load_reg;
  
                insn = insns[adj_idx];
                if (!aux[adj_idx].zext_dst) {
                if (!bpf_jit_needs_zext())
                        continue;
  
+               /* zext_dst means that we want to zero-extend whatever register
+                * the insn defines, which is dst_reg most of the time, with
+                * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH.
+                */
+               if (BPF_CLASS(insn.code) == BPF_STX &&
+                   BPF_MODE(insn.code) == BPF_ATOMIC) {
+                       /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not
+                        * define any registers, therefore zext_dst cannot be
+                        * set.
+                        */
+                       if (WARN_ON(!(insn.imm & BPF_FETCH)))
+                               return -EINVAL;
+                       load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0
+                                                          : insn.src_reg;
+               } else {
+                       load_reg = insn.dst_reg;
+               }
                zext_patch[0] = insn;
-               zext_patch[1].dst_reg = insn.dst_reg;
-               zext_patch[1].src_reg = insn.dst_reg;
+               zext_patch[1].dst_reg = load_reg;
+               zext_patch[1].src_reg = load_reg;
                patch = zext_patch;
                patch_len = 2;
  apply_patch_buffer:
@@@ -10841,8 -11281,7 +11285,7 @@@ static int jit_subprogs(struct bpf_veri
                return 0;
  
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-               if (insn->code != (BPF_JMP | BPF_CALL) ||
-                   insn->src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn))
                        continue;
                /* Upon error here we cannot fall back to interpreter but
                 * need a hard reject of the program. Thus -EFAULT is
                /* BPF_PROG_RUN doesn't call subprogs directly,
                 * hence main prog stats include the runtime of subprogs.
                 * subprogs don't have IDs and not reachable via prog_get_next_id
-                * func[i]->aux->stats will never be accessed and stays NULL
+                * func[i]->stats will never be accessed and stays NULL
                 */
                func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
                if (!func[i])
        for (i = 0; i < env->subprog_cnt; i++) {
                insn = func[i]->insnsi;
                for (j = 0; j < func[i]->len; j++, insn++) {
-                       if (insn->code != (BPF_JMP | BPF_CALL) ||
-                           insn->src_reg != BPF_PSEUDO_CALL)
+                       if (!bpf_pseudo_call(insn))
                                continue;
                        subprog = insn->off;
                        insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
         * later look the same as if they were interpreted only.
         */
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-               if (insn->code != (BPF_JMP | BPF_CALL) ||
-                   insn->src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = env->insn_aux_data[i].call_imm;
                subprog = find_subprog(env, i + insn->off + 1);
@@@ -11047,8 -11484,7 +11488,7 @@@ out_undo_insn
        /* cleanup main prog to be interpreted */
        prog->jit_requested = 0;
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-               if (insn->code != (BPF_JMP | BPF_CALL) ||
-                   insn->src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = 0;
                insn->imm = env->insn_aux_data[i].call_imm;
@@@ -11083,8 -11519,7 +11523,7 @@@ static int fixup_call_args(struct bpf_v
                return -EINVAL;
        }
        for (i = 0; i < prog->len; i++, insn++) {
-               if (insn->code != (BPF_JMP | BPF_CALL) ||
-                   insn->src_reg != BPF_PSEUDO_CALL)
+               if (!bpf_pseudo_call(insn))
                        continue;
                depth = get_callee_stack_depth(env, insn, i);
                if (depth < 0)
@@@ -11121,28 -11556,30 +11560,28 @@@ static int fixup_bpf_calls(struct bpf_v
                    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
                    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
                        bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
 -                      struct bpf_insn mask_and_div[] = {
 -                              BPF_MOV32_REG(insn->src_reg, insn->src_reg),
 +                      bool isdiv = BPF_OP(insn->code) == BPF_DIV;
 +                      struct bpf_insn *patchlet;
 +                      struct bpf_insn chk_and_div[] = {
                                /* Rx div 0 -> 0 */
 -                              BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
 +                              BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
 +                                           BPF_JNE | BPF_K, insn->src_reg,
 +                                           0, 2, 0),
                                BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
                                BPF_JMP_IMM(BPF_JA, 0, 0, 1),
                                *insn,
                        };
 -                      struct bpf_insn mask_and_mod[] = {
 -                              BPF_MOV32_REG(insn->src_reg, insn->src_reg),
 +                      struct bpf_insn chk_and_mod[] = {
                                /* Rx mod 0 -> Rx */
 -                              BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
 +                              BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
 +                                           BPF_JEQ | BPF_K, insn->src_reg,
 +                                           0, 1, 0),
                                *insn,
                        };
 -                      struct bpf_insn *patchlet;
  
 -                      if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
 -                          insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
 -                              patchlet = mask_and_div + (is64 ? 1 : 0);
 -                              cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
 -                      } else {
 -                              patchlet = mask_and_mod + (is64 ? 1 : 0);
 -                              cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
 -                      }
 +                      patchlet = isdiv ? chk_and_div : chk_and_mod;
 +                      cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
 +                                    ARRAY_SIZE(chk_and_mod);
  
                        new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
                        if (!new_prog)
@@@ -11547,6 -11984,13 +11986,13 @@@ static int do_check_common(struct bpf_v
                                mark_reg_known_zero(env, regs, i);
                        else if (regs[i].type == SCALAR_VALUE)
                                mark_reg_unknown(env, regs, i);
+                       else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
+                               const u32 mem_size = regs[i].mem_size;
+                               mark_reg_known_zero(env, regs, i);
+                               regs[i].mem_size = mem_size;
+                               regs[i].id = ++env->id_gen;
+                       }
                }
        } else {
                /* 1st arg to a function */
@@@ -12125,6 -12569,7 +12571,7 @@@ int bpf_check(struct bpf_prog **prog, u
                env->strict_alignment = false;
  
        env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+       env->allow_uninit_stack = bpf_allow_uninit_stack();
        env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
        env->bypass_spec_v1 = bpf_bypass_spec_v1();
        env->bypass_spec_v4 = bpf_bypass_spec_v4();
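
  The PTR_TO_MEM_OR_NULL handling added to do_check_common() above is what
  backs pointer arguments to global sub-programs: the argument is modelled as
  either NULL or a pointer to mem_size bytes, so the function must null-check
  it before dereferencing. A hedged BPF C sketch (type and function names
  invented for illustration):

          struct dims {
                  int rows;
                  int cols;
          };

          __attribute__((noinline))
          int area(struct dims *d)
          {
                  if (!d)                 /* required: the argument may be NULL */
                          return 0;
                  return d->rows * d->cols;
          }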
diff --combined kernel/trace/bpf_trace.c
@@@ -96,6 -96,9 +96,6 @@@ unsigned int trace_call_bpf(struct trac
  {
        unsigned int ret;
  
 -      if (in_nmi()) /* not supported yet */
 -              return 1;
 -
        cant_sleep();
  
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
@@@ -1188,6 -1191,10 +1188,10 @@@ BTF_SET_END(btf_allowlist_d_path
  
  static bool bpf_d_path_allowed(const struct bpf_prog *prog)
  {
+       if (prog->type == BPF_PROG_TYPE_TRACING &&
+           prog->expected_attach_type == BPF_TRACE_ITER)
+               return true;
        if (prog->type == BPF_PROG_TYPE_LSM)
                return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
  
@@@ -1757,6 -1764,8 +1761,8 @@@ tracing_prog_func_proto(enum bpf_func_i
                return &bpf_sk_storage_delete_tracing_proto;
        case BPF_FUNC_sock_from_file:
                return &bpf_sock_from_file_proto;
+       case BPF_FUNC_get_socket_cookie:
+               return &bpf_get_socket_ptr_cookie_proto;
  #endif
        case BPF_FUNC_seq_printf:
                return prog->expected_attach_type == BPF_TRACE_ITER ?
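
  With bpf_get_socket_ptr_cookie_proto wired up for tracing programs above, a
  fentry/fexit program can read a socket cookie straight from a struct sock
  pointer. A minimal sketch (hook and message are illustrative, not from this
  diff):

          SEC("fexit/inet_shutdown")
          int BPF_PROG(trace_shutdown, struct socket *sock, int how)
          {
                  __u64 cookie = bpf_get_socket_cookie(sock->sk);

                  bpf_printk("shutdown on socket cookie %llu", cookie);
                  return 0;
          }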
diff --combined net/core/dev.c
@@@ -91,7 -91,6 +91,7 @@@
  #include <linux/etherdevice.h>
  #include <linux/ethtool.h>
  #include <linux/skbuff.h>
 +#include <linux/kthread.h>
  #include <linux/bpf.h>
  #include <linux/bpf_trace.h>
  #include <net/net_namespace.h>
  #include <net/dsa.h>
  #include <net/dst.h>
  #include <net/dst_metadata.h>
 +#include <net/gro.h>
  #include <net/pkt_sched.h>
  #include <net/pkt_cls.h>
  #include <net/checksum.h>
@@@ -1495,27 -1493,6 +1495,27 @@@ void netdev_notify_peers(struct net_dev
  }
  EXPORT_SYMBOL(netdev_notify_peers);
  
 +static int napi_threaded_poll(void *data);
 +
 +static int napi_kthread_create(struct napi_struct *n)
 +{
 +      int err = 0;
 +
 +      /* Create and wake up the kthread once to put it in
 +       * TASK_INTERRUPTIBLE mode to avoid the blocked task
 +       * warning and work with loadavg.
 +       */
 +      n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
 +                              n->dev->name, n->napi_id);
 +      if (IS_ERR(n->thread)) {
 +              err = PTR_ERR(n->thread);
 +              pr_err("kthread_run failed with err %d\n", err);
 +              n->thread = NULL;
 +      }
 +
 +      return err;
 +}
 +
  static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
  {
        const struct net_device_ops *ops = dev->netdev_ops;
@@@ -2217,28 -2194,14 +2217,14 @@@ static inline void net_timestamp_set(st
  
  bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
  {
-       unsigned int len;
-       if (!(dev->flags & IFF_UP))
-               return false;
-       len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
-       if (skb->len <= len)
-               return true;
-       /* if TSO is enabled, we don't care about the length as the packet
-        * could be forwarded without being segmented before
-        */
-       if (skb_is_gso(skb))
-               return true;
-       return false;
+       return __is_skb_forwardable(dev, skb, true);
  }
  EXPORT_SYMBOL_GPL(is_skb_forwardable);
  
- int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+ static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
+                             bool check_mtu)
  {
-       int ret = ____dev_forward_skb(dev, skb);
+       int ret = ____dev_forward_skb(dev, skb, check_mtu);
  
        if (likely(!ret)) {
                skb->protocol = eth_type_trans(skb, dev);
  
        return ret;
  }
+ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+ {
+       return __dev_forward_skb2(dev, skb, true);
+ }
  EXPORT_SYMBOL_GPL(__dev_forward_skb);
  
  /**
@@@ -2273,6 -2241,11 +2264,11 @@@ int dev_forward_skb(struct net_device *
  }
  EXPORT_SYMBOL_GPL(dev_forward_skb);
  
+ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
+ {
+       return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
+ }
  static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
@@@ -3644,18 -3617,7 +3640,18 @@@ int skb_csum_hwoffload_help(struct sk_b
                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
                        skb_crc32c_csum_help(skb);
  
 -      return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 +      if (features & NETIF_F_HW_CSUM)
 +              return 0;
 +
 +      if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
 +              switch (skb->csum_offset) {
 +              case offsetof(struct tcphdr, check):
 +              case offsetof(struct udphdr, check):
 +                      return 0;
 +              }
 +      }
 +
 +      return skb_checksum_help(skb);
  }
  EXPORT_SYMBOL(skb_csum_hwoffload_help);
  
@@@ -3912,7 -3874,6 +3908,7 @@@ sch_handle_egress(struct sk_buff *skb, 
  
        /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
        qdisc_skb_cb(skb)->mru = 0;
 +      qdisc_skb_cb(skb)->post_ct = false;
        mini_qdisc_bstats_cpu_update(miniq, skb);
  
        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
@@@ -4118,7 -4079,7 +4114,7 @@@ static int __dev_queue_xmit(struct sk_b
        skb_reset_mac_header(skb);
  
        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 -              __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
 +              __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
  
        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
@@@ -4287,22 -4248,6 +4283,22 @@@ int gro_normal_batch __read_mostly = 8
  static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
  {
 +      struct task_struct *thread;
 +
 +      if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 +              /* Paired with smp_mb__before_atomic() in
 +               * napi_enable()/dev_set_threaded().
 +               * Use READ_ONCE() to guarantee a complete
 +               * read on napi->thread. Only call
 +               * wake_up_process() when it's not NULL.
 +               */
 +              thread = READ_ONCE(napi->thread);
 +              if (thread) {
 +                      wake_up_process(thread);
 +                      return;
 +              }
 +      }
 +
        list_add_tail(&napi->poll_list, &sd->poll_list);
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
  }
@@@ -4944,6 -4889,8 +4940,6 @@@ static __latent_entropy void net_tx_act
                        else
                                __kfree_skb_defer(skb);
                }
 -
 -              __kfree_skb_flush();
        }
  
        if (sd->output_queue) {
@@@ -5009,7 -4956,6 +5005,7 @@@ sch_handle_ingress(struct sk_buff *skb
  
        qdisc_skb_cb(skb)->pkt_len = skb->len;
        qdisc_skb_cb(skb)->mru = 0;
 +      qdisc_skb_cb(skb)->post_ct = false;
        skb->tc_at_ingress = 1;
        mini_qdisc_bstats_cpu_update(miniq, skb);
  
@@@ -5759,7 -5705,7 +5755,7 @@@ static void flush_all_backlogs(void
        }
  
        /* we can have in flight packet[s] on the cpus we are not flushing,
 -       * synchronize_net() in rollback_registered_many() will take care of
 +       * synchronize_net() in unregister_netdevice_many() will take care of
         * them
         */
        for_each_cpu(cpu, &flush_cpus)
@@@ -5781,14 -5727,15 +5777,14 @@@ static void gro_normal_list(struct napi
  /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
   * pass the whole batch up to the stack.
   */
 -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
  {
        list_add_tail(&skb->list, &napi->rx_list);
 -      if (++napi->rx_count >= gro_normal_batch)
 +      napi->rx_count += segs;
 +      if (napi->rx_count >= gro_normal_batch)
                gro_normal_list(napi);
  }
  
 -INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 -INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
  static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
  {
        struct packet_offload *ptype;
        }
  
  out:
 -      gro_normal_one(napi, skb);
 +      gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
        return NET_RX_SUCCESS;
  }
  
@@@ -5957,6 -5904,10 +5953,6 @@@ static void gro_flush_oldest(struct nap
        napi_gro_complete(napi, oldest);
  }
  
 -INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 -                                                         struct sk_buff *));
 -INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 -                                                         struct sk_buff *));
  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
  {
        u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
@@@ -6095,20 -6046,27 +6091,20 @@@ struct packet_offload *gro_find_complet
  }
  EXPORT_SYMBOL(gro_find_complete_by_type);
  
 -static void napi_skb_free_stolen_head(struct sk_buff *skb)
 -{
 -      skb_dst_drop(skb);
 -      skb_ext_put(skb);
 -      kmem_cache_free(skbuff_head_cache, skb);
 -}
 -
  static gro_result_t napi_skb_finish(struct napi_struct *napi,
                                    struct sk_buff *skb,
                                    gro_result_t ret)
  {
        switch (ret) {
        case GRO_NORMAL:
 -              gro_normal_one(napi, skb);
 +              gro_normal_one(napi, skb, 1);
                break;
  
        case GRO_MERGED_FREE:
                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
                        napi_skb_free_stolen_head(skb);
                else
 -                      __kfree_skb(skb);
 +                      __kfree_skb_defer(skb);
                break;
  
        case GRO_HELD:
@@@ -6185,7 -6143,7 +6181,7 @@@ static gro_result_t napi_frags_finish(s
                __skb_push(skb, ETH_HLEN);
                skb->protocol = eth_type_trans(skb, skb->dev);
                if (ret == GRO_NORMAL)
 -                      gro_normal_one(napi, skb);
 +                      gro_normal_one(napi, skb, 1);
                break;
  
        case GRO_MERGED_FREE:
@@@ -6731,49 -6689,6 +6727,49 @@@ static void init_gro_hash(struct napi_s
        napi->gro_bitmask = 0;
  }
  
 +int dev_set_threaded(struct net_device *dev, bool threaded)
 +{
 +      struct napi_struct *napi;
 +      int err = 0;
 +
 +      if (dev->threaded == threaded)
 +              return 0;
 +
 +      if (threaded) {
 +              list_for_each_entry(napi, &dev->napi_list, dev_list) {
 +                      if (!napi->thread) {
 +                              err = napi_kthread_create(napi);
 +                              if (err) {
 +                                      threaded = false;
 +                                      break;
 +                              }
 +                      }
 +              }
 +      }
 +
 +      dev->threaded = threaded;
 +
 +      /* Make sure kthread is created before THREADED bit
 +       * is set.
 +       */
 +      smp_mb__before_atomic();
 +
 +      /* Setting/unsetting threaded mode on a napi might not immediately
 +       * take effect, if the current napi instance is actively being
 +       * polled. In this case, the switch between threaded mode and
 +       * softirq mode will happen in the next round of napi_schedule().
 +       * This should not cause hiccups/stalls to the live traffic.
 +       */
 +      list_for_each_entry(napi, &dev->napi_list, dev_list) {
 +              if (threaded)
 +                      set_bit(NAPI_STATE_THREADED, &napi->state);
 +              else
 +                      clear_bit(NAPI_STATE_THREADED, &napi->state);
 +      }
 +
 +      return err;
 +}
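
  For driver authors, dev_set_threaded() above is the opt-in point for
  kthread-based polling. A hedged sketch of how a driver might use it (the
  driver name, private struct and poll stub are invented for illustration):

          struct mydrv_priv {
                  struct napi_struct napi;
          };

          static int mydrv_poll(struct napi_struct *napi, int budget)
          {
                  /* ... RX processing elided ... */
                  napi_complete_done(napi, 0);
                  return 0;
          }

          static int mydrv_open(struct net_device *dev)
          {
                  struct mydrv_priv *priv = netdev_priv(dev);
                  int err;

                  netif_napi_add(dev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);

                  /* spawns one "napi/<dev>-<id>" kthread per NAPI instance */
                  err = dev_set_threaded(dev, true);
                  if (err)
                          netdev_warn(dev, "threaded NAPI unavailable, staying in softirq mode\n");

                  napi_enable(&priv->napi);
                  return 0;
          }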
 +
  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
  {
        set_bit(NAPI_STATE_NPSVC, &napi->state);
        list_add_rcu(&napi->dev_list, &dev->napi_list);
        napi_hash_add(napi);
 +      /* Create kthread for this napi if dev->threaded is set.
 +       * Clear dev->threaded if kthread creation failed so that
 +       * threaded mode will not be enabled in napi_enable().
 +       */
 +      if (dev->threaded && napi_kthread_create(napi))
 +              dev->threaded = 0;
  }
  EXPORT_SYMBOL(netif_napi_add);
  
@@@ -6824,28 -6733,9 +6820,28 @@@ void napi_disable(struct napi_struct *n
  
        clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
        clear_bit(NAPI_STATE_DISABLE, &n->state);
 +      clear_bit(NAPI_STATE_THREADED, &n->state);
  }
  EXPORT_SYMBOL(napi_disable);
  
 +/**
 + *    napi_enable - enable NAPI scheduling
 + *    @n: NAPI context
 + *
 + * Resume NAPI from being scheduled on this context.
 + * Must be paired with napi_disable.
 + */
 +void napi_enable(struct napi_struct *n)
 +{
 +      BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 +      smp_mb__before_atomic();
 +      clear_bit(NAPI_STATE_SCHED, &n->state);
 +      clear_bit(NAPI_STATE_NPSVC, &n->state);
 +      if (n->dev->threaded && n->thread)
 +              set_bit(NAPI_STATE_THREADED, &n->state);
 +}
 +EXPORT_SYMBOL(napi_enable);
 +
  static void flush_gro_hash(struct napi_struct *napi)
  {
        int i;
@@@ -6871,18 -6761,18 +6867,18 @@@ void __netif_napi_del(struct napi_struc
  
        flush_gro_hash(napi);
        napi->gro_bitmask = 0;
 +
 +      if (napi->thread) {
 +              kthread_stop(napi->thread);
 +              napi->thread = NULL;
 +      }
  }
  EXPORT_SYMBOL(__netif_napi_del);
  
 -static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 +static int __napi_poll(struct napi_struct *n, bool *repoll)
  {
 -      void *have;
        int work, weight;
  
 -      list_del_init(&n->poll_list);
 -
 -      have = netpoll_poll_lock(n);
 -
        weight = n->weight;
  
        /* This NAPI_STATE_SCHED test is for avoiding a race
                            n->poll, work, weight);
  
        if (likely(work < weight))
 -              goto out_unlock;
 +              return work;
  
        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         */
        if (unlikely(napi_disable_pending(n))) {
                napi_complete(n);
 -              goto out_unlock;
 +              return work;
        }
  
        /* The NAPI context has more processing work, but busy-polling
                         */
                        napi_schedule(n);
                }
 -              goto out_unlock;
 +              return work;
        }
  
        if (n->gro_bitmask) {
        if (unlikely(!list_empty(&n->poll_list))) {
                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                             n->dev ? n->dev->name : "backlog");
 -              goto out_unlock;
 +              return work;
        }
  
 -      list_add_tail(&n->poll_list, repoll);
 +      *repoll = true;
 +
 +      return work;
 +}
 +
 +static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 +{
 +      bool do_repoll = false;
 +      void *have;
 +      int work;
 +
 +      list_del_init(&n->poll_list);
 +
 +      have = netpoll_poll_lock(n);
 +
 +      work = __napi_poll(n, &do_repoll);
 +
 +      if (do_repoll)
 +              list_add_tail(&n->poll_list, repoll);
  
 -out_unlock:
        netpoll_poll_unlock(have);
  
        return work;
  }
  
 +static int napi_thread_wait(struct napi_struct *napi)
 +{
 +      set_current_state(TASK_INTERRUPTIBLE);
 +
 +      while (!kthread_should_stop() && !napi_disable_pending(napi)) {
 +              if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
 +                      WARN_ON(!list_empty(&napi->poll_list));
 +                      __set_current_state(TASK_RUNNING);
 +                      return 0;
 +              }
 +
 +              schedule();
 +              set_current_state(TASK_INTERRUPTIBLE);
 +      }
 +      __set_current_state(TASK_RUNNING);
 +      return -1;
 +}
 +
 +static int napi_threaded_poll(void *data)
 +{
 +      struct napi_struct *napi = data;
 +      void *have;
 +
 +      while (!napi_thread_wait(napi)) {
 +              for (;;) {
 +                      bool repoll = false;
 +
 +                      local_bh_disable();
 +
 +                      have = netpoll_poll_lock(napi);
 +                      __napi_poll(napi, &repoll);
 +                      netpoll_poll_unlock(have);
 +
 +                      local_bh_enable();
 +
 +                      if (!repoll)
 +                              break;
 +
 +                      cond_resched();
 +              }
 +      }
 +      return 0;
 +}
 +
  static __latent_entropy void net_rx_action(struct softirq_action *h)
  {
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
  
                if (list_empty(&list)) {
                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 -                              goto out;
 +                              return;
                        break;
                }
  
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
  
        net_rps_action_and_irq_enable(sd);
 -out:
 -      __kfree_skb_flush();
  }
  
  struct netdev_adjacent {
@@@ -8925,48 -8756,6 +8921,48 @@@ int dev_set_mac_address(struct net_devi
  }
  EXPORT_SYMBOL(dev_set_mac_address);
  
 +static DECLARE_RWSEM(dev_addr_sem);
 +
 +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 +                           struct netlink_ext_ack *extack)
 +{
 +      int ret;
 +
 +      down_write(&dev_addr_sem);
 +      ret = dev_set_mac_address(dev, sa, extack);
 +      up_write(&dev_addr_sem);
 +      return ret;
 +}
 +EXPORT_SYMBOL(dev_set_mac_address_user);
 +
 +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 +{
 +      size_t size = sizeof(sa->sa_data);
 +      struct net_device *dev;
 +      int ret = 0;
 +
 +      down_read(&dev_addr_sem);
 +      rcu_read_lock();
 +
 +      dev = dev_get_by_name_rcu(net, dev_name);
 +      if (!dev) {
 +              ret = -ENODEV;
 +              goto unlock;
 +      }
 +      if (!dev->addr_len)
 +              memset(sa->sa_data, 0, size);
 +      else
 +              memcpy(sa->sa_data, dev->dev_addr,
 +                     min_t(size_t, size, dev->addr_len));
 +      sa->sa_family = dev->type;
 +
 +unlock:
 +      rcu_read_unlock();
 +      up_read(&dev_addr_sem);
 +      return ret;
 +}
 +EXPORT_SYMBOL(dev_get_mac_address);
 +
  /**
   *    dev_change_carrier - Change device carrier
   *    @dev: device
@@@ -9666,6 -9455,106 +9662,6 @@@ static void net_set_todo(struct net_dev
        dev_net(dev)->dev_unreg_count++;
  }
  
 -static void rollback_registered_many(struct list_head *head)
 -{
 -      struct net_device *dev, *tmp;
 -      LIST_HEAD(close_head);
 -
 -      BUG_ON(dev_boot_phase);
 -      ASSERT_RTNL();
 -
 -      list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 -              /* Some devices call without registering
 -               * for initialization unwind. Remove those
 -               * devices and proceed with the remaining.
 -               */
 -              if (dev->reg_state == NETREG_UNINITIALIZED) {
 -                      pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 -                               dev->name, dev);
 -
 -                      WARN_ON(1);
 -                      list_del(&dev->unreg_list);
 -                      continue;
 -              }
 -              dev->dismantle = true;
 -              BUG_ON(dev->reg_state != NETREG_REGISTERED);
 -      }
 -
 -      /* If device is running, close it first. */
 -      list_for_each_entry(dev, head, unreg_list)
 -              list_add_tail(&dev->close_list, &close_head);
 -      dev_close_many(&close_head, true);
 -
 -      list_for_each_entry(dev, head, unreg_list) {
 -              /* And unlink it from device chain. */
 -              unlist_netdevice(dev);
 -
 -              dev->reg_state = NETREG_UNREGISTERING;
 -      }
 -      flush_all_backlogs();
 -
 -      synchronize_net();
 -
 -      list_for_each_entry(dev, head, unreg_list) {
 -              struct sk_buff *skb = NULL;
 -
 -              /* Shutdown queueing discipline. */
 -              dev_shutdown(dev);
 -
 -              dev_xdp_uninstall(dev);
 -
 -              /* Notify protocols, that we are about to destroy
 -               * this device. They should clean all the things.
 -               */
 -              call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 -
 -              if (!dev->rtnl_link_ops ||
 -                  dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 -                      skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 -                                                   GFP_KERNEL, NULL, 0);
 -
 -              /*
 -               *      Flush the unicast and multicast chains
 -               */
 -              dev_uc_flush(dev);
 -              dev_mc_flush(dev);
 -
 -              netdev_name_node_alt_flush(dev);
 -              netdev_name_node_free(dev->name_node);
 -
 -              if (dev->netdev_ops->ndo_uninit)
 -                      dev->netdev_ops->ndo_uninit(dev);
 -
 -              if (skb)
 -                      rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 -
 -              /* Notifier chain MUST detach us all upper devices. */
 -              WARN_ON(netdev_has_any_upper_dev(dev));
 -              WARN_ON(netdev_has_any_lower_dev(dev));
 -
 -              /* Remove entries from kobject tree */
 -              netdev_unregister_kobject(dev);
 -#ifdef CONFIG_XPS
 -              /* Remove XPS queueing entries */
 -              netif_reset_xps_queues_gt(dev, 0);
 -#endif
 -      }
 -
 -      synchronize_net();
 -
 -      list_for_each_entry(dev, head, unreg_list)
 -              dev_put(dev);
 -}
 -
 -static void rollback_registered(struct net_device *dev)
 -{
 -      LIST_HEAD(single);
 -
 -      list_add(&dev->unreg_list, &single);
 -      rollback_registered_many(&single);
 -      list_del(&single);
 -}
 -
  static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
        struct net_device *upper, netdev_features_t features)
  {
@@@ -10215,7 -10104,8 +10211,7 @@@ int register_netdevice(struct net_devic
        if (ret) {
                /* Expect explicit free_netdev() on failure */
                dev->needs_free_netdev = false;
 -              rollback_registered(dev);
 -              net_set_todo(dev);
 +              unregister_netdevice_queue(dev, NULL);
                goto out;
        }
        /*
@@@ -10837,10 -10727,9 +10833,10 @@@ void unregister_netdevice_queue(struct 
        if (head) {
                list_move_tail(&dev->unreg_list, head);
        } else {
 -              rollback_registered(dev);
 -              /* Finish processing unregister after unlock */
 -              net_set_todo(dev);
 +              LIST_HEAD(single);
 +
 +              list_add(&dev->unreg_list, &single);
 +              unregister_netdevice_many(&single);
        }
  }
  EXPORT_SYMBOL(unregister_netdevice_queue);
   */
  void unregister_netdevice_many(struct list_head *head)
  {
 -      struct net_device *dev;
 +      struct net_device *dev, *tmp;
 +      LIST_HEAD(close_head);
 +
 +      BUG_ON(dev_boot_phase);
 +      ASSERT_RTNL();
 +
 +      if (list_empty(head))
 +              return;
  
 -      if (!list_empty(head)) {
 -              rollback_registered_many(head);
 -              list_for_each_entry(dev, head, unreg_list)
 -                      net_set_todo(dev);
 -              list_del(head);
 +      list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 +              /* Some devices call without registering
 +               * for initialization unwind. Remove those
 +               * devices and proceed with the remaining.
 +               */
 +              if (dev->reg_state == NETREG_UNINITIALIZED) {
 +                      pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 +                               dev->name, dev);
 +
 +                      WARN_ON(1);
 +                      list_del(&dev->unreg_list);
 +                      continue;
 +              }
 +              dev->dismantle = true;
 +              BUG_ON(dev->reg_state != NETREG_REGISTERED);
 +      }
 +
 +      /* If device is running, close it first. */
 +      list_for_each_entry(dev, head, unreg_list)
 +              list_add_tail(&dev->close_list, &close_head);
 +      dev_close_many(&close_head, true);
 +
 +      list_for_each_entry(dev, head, unreg_list) {
 +              /* And unlink it from device chain. */
 +              unlist_netdevice(dev);
 +
 +              dev->reg_state = NETREG_UNREGISTERING;
 +      }
 +      flush_all_backlogs();
 +
 +      synchronize_net();
 +
 +      list_for_each_entry(dev, head, unreg_list) {
 +              struct sk_buff *skb = NULL;
 +
 +              /* Shutdown queueing discipline. */
 +              dev_shutdown(dev);
 +
 +              dev_xdp_uninstall(dev);
 +
 +              /* Notify protocols, that we are about to destroy
 +               * this device. They should clean all the things.
 +               */
 +              call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 +
 +              if (!dev->rtnl_link_ops ||
 +                  dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 +                      skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 +                                                   GFP_KERNEL, NULL, 0);
 +
 +              /*
 +               *      Flush the unicast and multicast chains
 +               */
 +              dev_uc_flush(dev);
 +              dev_mc_flush(dev);
 +
 +              netdev_name_node_alt_flush(dev);
 +              netdev_name_node_free(dev->name_node);
 +
 +              if (dev->netdev_ops->ndo_uninit)
 +                      dev->netdev_ops->ndo_uninit(dev);
 +
 +              if (skb)
 +                      rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 +
 +              /* Notifier chain MUST detach us all upper devices. */
 +              WARN_ON(netdev_has_any_upper_dev(dev));
 +              WARN_ON(netdev_has_any_lower_dev(dev));
 +
 +              /* Remove entries from kobject tree */
 +              netdev_unregister_kobject(dev);
 +#ifdef CONFIG_XPS
 +              /* Remove XPS queueing entries */
 +              netif_reset_xps_queues_gt(dev, 0);
 +#endif
 +      }
 +
 +      synchronize_net();
 +
 +      list_for_each_entry(dev, head, unreg_list) {
 +              dev_put(dev);
 +              net_set_todo(dev);
        }
 +
 +      list_del(head);
  }
  EXPORT_SYMBOL(unregister_netdevice_many);
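
With rollback_registered() and rollback_registered_many() folded away, unregister_netdevice_many() is now the single list-based teardown path and also queues the net_todo work itself. As a rough illustration (a hypothetical driver helper, not code from this patch), batching several devices onto one list keeps the expensive synchronize_net()/flush_all_backlogs() rounds to one per batch:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Hypothetical driver helper: tear down N netdevs with one RTNL-held
 * batch instead of N separate unregister_netdevice() calls.
 */
static void example_destroy_all(struct net_device **devs, int n)
{
	LIST_HEAD(unreg_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &unreg_list);
	/* Runs the close/unlist/notifier sequence above once for the whole
	 * batch and schedules the todo work for each device.
	 */
	unregister_netdevice_many(&unreg_list);
	rtnl_unlock();
}

unregister_netdevice() and unregister_netdevice_queue(dev, NULL) keep their semantics; per the hunks above they simply funnel into this same list-based path now.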
  
diff --combined net/core/filter.c
@@@ -2083,13 -2083,13 +2083,13 @@@ static const struct bpf_func_proto bpf_
  
  static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
  {
-       return dev_forward_skb(dev, skb);
+       return dev_forward_skb_nomtu(dev, skb);
  }
  
  static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
                                      struct sk_buff *skb)
  {
-       int ret = ____dev_forward_skb(dev, skb);
+       int ret = ____dev_forward_skb(dev, skb, false);
  
        if (likely(!ret)) {
                skb->dev = dev;
@@@ -2480,7 -2480,7 +2480,7 @@@ int skb_do_redirect(struct sk_buff *skb
                        goto out_drop;
                dev = ops->ndo_get_peer_dev(dev);
                if (unlikely(!dev ||
-                            !is_skb_forwardable(dev, skb) ||
+                            !(dev->flags & IFF_UP) ||
                             net_eq(net, dev_net(dev))))
                        goto out_drop;
                skb->dev = dev;
@@@ -3552,11 -3552,7 +3552,7 @@@ static int bpf_skb_net_shrink(struct sk
        return 0;
  }
  
- static u32 __bpf_skb_max_len(const struct sk_buff *skb)
- {
-       return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
-                         SKB_MAX_ALLOC;
- }
+ #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
  
  BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
@@@ -3605,7 -3601,7 +3601,7 @@@ BPF_CALL_4(bpf_skb_adjust_room, struct 
  {
        u32 len_cur, len_diff_abs = abs(len_diff);
        u32 len_min = bpf_skb_net_base_len(skb);
-       u32 len_max = __bpf_skb_max_len(skb);
+       u32 len_max = BPF_SKB_MAX_LEN;
        __be16 proto = skb->protocol;
        bool shrink = len_diff < 0;
        u32 off;
@@@ -3688,7 -3684,7 +3684,7 @@@ static int bpf_skb_trim_rcsum(struct sk
  static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
                                        u64 flags)
  {
-       u32 max_len = __bpf_skb_max_len(skb);
+       u32 max_len = BPF_SKB_MAX_LEN;
        u32 min_len = __bpf_skb_min_len(skb);
        int ret;
  
@@@ -3764,7 -3760,7 +3760,7 @@@ static const struct bpf_func_proto sk_s
  static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
                                        u64 flags)
  {
-       u32 max_len = __bpf_skb_max_len(skb);
+       u32 max_len = BPF_SKB_MAX_LEN;
        u32 new_len = skb->len + head_room;
        int ret;
  
@@@ -4631,6 -4627,18 +4627,18 @@@ static const struct bpf_func_proto bpf_
        .arg1_type      = ARG_PTR_TO_CTX,
  };
  
+ BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
+ {
+       return sk ? sock_gen_cookie(sk) : 0;
+ }
+
+ const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
+       .func           = bpf_get_socket_ptr_cookie,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ };
+
  BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
  {
        return __sock_gen_cookie(ctx->sk);
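
The bpf_get_socket_ptr_cookie proto above is what lets tracing programs call bpf_get_socket_cookie() on a BTF-typed struct sock * argument. A hedged sketch of such a caller; the fentry attach point (tcp_close) and the vmlinux.h/libbpf skeleton are illustrative assumptions, not part of this patch:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fentry/tcp_close")
int BPF_PROG(trace_tcp_close, struct sock *sk)
{
	/* sk is a BTF pointer compatible with ARG_PTR_TO_BTF_ID_SOCK_COMMON;
	 * the helper returns 0 when sk is NULL (see the BPF_CALL_1 above).
	 */
	__u64 cookie = bpf_get_socket_cookie(sk);

	bpf_printk("tcp_close: socket cookie %llu", cookie);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";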
@@@ -4645,9 -4653,11 +4653,9 @@@ static const struct bpf_func_proto bpf_
  
  static u64 __bpf_get_netns_cookie(struct sock *sk)
  {
 -#ifdef CONFIG_NET_NS
 -      return __net_gen_cookie(sk ? sk->sk_net.net : &init_net);
 -#else
 -      return 0;
 -#endif
 +      const struct net *net = sk ? sock_net(sk) : &init_net;
 +
 +      return net->net_cookie;
  }
  
  BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
@@@ -5291,12 -5301,14 +5299,14 @@@ static const struct bpf_func_proto bpf_
  #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
  static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
                                  const struct neighbour *neigh,
-                                 const struct net_device *dev)
+                                 const struct net_device *dev, u32 mtu)
  {
        memcpy(params->dmac, neigh->ha, ETH_ALEN);
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);
        params->h_vlan_TCI = 0;
        params->h_vlan_proto = 0;
+       if (mtu)
+               params->mtu_result = mtu; /* union with tot_len */
  
        return 0;
  }
@@@ -5312,8 -5324,8 +5322,8 @@@ static int bpf_ipv4_fib_lookup(struct n
        struct net_device *dev;
        struct fib_result res;
        struct flowi4 fl4;
+       u32 mtu = 0;
        int err;
-       u32 mtu;
  
        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
  
        if (check_mtu) {
                mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
-               if (params->tot_len > mtu)
+               if (params->tot_len > mtu) {
+                       params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+               }
        }
  
        nhc = res.nhc;
        if (!neigh)
                return BPF_FIB_LKUP_RET_NO_NEIGH;
  
-       return bpf_fib_set_fwd_params(params, neigh, dev);
+       return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
  }
  #endif
  
@@@ -5432,7 -5446,7 +5444,7 @@@ static int bpf_ipv6_fib_lookup(struct n
        struct flowi6 fl6;
        int strict = 0;
        int oif, err;
-       u32 mtu;
+       u32 mtu = 0;
  
        /* link local addresses are never forwarded */
        if (rt6_need_strict(dst) || rt6_need_strict(src))
  
        if (check_mtu) {
                mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
-               if (params->tot_len > mtu)
+               if (params->tot_len > mtu) {
+                       params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+               }
        }
  
        if (res.nh->fib_nh_lws)
        if (!neigh)
                return BPF_FIB_LKUP_RET_NO_NEIGH;
  
-       return bpf_fib_set_fwd_params(params, neigh, dev);
+       return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
  }
  #endif
  
@@@ -5571,6 -5587,7 +5585,7 @@@ BPF_CALL_4(bpf_skb_fib_lookup, struct s
  {
        struct net *net = dev_net(skb->dev);
        int rc = -EAFNOSUPPORT;
+       bool check_mtu = false;
  
        if (plen < sizeof(*params))
                return -EINVAL;
        if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
                return -EINVAL;
  
+       if (params->tot_len)
+               check_mtu = true;
        switch (params->family) {
  #if IS_ENABLED(CONFIG_INET)
        case AF_INET:
-               rc = bpf_ipv4_fib_lookup(net, params, flags, false);
+               rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
                break;
  #endif
  #if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
-               rc = bpf_ipv6_fib_lookup(net, params, flags, false);
+               rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
                break;
  #endif
        }
  
-       if (!rc) {
+       if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
                struct net_device *dev;
  
+               /* When tot_len isn't provided by user, check skb
+                * against MTU of FIB lookup resulting net_device
+                */
                dev = dev_get_by_index_rcu(net, params->ifindex);
                if (!is_skb_forwardable(dev, skb))
                        rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
+               params->mtu_result = dev->mtu; /* union with tot_len */
        }
  
        return rc;
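
For the skb flavour just above, a non-zero tot_len turns the MTU check on, and on BPF_FIB_LKUP_RET_FRAG_NEEDED the same union field comes back as mtu_result. A hedged tc-side sketch (the rest of the lookup key is omitted; AF_INET is defined locally because the BPF include set here is minimal):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("classifier")
int fib_mtu_example(struct __sk_buff *skb)
{
	struct bpf_fib_lookup params = {};
	int rc;

	params.family  = AF_INET;
	params.ifindex = skb->ingress_ifindex;
	params.tot_len = skb->len;	/* non-zero: ask for the MTU check */
	/* ... fill ipv4_src/ipv4_dst etc. as for any fib_lookup call ... */

	rc = bpf_fib_lookup(skb, &params, sizeof(params), 0);
	if (rc == BPF_FIB_LKUP_RET_FRAG_NEEDED) {
		/* tot_len and mtu_result share storage; the kernel wrote the
		 * route/device MTU here before returning FRAG_NEEDED.
		 */
		bpf_printk("frag needed, mtu %u", params.mtu_result);
	}

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";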
@@@ -5612,6 -5637,116 +5635,116 @@@ static const struct bpf_func_proto bpf_
        .arg4_type      = ARG_ANYTHING,
  };
  
+ static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
+                                           u32 ifindex)
+ {
+       struct net *netns = dev_net(dev_curr);
+       /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
+       if (ifindex == 0)
+               return dev_curr;
+       return dev_get_by_index_rcu(netns, ifindex);
+ }
+
+ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
+          u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+ {
+       int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+       struct net_device *dev = skb->dev;
+       int skb_len, dev_len;
+       int mtu;
+       if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
+               return -EINVAL;
+       if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+               return -EINVAL;
+       dev = __dev_via_ifindex(dev, ifindex);
+       if (unlikely(!dev))
+               return -ENODEV;
+       mtu = READ_ONCE(dev->mtu);
+       dev_len = mtu + dev->hard_header_len;
+       skb_len = skb->len + len_diff; /* minus result pass check */
+       if (skb_len <= dev_len) {
+               ret = BPF_MTU_CHK_RET_SUCCESS;
+               goto out;
+       }
+       /* At this point, skb->len exceeds the MTU, but as it includes the
+        * length of all GSO segments, each segment can still fit the MTU.
+        * The SKB can possibly get re-segmented in the transmit path (see
+        * validate_xmit_skb).  Thus, the user must choose whether segs are
+        * to be MTU checked.
+        */
+       if (skb_is_gso(skb)) {
+               ret = BPF_MTU_CHK_RET_SUCCESS;
+               if (flags & BPF_MTU_CHK_SEGS &&
+                   !skb_gso_validate_network_len(skb, mtu))
+                       ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
+       }
+ out:
+       /* BPF verifier guarantees valid pointer */
+       *mtu_len = mtu;
+       return ret;
+ }
+
+ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
+          u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+ {
+       struct net_device *dev = xdp->rxq->dev;
+       int xdp_len = xdp->data_end - xdp->data;
+       int ret = BPF_MTU_CHK_RET_SUCCESS;
+       int mtu, dev_len;
+       /* XDP variant doesn't support multi-buffer segment check (yet) */
+       if (unlikely(flags))
+               return -EINVAL;
+       dev = __dev_via_ifindex(dev, ifindex);
+       if (unlikely(!dev))
+               return -ENODEV;
+       mtu = READ_ONCE(dev->mtu);
+       /* Add L2-header as dev MTU is L3 size */
+       dev_len = mtu + dev->hard_header_len;
+       xdp_len += len_diff; /* minus result pass check */
+       if (xdp_len > dev_len)
+               ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+       /* BPF verifier guarantees valid pointer */
+       *mtu_len = mtu;
+       return ret;
+ }
+
+ static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
+       .func           = bpf_skb_check_mtu,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_PTR_TO_INT,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+ };
+
+ static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
+       .func           = bpf_xdp_check_mtu,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_PTR_TO_INT,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+ };
+
  #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
  static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
  {
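
A typical user of the new helper is a tc program that wants to know, before growing the packet, whether the grown length would still fit the egress device. A hedged sketch; the encapsulation size and the program skeleton are illustrative, while the helper signature and the BPF_MTU_CHK_* names follow the UAPI added by this series:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define ENCAP_BYTES 20	/* illustrative: headroom we plan to add */

SEC("classifier")
int mtu_guard(struct __sk_buff *skb)
{
	__u32 mtu_len = 0;
	int ret;

	/* ifindex 0 means "check against skb->dev"; len_diff is the planned
	 * growth.  BPF_MTU_CHK_SEGS would also validate GSO segments, but is
	 * only accepted with len_diff == 0.
	 */
	ret = bpf_check_mtu(skb, 0, &mtu_len, ENCAP_BYTES, 0);
	if (ret != BPF_MTU_CHK_RET_SUCCESS) {
		bpf_printk("len + %d exceeds mtu %u", ENCAP_BYTES, mtu_len);
		return TC_ACT_SHOT;
	}

	/* safe to bpf_skb_adjust_room(skb, ENCAP_BYTES, ...) from here */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";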
@@@ -7021,6 -7156,14 +7154,14 @@@ sock_addr_func_proto(enum bpf_func_id f
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_UDP4_RECVMSG:
+               case BPF_CGROUP_UDP6_RECVMSG:
+               case BPF_CGROUP_UDP4_SENDMSG:
+               case BPF_CGROUP_UDP6_SENDMSG:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                        return &bpf_sock_addr_setsockopt_proto;
                default:
                        return NULL;
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_UDP4_RECVMSG:
+               case BPF_CGROUP_UDP6_RECVMSG:
+               case BPF_CGROUP_UDP4_SENDMSG:
+               case BPF_CGROUP_UDP6_SENDMSG:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                        return &bpf_sock_addr_getsockopt_proto;
                default:
                        return NULL;
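
With these cases added, the bpf_{s,g}etsockopt() helpers that were previously reachable only from the bind/connect hooks now also work from the sendmsg/recvmsg/getpeername/getsockname sock_addr hooks. A minimal hedged sketch of a sendmsg4 hook tagging outgoing UDP sockets; the SO_MARK value is arbitrary and the SOL_SOCKET/SO_MARK defines are local fallbacks:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1
#endif
#ifndef SO_MARK
#define SO_MARK 36
#endif

SEC("cgroup/sendmsg4")
int mark_udp_tx(struct bpf_sock_addr *ctx)
{
	int mark = 0x2a;	/* arbitrary example mark */

	/* ctx is the sock_addr context; the helper operates on its socket */
	bpf_setsockopt(ctx, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
	return 1;		/* allow the sendmsg */
}

char _license[] SEC("license") = "GPL";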
@@@ -7181,6 -7332,8 +7330,8 @@@ tc_cls_act_func_proto(enum bpf_func_id 
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_skb_fib_lookup_proto;
+       case BPF_FUNC_check_mtu:
+               return &bpf_skb_check_mtu_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
@@@ -7250,6 -7403,8 +7401,8 @@@ xdp_func_proto(enum bpf_func_id func_id
                return &bpf_xdp_adjust_tail_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_xdp_fib_lookup_proto;
+       case BPF_FUNC_check_mtu:
+               return &bpf_xdp_check_mtu_proto;
  #ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_xdp_sk_lookup_udp_proto;
@@@ -8814,7 -8969,7 +8967,7 @@@ u32 bpf_sock_convert_ctx_access(enum bp
                                       target_size));
                break;
        case offsetof(struct bpf_sock, rx_queue_mapping):
 -#ifdef CONFIG_XPS
 +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
                        si->dst_reg, si->src_reg,
diff --combined net/ipv4/af_inet.c
@@@ -438,6 -438,7 +438,7 @@@ EXPORT_SYMBOL(inet_release)
  int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
  {
        struct sock *sk = sock->sk;
+       u32 flags = BIND_WITH_LOCK;
        int err;
  
        /* If the socket has its own bind function then use it. (RAW) */
        /* BPF prog is run before any checks are done so that if the prog
         * changes context in a wrong way it will be caught.
         */
-       err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr);
+       err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+                                                BPF_CGROUP_INET4_BIND, &flags);
        if (err)
                return err;
  
-       return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
+       return __inet_bind(sk, uaddr, addr_len, flags);
  }
  EXPORT_SYMBOL(inet_bind);
  
@@@ -499,7 -501,8 +501,8 @@@ int __inet_bind(struct sock *sk, struc
  
        snum = ntohs(addr->sin_port);
        err = -EACCES;
-       if (snum && inet_port_requires_bind_service(net, snum) &&
+       if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+           snum && inet_port_requires_bind_service(net, snum) &&
            !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                goto out;
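
The BIND_NO_CAP_NET_BIND_SERVICE flag means that, once a cgroup bind4/bind6 program has opted a socket out of the check, an unprivileged process in that cgroup can bind below ip_unprivileged_port_start. A hedged userspace sketch under that assumption (the BPF program itself and its attachment are not shown and are assumed to already be in place):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(443),	/* below ip_unprivileged_port_start */
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");		/* EACCES without the BPF opt-out */
	else
		puts("bound to 443 without CAP_NET_BIND_SERVICE");

	close(fd);
	return 0;
}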
  
@@@ -777,18 -780,19 +780,19 @@@ int inet_getname(struct socket *sock, s
                        return -ENOTCONN;
                sin->sin_port = inet->inet_dport;
                sin->sin_addr.s_addr = inet->inet_daddr;
+               BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+                                           BPF_CGROUP_INET4_GETPEERNAME,
+                                           NULL);
        } else {
                __be32 addr = inet->inet_rcv_saddr;
                if (!addr)
                        addr = inet->inet_saddr;
                sin->sin_port = inet->inet_sport;
                sin->sin_addr.s_addr = addr;
-       }
-       if (cgroup_bpf_enabled)
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                                           peer ? BPF_CGROUP_INET4_GETPEERNAME :
-                                                  BPF_CGROUP_INET4_GETSOCKNAME,
+                                           BPF_CGROUP_INET4_GETSOCKNAME,
                                            NULL);
+       }
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
        return sizeof(*sin);
  }
@@@ -1419,6 -1423,7 +1423,6 @@@ struct sk_buff *inet_gso_segment(struc
  out:
        return segs;
  }
 -EXPORT_SYMBOL(inet_gso_segment);
  
  static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
                                        netdev_features_t features)
@@@ -1549,6 -1554,7 +1553,6 @@@ out
  
        return pp;
  }
 -EXPORT_SYMBOL(inet_gro_receive);
  
  static struct sk_buff *ipip_gro_receive(struct list_head *head,
                                        struct sk_buff *skb)
@@@ -1634,6 -1640,7 +1638,6 @@@ out_unlock
  
        return err;
  }
 -EXPORT_SYMBOL(inet_gro_complete);
  
  static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
  {
@@@ -1868,8 -1875,6 +1872,8 @@@ static __net_init int inet_init_net(str
        net->ipv4.sysctl_igmp_llm_reports = 1;
        net->ipv4.sysctl_igmp_qrv = 2;
  
 +      net->ipv4.sysctl_fib_notify_on_flag_change = 0;
 +
        return 0;
  }
  
diff --combined net/ipv4/tcp.c
  #include <asm/ioctls.h>
  #include <net/busy_poll.h>
  
 +/* Track pending CMSGs. */
 +enum {
 +      TCP_CMSG_INQ = 1,
 +      TCP_CMSG_TS = 2
 +};
 +
  struct percpu_counter tcp_orphan_count;
  EXPORT_SYMBOL_GPL(tcp_orphan_count);
  
@@@ -481,11 -475,19 +481,11 @@@ static void tcp_tx_timestamp(struct soc
        }
  }
  
 -static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
 -                                        int target, struct sock *sk)
 +static bool tcp_stream_is_readable(struct sock *sk, int target)
  {
 -      int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
 -
 -      if (avail > 0) {
 -              if (avail >= target)
 -                      return true;
 -              if (tcp_rmem_pressure(sk))
 -                      return true;
 -              if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
 -                      return true;
 -      }
 +      if (tcp_epollin_ready(sk, target))
 +              return true;
 +
        if (sk->sk_prot->stream_memory_read)
                return sk->sk_prot->stream_memory_read(sk);
        return false;
@@@ -560,7 -562,7 +560,7 @@@ __poll_t tcp_poll(struct file *file, st
                    tp->urg_data)
                        target++;
  
 -              if (tcp_stream_is_readable(tp, target, sk))
 +              if (tcp_stream_is_readable(sk, target))
                        mask |= EPOLLIN | EPOLLRDNORM;
  
                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
@@@ -1737,20 -1739,6 +1737,20 @@@ int tcp_set_rcvlowat(struct sock *sk, i
  }
  EXPORT_SYMBOL(tcp_set_rcvlowat);
  
 +static void tcp_update_recv_tstamps(struct sk_buff *skb,
 +                                  struct scm_timestamping_internal *tss)
 +{
 +      if (skb->tstamp)
 +              tss->ts[0] = ktime_to_timespec64(skb->tstamp);
 +      else
 +              tss->ts[0] = (struct timespec64) {0};
 +
 +      if (skb_hwtstamps(skb)->hwtstamp)
 +              tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
 +      else
 +              tss->ts[2] = (struct timespec64) {0};
 +}
 +
  #ifdef CONFIG_MMU
  static const struct vm_operations_struct tcp_vm_ops = {
  };
@@@ -1854,13 -1842,13 +1854,13 @@@ static int tcp_recvmsg_locked(struct so
                              struct scm_timestamping_internal *tss,
                              int *cmsg_flags);
  static int receive_fallback_to_copy(struct sock *sk,
 -                                  struct tcp_zerocopy_receive *zc, int inq)
 +                                  struct tcp_zerocopy_receive *zc, int inq,
 +                                  struct scm_timestamping_internal *tss)
  {
        unsigned long copy_address = (unsigned long)zc->copybuf_address;
 -      struct scm_timestamping_internal tss_unused;
 -      int err, cmsg_flags_unused;
        struct msghdr msg = {};
        struct iovec iov;
 +      int err;
  
        zc->length = 0;
        zc->recv_skip_hint = 0;
                return err;
  
        err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
 -                               &tss_unused, &cmsg_flags_unused);
 +                               tss, &zc->msg_flags);
        if (err < 0)
                return err;
  
@@@ -1915,27 -1903,21 +1915,27 @@@ static int tcp_copy_straggler_data(stru
        return (__s32)copylen;
  }
  
 -static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
 -                                           struct sock *sk,
 -                                           struct sk_buff *skb,
 -                                           u32 *seq,
 -                                           s32 copybuf_len)
 +static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
 +                                struct sock *sk,
 +                                struct sk_buff *skb,
 +                                u32 *seq,
 +                                s32 copybuf_len,
 +                                struct scm_timestamping_internal *tss)
  {
        u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
  
        if (!copylen)
                return 0;
        /* skb is null if inq < PAGE_SIZE. */
 -      if (skb)
 +      if (skb) {
                offset = *seq - TCP_SKB_CB(skb)->seq;
 -      else
 +      } else {
                skb = tcp_recv_skb(sk, *seq, &offset);
 +              if (TCP_SKB_CB(skb)->has_rxtstamp) {
 +                      tcp_update_recv_tstamps(skb, tss);
 +                      zc->msg_flags |= TCP_CMSG_TS;
 +              }
 +      }
  
        zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
                                                  seq);
@@@ -2022,38 -2004,9 +2022,38 @@@ static int tcp_zerocopy_vm_insert_batch
                err);
  }
  
 +#define TCP_VALID_ZC_MSG_FLAGS   (TCP_CMSG_TS)
 +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 +                             struct scm_timestamping_internal *tss);
 +static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
 +                                    struct tcp_zerocopy_receive *zc,
 +                                    struct scm_timestamping_internal *tss)
 +{
 +      unsigned long msg_control_addr;
 +      struct msghdr cmsg_dummy;
 +
 +      msg_control_addr = (unsigned long)zc->msg_control;
 +      cmsg_dummy.msg_control = (void *)msg_control_addr;
 +      cmsg_dummy.msg_controllen =
 +              (__kernel_size_t)zc->msg_controllen;
 +      cmsg_dummy.msg_flags = in_compat_syscall()
 +              ? MSG_CMSG_COMPAT : 0;
 +      zc->msg_flags = 0;
 +      if (zc->msg_control == msg_control_addr &&
 +          zc->msg_controllen == cmsg_dummy.msg_controllen) {
 +              tcp_recv_timestamp(&cmsg_dummy, sk, tss);
 +              zc->msg_control = (__u64)
 +                      ((uintptr_t)cmsg_dummy.msg_control);
 +              zc->msg_controllen =
 +                      (__u64)cmsg_dummy.msg_controllen;
 +              zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
 +      }
 +}
 +
  #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
  static int tcp_zerocopy_receive(struct sock *sk,
 -                              struct tcp_zerocopy_receive *zc)
 +                              struct tcp_zerocopy_receive *zc,
 +                              struct scm_timestamping_internal *tss)
  {
        u32 length = 0, offset, vma_len, avail_len, copylen = 0;
        unsigned long address = (unsigned long)zc->address;
        int ret;
  
        zc->copybuf_len = 0;
 +      zc->msg_flags = 0;
  
        if (address & (PAGE_SIZE - 1) || address != zc->address)
                return -EINVAL;
        sock_rps_record_flow(sk);
  
        if (inq && inq <= copybuf_len)
 -              return receive_fallback_to_copy(sk, zc, inq);
 +              return receive_fallback_to_copy(sk, zc, inq, tss);
  
        if (inq < PAGE_SIZE) {
                zc->length = 0;
                        } else {
                                skb = tcp_recv_skb(sk, seq, &offset);
                        }
 +
 +                      if (TCP_SKB_CB(skb)->has_rxtstamp) {
 +                              tcp_update_recv_tstamps(skb, tss);
 +                              zc->msg_flags |= TCP_CMSG_TS;
 +                      }
                        zc->recv_skip_hint = skb->len - offset;
                        frags = skb_advance_to_frag(skb, offset, &offset_frag);
                        if (!frags || offset_frag)
@@@ -2173,7 -2120,8 +2173,7 @@@ out
        mmap_read_unlock(current->mm);
        /* Try to copy straggler data. */
        if (!ret)
 -              copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
 -                                                          copybuf_len);
 +              copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
  
        if (length + copylen) {
                WRITE_ONCE(tp->copied_seq, seq);
  }
  #endif
  
 -static void tcp_update_recv_tstamps(struct sk_buff *skb,
 -                                  struct scm_timestamping_internal *tss)
 -{
 -      if (skb->tstamp)
 -              tss->ts[0] = ktime_to_timespec64(skb->tstamp);
 -      else
 -              tss->ts[0] = (struct timespec64) {0};
 -
 -      if (skb_hwtstamps(skb)->hwtstamp)
 -              tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
 -      else
 -              tss->ts[2] = (struct timespec64) {0};
 -}
 -
  /* Similar to __sock_recv_timestamp, but does not require an skb */
  static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
                               struct scm_timestamping_internal *tss)
@@@ -2310,7 -2272,7 +2310,7 @@@ static int tcp_recvmsg_locked(struct so
                goto out;
  
        if (tp->recvmsg_inq)
 -              *cmsg_flags = 1;
 +              *cmsg_flags = TCP_CMSG_INQ;
        timeo = sock_rcvtimeo(sk, nonblock);
  
        /* Urgent data needs to be handled specially. */
@@@ -2491,7 -2453,7 +2491,7 @@@ skip_copy
  
                if (TCP_SKB_CB(skb)->has_rxtstamp) {
                        tcp_update_recv_tstamps(skb, tss);
 -                      *cmsg_flags |= 2;
 +                      *cmsg_flags |= TCP_CMSG_TS;
                }
  
                if (used + offset < skb->len)
@@@ -2551,9 -2513,9 +2551,9 @@@ int tcp_recvmsg(struct sock *sk, struc
        release_sock(sk);
  
        if (cmsg_flags && ret >= 0) {
 -              if (cmsg_flags & 2)
 +              if (cmsg_flags & TCP_CMSG_TS)
                        tcp_recv_timestamp(msg, sk, &tss);
 -              if (cmsg_flags & 1) {
 +              if (cmsg_flags & TCP_CMSG_INQ) {
                        inq = tcp_inq_hint(sk);
                        put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
                }
@@@ -3805,24 -3767,11 +3805,24 @@@ static size_t tcp_opt_stats_get_size(vo
                nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
                nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
                nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
 +              nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
                0;
  }
  
 +/* Returns TTL or hop limit of an incoming packet from skb. */
 +static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
 +{
 +      if (skb->protocol == htons(ETH_P_IP))
 +              return ip_hdr(skb)->ttl;
 +      else if (skb->protocol == htons(ETH_P_IPV6))
 +              return ipv6_hdr(skb)->hop_limit;
 +      else
 +              return 0;
 +}
 +
  struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 -                                             const struct sk_buff *orig_skb)
 +                                             const struct sk_buff *orig_skb,
 +                                             const struct sk_buff *ack_skb)
  {
        const struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *stats;
                    max_t(int, 0, tp->write_seq - tp->snd_nxt));
        nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
                          TCP_NLA_PAD);
 +      if (ack_skb)
 +              nla_put_u8(stats, TCP_NLA_TTL,
 +                         tcp_skb_ttl_or_hop_limit(ack_skb));
  
        return stats;
  }
@@@ -4137,7 -4083,6 +4137,7 @@@ static int do_tcp_getsockopt(struct soc
        }
  #ifdef CONFIG_MMU
        case TCP_ZEROCOPY_RECEIVE: {
 +              struct scm_timestamping_internal tss;
                struct tcp_zerocopy_receive zc = {};
                int err;
  
                        return -EFAULT;
                if (len < offsetofend(struct tcp_zerocopy_receive, length))
                        return -EINVAL;
 -              if (len > sizeof(zc)) {
 +              if (unlikely(len > sizeof(zc))) {
 +                      err = check_zeroed_user(optval + sizeof(zc),
 +                                              len - sizeof(zc));
 +                      if (err < 1)
 +                              return err == 0 ? -EINVAL : err;
                        len = sizeof(zc);
                        if (put_user(len, optlen))
                                return -EFAULT;
                }
                if (copy_from_user(&zc, optval, len))
                        return -EFAULT;
 +              if (zc.reserved)
 +                      return -EINVAL;
 +              if (zc.msg_flags &  ~(TCP_VALID_ZC_MSG_FLAGS))
 +                      return -EINVAL;
                lock_sock(sk);
 -              err = tcp_zerocopy_receive(sk, &zc);
 +              err = tcp_zerocopy_receive(sk, &zc, &tss);
+               err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+                                                         &zc, &len, err);
                release_sock(sk);
 -              if (len >= offsetofend(struct tcp_zerocopy_receive, err))
 -                      goto zerocopy_rcv_sk_err;
 +              if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
 +                      goto zerocopy_rcv_cmsg;
                switch (len) {
 +              case offsetofend(struct tcp_zerocopy_receive, msg_flags):
 +                      goto zerocopy_rcv_cmsg;
 +              case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
 +              case offsetofend(struct tcp_zerocopy_receive, msg_control):
 +              case offsetofend(struct tcp_zerocopy_receive, flags):
 +              case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
 +              case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
                case offsetofend(struct tcp_zerocopy_receive, err):
                        goto zerocopy_rcv_sk_err;
                case offsetofend(struct tcp_zerocopy_receive, inq):
                default:
                        goto zerocopy_rcv_out;
                }
 +zerocopy_rcv_cmsg:
 +              if (zc.msg_flags & TCP_CMSG_TS)
 +                      tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
 +              else
 +                      zc.msg_flags = 0;
  zerocopy_rcv_sk_err:
                if (!err)
                        zc.err = sock_error(sk);
@@@ -4208,6 -4135,18 +4210,18 @@@ zerocopy_rcv_out
        return 0;
  }
  
+ bool tcp_bpf_bypass_getsockopt(int level, int optname)
+ {
+       /* TCP do_tcp_getsockopt has optimized getsockopt implementation
+        * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
+        */
+       if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+               return true;
+       return false;
+ }
+ EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+
  int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
                   int __user *optlen)
  {
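
On the userspace side, the new msg_control/msg_controllen/msg_flags members let the TCP_ZEROCOPY_RECEIVE getsockopt() hand back the same SCM_TIMESTAMPING control messages a recvmsg() would. A hedged sketch against the uapi struct layout in this series (error handling and the mmap()/SO_TIMESTAMPING setup are assumed to exist elsewhere):

#include <linux/errqueue.h>	/* struct scm_timestamping */
#include <linux/tcp.h>		/* struct tcp_zerocopy_receive, TCP_ZEROCOPY_RECEIVE */
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <string.h>
#include <sys/socket.h>

/* fd: connected TCP socket with SO_TIMESTAMPING enabled;
 * map/map_len: page-aligned area previously set up for zerocopy receive.
 */
static long zc_receive_with_tstamps(int fd, void *map, unsigned long map_len)
{
	char cmsg_buf[CMSG_SPACE(sizeof(struct scm_timestamping))];
	struct tcp_zerocopy_receive zc;
	socklen_t len = sizeof(zc);

	memset(&zc, 0, sizeof(zc));
	zc.address        = (__u64)(unsigned long)map;
	zc.length         = map_len;
	zc.msg_control    = (__u64)(unsigned long)cmsg_buf;
	zc.msg_controllen = sizeof(cmsg_buf);

	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &len))
		return -1;

	/* On return, zc.msg_controllen/zc.msg_flags describe any cmsg data
	 * (receive timestamps) the kernel wrote into cmsg_buf.
	 */
	return zc.length;
}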
diff --combined net/ipv4/tcp_ipv4.c
@@@ -1649,8 -1649,6 +1649,8 @@@ u16 tcp_v4_get_syncookie(struct sock *s
        return mss;
  }
  
 +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 +                                                         u32));
  /* The socket must have it's spinlock held when we get
   * here, unless it is a TCP_LISTEN socket.
   *
@@@ -1670,8 -1668,7 +1670,8 @@@ int tcp_v4_do_rcv(struct sock *sk, stru
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 -                          !dst->ops->check(dst, 0)) {
 +                          !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
 +                                           dst, 0)) {
                                dst_release(dst);
                                sk->sk_rx_dst = NULL;
                        }
@@@ -2796,6 -2793,7 +2796,7 @@@ struct proto tcp_prot = 
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
+       .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
        .keepalive              = tcp_set_keepalive,
        .recvmsg                = tcp_recvmsg,
        .sendmsg                = tcp_sendmsg,
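
The INDIRECT_CALL_1() wrappers used in tcp_v4_do_rcv()/tcp_v6_do_rcv() above turn the dst->ops->check() indirect call into a compare plus direct call when the target is the expected ipv4_dst_check()/ip6_dst_check(), avoiding a retpoline on the hot path. Roughly (a simplified rendering of include/linux/indirect_call_wrapper.h, not the exact macro text):

/* Simplified sketch of the wrapper's behaviour */
#ifdef CONFIG_RETPOLINE
#define INDIRECT_CALL_1(f, f1, ...) \
	(likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__))
#else
#define INDIRECT_CALL_1(f, f1, ...) f(__VA_ARGS__)
#endif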
diff --combined net/ipv4/udp.c
@@@ -596,12 -596,6 +596,12 @@@ void udp_encap_enable(void
  }
  EXPORT_SYMBOL(udp_encap_enable);
  
 +void udp_encap_disable(void)
 +{
 +      static_branch_dec(&udp_encap_needed_key);
 +}
 +EXPORT_SYMBOL(udp_encap_disable);
 +
  /* Handler for tunnels with arbitrary destination ports: no socket lookup, go
   * through error handlers in encapsulations looking for a match.
   */
@@@ -1130,7 -1124,7 +1130,7 @@@ int udp_sendmsg(struct sock *sk, struc
                rcu_read_unlock();
        }
  
-       if (cgroup_bpf_enabled && !connected) {
+       if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) {
                err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
                                            (struct sockaddr *)usin, &ipc.addr);
                if (err)
@@@ -1864,9 -1858,8 +1864,8 @@@ try_again
                memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
                *addr_len = sizeof(*sin);
  
-               if (cgroup_bpf_enabled)
-                       BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
-                                                       (struct sockaddr *)sin);
+               BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+                                                     (struct sockaddr *)sin);
        }
  
        if (udp_sk(sk)->gro_enabled)
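
The open-coded cgroup_bpf_enabled checks around the recvmsg hooks could be dropped, and the sendmsg checks became typed, because the guard is now per attach type and folded into the BPF_CGROUP_RUN_PROG_*() macros. Roughly, and simplified from the bpf-cgroup header changes in this series (names here are a sketch of that change, not quoted from this diff):

/* Before: one static key gates every cgroup-bpf hook */
#define cgroup_bpf_enabled \
	static_branch_unlikely(&cgroup_bpf_enabled_key)

/* After: one static key per attach type */
#define cgroup_bpf_enabled(atype) \
	static_branch_unlikely(&cgroup_bpf_enabled_key[atype])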
diff --combined net/ipv6/af_inet6.c
@@@ -295,7 -295,8 +295,8 @@@ static int __inet6_bind(struct sock *sk
                return -EINVAL;
  
        snum = ntohs(addr->sin6_port);
-       if (snum && inet_port_requires_bind_service(net, snum) &&
+       if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+           snum && inet_port_requires_bind_service(net, snum) &&
            !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                return -EACCES;
  
@@@ -439,6 -440,7 +440,7 @@@ out_unlock
  int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
  {
        struct sock *sk = sock->sk;
+       u32 flags = BIND_WITH_LOCK;
        int err = 0;
  
        /* If the socket has its own bind function then use it. */
        /* BPF prog is run before any checks are done so that if the prog
         * changes context in a wrong way it will be caught.
         */
-       err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr);
+       err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+                                                BPF_CGROUP_INET6_BIND, &flags);
        if (err)
                return err;
  
-       return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
+       return __inet6_bind(sk, uaddr, addr_len, flags);
  }
  EXPORT_SYMBOL(inet6_bind);
  
@@@ -527,18 -530,19 +530,19 @@@ int inet6_getname(struct socket *sock, 
                sin->sin6_addr = sk->sk_v6_daddr;
                if (np->sndflow)
                        sin->sin6_flowinfo = np->flow_label;
+               BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+                                           BPF_CGROUP_INET6_GETPEERNAME,
+                                           NULL);
        } else {
                if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
                        sin->sin6_addr = np->saddr;
                else
                        sin->sin6_addr = sk->sk_v6_rcv_saddr;
                sin->sin6_port = inet->inet_sport;
-       }
-       if (cgroup_bpf_enabled)
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                                           peer ? BPF_CGROUP_INET6_GETPEERNAME :
-                                                  BPF_CGROUP_INET6_GETSOCKNAME,
+                                           BPF_CGROUP_INET6_GETSOCKNAME,
                                            NULL);
+       }
        sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
                                                 sk->sk_bound_dev_if);
        return sizeof(*sin);
@@@ -954,7 -958,6 +958,7 @@@ static int __net_init inet6_net_init(st
        net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT;
        net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN;
        net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN;
 +      net->ipv6.sysctl.fib_notify_on_flag_change = 0;
        atomic_set(&net->ipv6.fib6_sernum, 1);
  
        err = ipv6_init_mibs(net);
diff --combined net/ipv6/tcp_ipv6.c
@@@ -1420,8 -1420,6 +1420,8 @@@ out
        return NULL;
  }
  
 +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 +                                                         u32));
  /* The socket must have it's spinlock held when we get
   * here, unless it is a TCP_LISTEN socket.
   *
@@@ -1475,8 -1473,7 +1475,8 @@@ static int tcp_v6_do_rcv(struct sock *s
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 -                          dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
 +                          INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
 +                                          dst, np->rx_dst_cookie) == NULL) {
                                dst_release(dst);
                                sk->sk_rx_dst = NULL;
                        }
@@@ -2124,6 -2121,7 +2124,7 @@@ struct proto tcpv6_prot = 
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
+       .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
        .keepalive              = tcp_set_keepalive,
        .recvmsg                = tcp_recvmsg,
        .sendmsg                = tcp_sendmsg,
diff --combined net/ipv6/udp.c
@@@ -409,9 -409,8 +409,8 @@@ try_again
                }
                *addr_len = sizeof(*sin6);
  
-               if (cgroup_bpf_enabled)
-                       BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
-                                               (struct sockaddr *)sin6);
+               BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+                                                     (struct sockaddr *)sin6);
        }
  
        if (udp_sk(sk)->gro_enabled)
@@@ -1462,7 -1461,7 +1461,7 @@@ do_udp_sendmsg
                fl6.saddr = np->saddr;
        fl6.fl6_sport = inet->inet_sport;
  
-       if (cgroup_bpf_enabled && !connected) {
+       if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) {
                err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
                                           (struct sockaddr *)sin6, &fl6.saddr);
                if (err)
@@@ -1608,10 -1607,8 +1607,10 @@@ void udpv6_destroy_sock(struct sock *sk
                        if (encap_destroy)
                                encap_destroy(sk);
                }
 -              if (up->encap_enabled)
 +              if (up->encap_enabled) {
                        static_branch_dec(&udpv6_encap_needed_key);
 +                      udp_encap_disable();
 +              }
        }
  
        inet6_destroy_sock(sk);