1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  drivers/net/veth.c
4  *
5  *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
6  *
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
9  *
10  */
11
12 #include <linux/netdevice.h>
13 #include <linux/slab.h>
14 #include <linux/ethtool.h>
15 #include <linux/etherdevice.h>
16 #include <linux/u64_stats_sync.h>
17
18 #include <net/rtnetlink.h>
19 #include <net/dst.h>
20 #include <net/xfrm.h>
21 #include <net/xdp.h>
22 #include <linux/veth.h>
23 #include <linux/module.h>
24 #include <linux/bpf.h>
25 #include <linux/filter.h>
26 #include <linux/ptr_ring.h>
27 #include <linux/bpf_trace.h>
28 #include <linux/net_tstamp.h>
29
30 #define DRV_NAME        "veth"
31 #define DRV_VERSION     "1.0"
32
33 #define VETH_XDP_FLAG           BIT(0)
34 #define VETH_RING_SIZE          256
35 #define VETH_XDP_HEADROOM       (XDP_PACKET_HEADROOM + NET_IP_ALIGN)
36
37 #define VETH_XDP_TX_BULK_SIZE   16
38 #define VETH_XDP_BATCH          16
39
40 struct veth_stats {
41         u64     rx_drops;
42         /* xdp */
43         u64     xdp_packets;
44         u64     xdp_bytes;
45         u64     xdp_redirect;
46         u64     xdp_drops;
47         u64     xdp_tx;
48         u64     xdp_tx_err;
49         u64     peer_tq_xdp_xmit;
50         u64     peer_tq_xdp_xmit_err;
51 };
52
53 struct veth_rq_stats {
54         struct veth_stats       vs;
55         struct u64_stats_sync   syncp;
56 };
57
58 struct veth_rq {
59         struct napi_struct      xdp_napi;
60         struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
61         struct net_device       *dev;
62         struct bpf_prog __rcu   *xdp_prog;
63         struct xdp_mem_info     xdp_mem;
64         struct veth_rq_stats    stats;
65         bool                    rx_notify_masked;
66         struct ptr_ring         xdp_ring;
67         struct xdp_rxq_info     xdp_rxq;
68 };
69
70 struct veth_priv {
71         struct net_device __rcu *peer;
72         atomic64_t              dropped;
73         struct bpf_prog         *_xdp_prog;
74         struct veth_rq          *rq;
75         unsigned int            requested_headroom;
76 };
77
78 struct veth_xdp_tx_bq {
79         struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
80         unsigned int count;
81 };
82
83 /*
84  * ethtool interface
85  */
86
87 struct veth_q_stat_desc {
88         char    desc[ETH_GSTRING_LEN];
89         size_t  offset;
90 };
91
92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m)
93
94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
95         { "xdp_packets",        VETH_RQ_STAT(xdp_packets) },
96         { "xdp_bytes",          VETH_RQ_STAT(xdp_bytes) },
97         { "drops",              VETH_RQ_STAT(rx_drops) },
98         { "xdp_redirect",       VETH_RQ_STAT(xdp_redirect) },
99         { "xdp_drops",          VETH_RQ_STAT(xdp_drops) },
100         { "xdp_tx",             VETH_RQ_STAT(xdp_tx) },
101         { "xdp_tx_errors",      VETH_RQ_STAT(xdp_tx_err) },
102 };
103
104 #define VETH_RQ_STATS_LEN       ARRAY_SIZE(veth_rq_stats_desc)
105
106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
107         { "xdp_xmit",           VETH_RQ_STAT(peer_tq_xdp_xmit) },
108         { "xdp_xmit_errors",    VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
109 };
110
111 #define VETH_TQ_STATS_LEN       ARRAY_SIZE(veth_tq_stats_desc)
112
113 static struct {
114         const char string[ETH_GSTRING_LEN];
115 } ethtool_stats_keys[] = {
116         { "peer_ifindex" },
117 };
118
119 static int veth_get_link_ksettings(struct net_device *dev,
120                                    struct ethtool_link_ksettings *cmd)
121 {
122         cmd->base.speed         = SPEED_10000;
123         cmd->base.duplex        = DUPLEX_FULL;
124         cmd->base.port          = PORT_TP;
125         cmd->base.autoneg       = AUTONEG_DISABLE;
126         return 0;
127 }
128
129 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
130 {
131         strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
132         strlcpy(info->version, DRV_VERSION, sizeof(info->version));
133 }
134
135 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
136 {
137         u8 *p = buf;
138         int i, j;
139
140         switch (stringset) {
141         case ETH_SS_STATS:
142                 memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
143                 p += sizeof(ethtool_stats_keys);
144                 for (i = 0; i < dev->real_num_rx_queues; i++)
145                         for (j = 0; j < VETH_RQ_STATS_LEN; j++)
146                                 ethtool_sprintf(&p, "rx_queue_%u_%.18s",
147                                                 i, veth_rq_stats_desc[j].desc);
148
149                 for (i = 0; i < dev->real_num_tx_queues; i++)
150                         for (j = 0; j < VETH_TQ_STATS_LEN; j++)
151                                 ethtool_sprintf(&p, "tx_queue_%u_%.18s",
152                                                 i, veth_tq_stats_desc[j].desc);
153                 break;
154         }
155 }
156
157 static int veth_get_sset_count(struct net_device *dev, int sset)
158 {
159         switch (sset) {
160         case ETH_SS_STATS:
161                 return ARRAY_SIZE(ethtool_stats_keys) +
162                        VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
163                        VETH_TQ_STATS_LEN * dev->real_num_tx_queues;
164         default:
165                 return -EOPNOTSUPP;
166         }
167 }
168
169 static void veth_get_ethtool_stats(struct net_device *dev,
170                 struct ethtool_stats *stats, u64 *data)
171 {
172         struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
173         struct net_device *peer = rtnl_dereference(priv->peer);
174         int i, j, idx;
175
176         data[0] = peer ? peer->ifindex : 0;
177         idx = 1;
178         for (i = 0; i < dev->real_num_rx_queues; i++) {
179                 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
180                 const void *stats_base = (void *)&rq_stats->vs;
181                 unsigned int start;
182                 size_t offset;
183
184                 do {
185                         start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
186                         for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
187                                 offset = veth_rq_stats_desc[j].offset;
188                                 data[idx + j] = *(u64 *)(stats_base + offset);
189                         }
190                 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
191                 idx += VETH_RQ_STATS_LEN;
192         }
193
194         if (!peer)
195                 return;
196
197         rcv_priv = netdev_priv(peer);
198         for (i = 0; i < peer->real_num_rx_queues; i++) {
199                 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
200                 const void *base = (void *)&rq_stats->vs;
201                 unsigned int start, tx_idx = idx;
202                 size_t offset;
203
204                 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
205                 do {
206                         start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
207                         for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
208                                 offset = veth_tq_stats_desc[j].offset;
209                                 data[tx_idx + j] += *(u64 *)(base + offset);
210                         }
211                 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
212         }
213 }
214
215 static void veth_get_channels(struct net_device *dev,
216                               struct ethtool_channels *channels)
217 {
218         channels->tx_count = dev->real_num_tx_queues;
219         channels->rx_count = dev->real_num_rx_queues;
220         channels->max_tx = dev->num_tx_queues;
221         channels->max_rx = dev->num_rx_queues;
222 }
223
224 static int veth_set_channels(struct net_device *dev,
225                              struct ethtool_channels *ch);
226
227 static const struct ethtool_ops veth_ethtool_ops = {
228         .get_drvinfo            = veth_get_drvinfo,
229         .get_link               = ethtool_op_get_link,
230         .get_strings            = veth_get_strings,
231         .get_sset_count         = veth_get_sset_count,
232         .get_ethtool_stats      = veth_get_ethtool_stats,
233         .get_link_ksettings     = veth_get_link_ksettings,
234         .get_ts_info            = ethtool_op_get_ts_info,
235         .get_channels           = veth_get_channels,
236         .set_channels           = veth_set_channels,
237 };
238
239 /* general routines */
240
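/* Entries on the per-queue xdp_ring are either plain sk_buff pointers or
 * xdp_frame pointers tagged with VETH_XDP_FLAG in their low bit; the helpers
 * below set, test and strip that tag.
 */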
241 static bool veth_is_xdp_frame(void *ptr)
242 {
243         return (unsigned long)ptr & VETH_XDP_FLAG;
244 }
245
246 static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
247 {
248         return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
249 }
250
251 static void *veth_xdp_to_ptr(struct xdp_frame *xdp)
252 {
253         return (void *)((unsigned long)xdp | VETH_XDP_FLAG);
254 }
255
256 static void veth_ptr_free(void *ptr)
257 {
258         if (veth_is_xdp_frame(ptr))
259                 xdp_return_frame(veth_ptr_to_xdp(ptr));
260         else
261                 kfree_skb(ptr);
262 }
263
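/* Schedule NAPI on the receive queue to drain its xdp_ring, unless rx
 * notifications are masked because NAPI is already scheduled.
 */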
264 static void __veth_xdp_flush(struct veth_rq *rq)
265 {
266         /* Write ptr_ring before reading rx_notify_masked */
267         smp_mb();
268         if (!READ_ONCE(rq->rx_notify_masked) &&
269             napi_schedule_prep(&rq->xdp_napi)) {
270                 WRITE_ONCE(rq->rx_notify_masked, true);
271                 __napi_schedule(&rq->xdp_napi);
272         }
273 }
274
275 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
276 {
277         if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
278                 dev_kfree_skb_any(skb);
279                 return NET_RX_DROP;
280         }
281
282         return NET_RX_SUCCESS;
283 }
284
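/* Hand the skb to the peer device: queue it on the peer's xdp_ring when
 * NAPI/XDP is active there, otherwise fall back to netif_rx().
 */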
285 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
286                             struct veth_rq *rq, bool xdp)
287 {
288         return __dev_forward_skb(dev, skb) ?: xdp ?
289                 veth_xdp_rx(rq, skb) :
290                 netif_rx(skb);
291 }
292
293 /* Return true if the specified skb has chances of GRO aggregation.
294  * Don't strive for accuracy, but try to avoid GRO overhead in the most
295  * common scenarios.
296  * When XDP is enabled, all traffic is considered eligible, as the xmit
297  * device has TSO off.
298  * When TSO is enabled on the xmit device, we are likely interested only
299  * in UDP aggregation; explicitly check whether the skb is suspected to
300  * belong to locally generated UDP traffic (the sock_wfree destructor is
301  * used by UDP, ICMP and XDP sockets).
302  */
303 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
304                                          const struct net_device *rcv,
305                                          const struct sk_buff *skb)
306 {
307         return !(dev->features & NETIF_F_ALL_TSO) ||
308                 (skb->destructor == sock_wfree &&
309                  rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
310 }
311
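/* ndo_start_xmit: deliver the skb straight to the peer. NAPI is used when the
 * peer has an XDP program attached or GRO enabled and the skb may benefit from
 * aggregation; otherwise the legacy netif_rx() path is taken.
 */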
312 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
313 {
314         struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
315         struct veth_rq *rq = NULL;
316         struct net_device *rcv;
317         int length = skb->len;
318         bool use_napi = false;
319         int rxq;
320
321         rcu_read_lock();
322         rcv = rcu_dereference(priv->peer);
323         if (unlikely(!rcv)) {
324                 kfree_skb(skb);
325                 goto drop;
326         }
327
328         rcv_priv = netdev_priv(rcv);
329         rxq = skb_get_queue_mapping(skb);
330         if (rxq < rcv->real_num_rx_queues) {
331                 rq = &rcv_priv->rq[rxq];
332
333                 /* The napi pointer is available when an XDP program is
334                  * attached or when GRO is enabled.
335                  * Don't bother with napi/GRO if the skb can't be aggregated.
336                  */
337                 use_napi = rcu_access_pointer(rq->napi) &&
338                            veth_skb_is_eligible_for_gro(dev, rcv, skb);
339         }
340
341         skb_tx_timestamp(skb);
342         if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
343                 if (!use_napi)
344                         dev_lstats_add(dev, length);
345         } else {
346 drop:
347                 atomic64_inc(&priv->dropped);
348         }
349
350         if (use_napi)
351                 __veth_xdp_flush(rq);
352
353         rcu_read_unlock();
354
355         return NETDEV_TX_OK;
356 }
357
358 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
359 {
360         struct veth_priv *priv = netdev_priv(dev);
361
362         dev_lstats_read(dev, packets, bytes);
363         return atomic64_read(&priv->dropped);
364 }
365
366 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
367 {
368         struct veth_priv *priv = netdev_priv(dev);
369         int i;
370
371         result->peer_tq_xdp_xmit_err = 0;
372         result->xdp_packets = 0;
373         result->xdp_tx_err = 0;
374         result->xdp_bytes = 0;
375         result->rx_drops = 0;
376         for (i = 0; i < dev->num_rx_queues; i++) {
377                 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
378                 struct veth_rq_stats *stats = &priv->rq[i].stats;
379                 unsigned int start;
380
381                 do {
382                         start = u64_stats_fetch_begin_irq(&stats->syncp);
383                         peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
384                         xdp_tx_err = stats->vs.xdp_tx_err;
385                         packets = stats->vs.xdp_packets;
386                         bytes = stats->vs.xdp_bytes;
387                         drops = stats->vs.rx_drops;
388                 } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
389                 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
390                 result->xdp_tx_err += xdp_tx_err;
391                 result->xdp_packets += packets;
392                 result->xdp_bytes += bytes;
393                 result->rx_drops += drops;
394         }
395 }
396
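/* A veth device's rx traffic is its peer's tx traffic: fold the peer's
 * counters into the opposite direction of this device's totals.
 */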
397 static void veth_get_stats64(struct net_device *dev,
398                              struct rtnl_link_stats64 *tot)
399 {
400         struct veth_priv *priv = netdev_priv(dev);
401         struct net_device *peer;
402         struct veth_stats rx;
403         u64 packets, bytes;
404
405         tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
406         tot->tx_bytes = bytes;
407         tot->tx_packets = packets;
408
409         veth_stats_rx(&rx, dev);
410         tot->tx_dropped += rx.xdp_tx_err;
411         tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
412         tot->rx_bytes = rx.xdp_bytes;
413         tot->rx_packets = rx.xdp_packets;
414
415         rcu_read_lock();
416         peer = rcu_dereference(priv->peer);
417         if (peer) {
418                 veth_stats_tx(peer, &packets, &bytes);
419                 tot->rx_bytes += bytes;
420                 tot->rx_packets += packets;
421
422                 veth_stats_rx(&rx, peer);
423                 tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
424                 tot->rx_dropped += rx.xdp_tx_err;
425                 tot->tx_bytes += rx.xdp_bytes;
426                 tot->tx_packets += rx.xdp_packets;
427         }
428         rcu_read_unlock();
429 }
430
431 /* fake multicast ability */
432 static void veth_set_multicast_list(struct net_device *dev)
433 {
434 }
435
436 static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
437                                       int buflen)
438 {
439         struct sk_buff *skb;
440
441         skb = build_skb(head, buflen);
442         if (!skb)
443                 return NULL;
444
445         skb_reserve(skb, headroom);
446         skb_put(skb, len);
447
448         return skb;
449 }
450
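/* Pick a peer rx queue for ndo_xdp_xmit based on the current CPU */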
451 static int veth_select_rxq(struct net_device *dev)
452 {
453         return smp_processor_id() % dev->real_num_rx_queues;
454 }
455
456 static struct net_device *veth_peer_dev(struct net_device *dev)
457 {
458         struct veth_priv *priv = netdev_priv(dev);
459
460         /* Callers must be under RCU read side. */
461         return rcu_dereference(priv->peer);
462 }
463
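/* Queue up to @n xdp_frames on one of the peer's rx queues. Returns the
 * number of frames accepted, or a negative errno when the flags are invalid
 * or the peer's NAPI (and hence its xdp_ring) is not ready.
 */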
464 static int veth_xdp_xmit(struct net_device *dev, int n,
465                          struct xdp_frame **frames,
466                          u32 flags, bool ndo_xmit)
467 {
468         struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
469         int i, ret = -ENXIO, nxmit = 0;
470         struct net_device *rcv;
471         unsigned int max_len;
472         struct veth_rq *rq;
473
474         if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
475                 return -EINVAL;
476
477         rcu_read_lock();
478         rcv = rcu_dereference(priv->peer);
479         if (unlikely(!rcv))
480                 goto out;
481
482         rcv_priv = netdev_priv(rcv);
483         rq = &rcv_priv->rq[veth_select_rxq(rcv)];
484         /* The napi pointer is set if NAPI is enabled, which ensures that
485          * xdp_ring is initialized on receive side and the peer device is up.
486          */
487         if (!rcu_access_pointer(rq->napi))
488                 goto out;
489
490         max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
491
492         spin_lock(&rq->xdp_ring.producer_lock);
493         for (i = 0; i < n; i++) {
494                 struct xdp_frame *frame = frames[i];
495                 void *ptr = veth_xdp_to_ptr(frame);
496
497                 if (unlikely(frame->len > max_len ||
498                              __ptr_ring_produce(&rq->xdp_ring, ptr)))
499                         break;
500                 nxmit++;
501         }
502         spin_unlock(&rq->xdp_ring.producer_lock);
503
504         if (flags & XDP_XMIT_FLUSH)
505                 __veth_xdp_flush(rq);
506
507         ret = nxmit;
508         if (ndo_xmit) {
509                 u64_stats_update_begin(&rq->stats.syncp);
510                 rq->stats.vs.peer_tq_xdp_xmit += nxmit;
511                 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
512                 u64_stats_update_end(&rq->stats.syncp);
513         }
514
515 out:
516         rcu_read_unlock();
517
518         return ret;
519 }
520
521 static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
522                              struct xdp_frame **frames, u32 flags)
523 {
524         int err;
525
526         err = veth_xdp_xmit(dev, n, frames, flags, true);
527         if (err < 0) {
528                 struct veth_priv *priv = netdev_priv(dev);
529
530                 atomic64_add(n, &priv->dropped);
531         }
532
533         return err;
534 }
535
536 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
537 {
538         int sent, i, err = 0, drops;
539
540         sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
541         if (sent < 0) {
542                 err = sent;
543                 sent = 0;
544         }
545
546         for (i = sent; unlikely(i < bq->count); i++)
547                 xdp_return_frame(bq->q[i]);
548
549         drops = bq->count - sent;
550         trace_xdp_bulk_tx(rq->dev, sent, drops, err);
551
552         u64_stats_update_begin(&rq->stats.syncp);
553         rq->stats.vs.xdp_tx += sent;
554         rq->stats.vs.xdp_tx_err += drops;
555         u64_stats_update_end(&rq->stats.syncp);
556
557         bq->count = 0;
558 }
559
560 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
561 {
562         struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
563         struct net_device *rcv;
564         struct veth_rq *rcv_rq;
565
566         rcu_read_lock();
567         veth_xdp_flush_bq(rq, bq);
568         rcv = rcu_dereference(priv->peer);
569         if (unlikely(!rcv))
570                 goto out;
571
572         rcv_priv = netdev_priv(rcv);
573         rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
574         /* xdp_prog is only set once the receive side's xdp_ring is initialized */
575         if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
576                 goto out;
577
578         __veth_xdp_flush(rcv_rq);
579 out:
580         rcu_read_unlock();
581 }
582
583 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
584                        struct veth_xdp_tx_bq *bq)
585 {
586         struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
587
588         if (unlikely(!frame))
589                 return -EOVERFLOW;
590
591         if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
592                 veth_xdp_flush_bq(rq, bq);
593
594         bq->q[bq->count++] = frame;
595
596         return 0;
597 }
598
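/* Run the attached XDP program on an xdp_frame queued via ndo_xdp_xmit.
 * Returns the frame on XDP_PASS so it can be converted to an skb, or NULL
 * when the frame was transmitted, redirected or dropped.
 */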
599 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
600                                           struct xdp_frame *frame,
601                                           struct veth_xdp_tx_bq *bq,
602                                           struct veth_stats *stats)
603 {
604         struct xdp_frame orig_frame;
605         struct bpf_prog *xdp_prog;
606
607         rcu_read_lock();
608         xdp_prog = rcu_dereference(rq->xdp_prog);
609         if (likely(xdp_prog)) {
610                 struct xdp_buff xdp;
611                 u32 act;
612
613                 xdp_convert_frame_to_buff(frame, &xdp);
614                 xdp.rxq = &rq->xdp_rxq;
615
616                 act = bpf_prog_run_xdp(xdp_prog, &xdp);
617
618                 switch (act) {
619                 case XDP_PASS:
620                         if (xdp_update_frame_from_buff(&xdp, frame))
621                                 goto err_xdp;
622                         break;
623                 case XDP_TX:
624                         orig_frame = *frame;
625                         xdp.rxq->mem = frame->mem;
626                         if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
627                                 trace_xdp_exception(rq->dev, xdp_prog, act);
628                                 frame = &orig_frame;
629                                 stats->rx_drops++;
630                                 goto err_xdp;
631                         }
632                         stats->xdp_tx++;
633                         rcu_read_unlock();
634                         goto xdp_xmit;
635                 case XDP_REDIRECT:
636                         orig_frame = *frame;
637                         xdp.rxq->mem = frame->mem;
638                         if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
639                                 frame = &orig_frame;
640                                 stats->rx_drops++;
641                                 goto err_xdp;
642                         }
643                         stats->xdp_redirect++;
644                         rcu_read_unlock();
645                         goto xdp_xmit;
646                 default:
647                         bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
648                         fallthrough;
649                 case XDP_ABORTED:
650                         trace_xdp_exception(rq->dev, xdp_prog, act);
651                         fallthrough;
652                 case XDP_DROP:
653                         stats->xdp_drops++;
654                         goto err_xdp;
655                 }
656         }
657         rcu_read_unlock();
658
659         return frame;
660 err_xdp:
661         rcu_read_unlock();
662         xdp_return_frame(frame);
663 xdp_xmit:
664         return NULL;
665 }
666
667 /* the frames array contains at most VETH_XDP_BATCH entries */
668 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
669                                   int n_xdpf, struct veth_xdp_tx_bq *bq,
670                                   struct veth_stats *stats)
671 {
672         void *skbs[VETH_XDP_BATCH];
673         int i;
674
675         if (xdp_alloc_skb_bulk(skbs, n_xdpf,
676                                GFP_ATOMIC | __GFP_ZERO) < 0) {
677                 for (i = 0; i < n_xdpf; i++)
678                         xdp_return_frame(frames[i]);
679                 stats->rx_drops += n_xdpf;
680
681                 return;
682         }
683
684         for (i = 0; i < n_xdpf; i++) {
685                 struct sk_buff *skb = skbs[i];
686
687                 skb = __xdp_build_skb_from_frame(frames[i], skb,
688                                                  rq->dev);
689                 if (!skb) {
690                         xdp_return_frame(frames[i]);
691                         stats->rx_drops++;
692                         continue;
693                 }
694                 napi_gro_receive(&rq->xdp_napi, skb);
695         }
696 }
697
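/* Run the attached XDP program on an skb queued by veth_xmit. If the skb is
 * shared, nonlinear or short on headroom, it is first copied into a freshly
 * allocated page so the program can safely adjust head, tail and data.
 */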
698 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
699                                         struct sk_buff *skb,
700                                         struct veth_xdp_tx_bq *bq,
701                                         struct veth_stats *stats)
702 {
703         u32 pktlen, headroom, act, metalen, frame_sz;
704         void *orig_data, *orig_data_end;
705         struct bpf_prog *xdp_prog;
706         int mac_len, delta, off;
707         struct xdp_buff xdp;
708
709         skb_prepare_for_gro(skb);
710
711         rcu_read_lock();
712         xdp_prog = rcu_dereference(rq->xdp_prog);
713         if (unlikely(!xdp_prog)) {
714                 rcu_read_unlock();
715                 goto out;
716         }
717
718         mac_len = skb->data - skb_mac_header(skb);
719         pktlen = skb->len + mac_len;
720         headroom = skb_headroom(skb) - mac_len;
721
722         if (skb_shared(skb) || skb_head_is_locked(skb) ||
723             skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
724                 struct sk_buff *nskb;
725                 int size, head_off;
726                 void *head, *start;
727                 struct page *page;
728
729                 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
730                        SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
731                 if (size > PAGE_SIZE)
732                         goto drop;
733
734                 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
735                 if (!page)
736                         goto drop;
737
738                 head = page_address(page);
739                 start = head + VETH_XDP_HEADROOM;
740                 if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
741                         page_frag_free(head);
742                         goto drop;
743                 }
744
745                 nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len,
746                                       skb->len, PAGE_SIZE);
747                 if (!nskb) {
748                         page_frag_free(head);
749                         goto drop;
750                 }
751
752                 skb_copy_header(nskb, skb);
753                 head_off = skb_headroom(nskb) - skb_headroom(skb);
754                 skb_headers_offset_update(nskb, head_off);
755                 consume_skb(skb);
756                 skb = nskb;
757         }
758
759         /* SKB "head" area always has tailroom for skb_shared_info */
760         frame_sz = skb_end_pointer(skb) - skb->head;
761         frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
762         xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq);
763         xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true);
764
765         orig_data = xdp.data;
766         orig_data_end = xdp.data_end;
767
768         act = bpf_prog_run_xdp(xdp_prog, &xdp);
769
770         switch (act) {
771         case XDP_PASS:
772                 break;
773         case XDP_TX:
774                 get_page(virt_to_page(xdp.data));
775                 consume_skb(skb);
776                 xdp.rxq->mem = rq->xdp_mem;
777                 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
778                         trace_xdp_exception(rq->dev, xdp_prog, act);
779                         stats->rx_drops++;
780                         goto err_xdp;
781                 }
782                 stats->xdp_tx++;
783                 rcu_read_unlock();
784                 goto xdp_xmit;
785         case XDP_REDIRECT:
786                 get_page(virt_to_page(xdp.data));
787                 consume_skb(skb);
788                 xdp.rxq->mem = rq->xdp_mem;
789                 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
790                         stats->rx_drops++;
791                         goto err_xdp;
792                 }
793                 stats->xdp_redirect++;
794                 rcu_read_unlock();
795                 goto xdp_xmit;
796         default:
797                 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
798                 fallthrough;
799         case XDP_ABORTED:
800                 trace_xdp_exception(rq->dev, xdp_prog, act);
801                 fallthrough;
802         case XDP_DROP:
803                 stats->xdp_drops++;
804                 goto xdp_drop;
805         }
806         rcu_read_unlock();
807
808         /* check if bpf_xdp_adjust_head was used */
809         delta = orig_data - xdp.data;
810         off = mac_len + delta;
811         if (off > 0)
812                 __skb_push(skb, off);
813         else if (off < 0)
814                 __skb_pull(skb, -off);
815         skb->mac_header -= delta;
816
817         /* check if bpf_xdp_adjust_tail was used */
818         off = xdp.data_end - orig_data_end;
819         if (off != 0)
820                 __skb_put(skb, off); /* positive on grow, negative on shrink */
821         skb->protocol = eth_type_trans(skb, rq->dev);
822
823         metalen = xdp.data - xdp.data_meta;
824         if (metalen)
825                 skb_metadata_set(skb, metalen);
826 out:
827         return skb;
828 drop:
829         stats->rx_drops++;
830 xdp_drop:
831         rcu_read_unlock();
832         kfree_skb(skb);
833         return NULL;
834 err_xdp:
835         rcu_read_unlock();
836         page_frag_free(xdp.data);
837 xdp_xmit:
838         return NULL;
839 }
840
841 static int veth_xdp_rcv(struct veth_rq *rq, int budget,
842                         struct veth_xdp_tx_bq *bq,
843                         struct veth_stats *stats)
844 {
845         int i, done = 0, n_xdpf = 0;
846         void *xdpf[VETH_XDP_BATCH];
847
848         for (i = 0; i < budget; i++) {
849                 void *ptr = __ptr_ring_consume(&rq->xdp_ring);
850
851                 if (!ptr)
852                         break;
853
854                 if (veth_is_xdp_frame(ptr)) {
855                         /* ndo_xdp_xmit */
856                         struct xdp_frame *frame = veth_ptr_to_xdp(ptr);
857
858                         stats->xdp_bytes += frame->len;
859                         frame = veth_xdp_rcv_one(rq, frame, bq, stats);
860                         if (frame) {
861                                 /* XDP_PASS */
862                                 xdpf[n_xdpf++] = frame;
863                                 if (n_xdpf == VETH_XDP_BATCH) {
864                                         veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
865                                                               bq, stats);
866                                         n_xdpf = 0;
867                                 }
868                         }
869                 } else {
870                         /* ndo_start_xmit */
871                         struct sk_buff *skb = ptr;
872
873                         stats->xdp_bytes += skb->len;
874                         skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
875                         if (skb) {
876                                 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
877                                         netif_receive_skb(skb);
878                                 else
879                                         napi_gro_receive(&rq->xdp_napi, skb);
880                         }
881                 }
882                 done++;
883         }
884
885         if (n_xdpf)
886                 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
887
888         u64_stats_update_begin(&rq->stats.syncp);
889         rq->stats.vs.xdp_redirect += stats->xdp_redirect;
890         rq->stats.vs.xdp_bytes += stats->xdp_bytes;
891         rq->stats.vs.xdp_drops += stats->xdp_drops;
892         rq->stats.vs.rx_drops += stats->rx_drops;
893         rq->stats.vs.xdp_packets += done;
894         u64_stats_update_end(&rq->stats.syncp);
895
896         return done;
897 }
898
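/* NAPI poll handler: consume up to @budget entries from the xdp_ring, then
 * flush any pending XDP_TX bulk and xdp_do_redirect() work.
 */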
899 static int veth_poll(struct napi_struct *napi, int budget)
900 {
901         struct veth_rq *rq =
902                 container_of(napi, struct veth_rq, xdp_napi);
903         struct veth_stats stats = {};
904         struct veth_xdp_tx_bq bq;
905         int done;
906
907         bq.count = 0;
908
909         xdp_set_return_frame_no_direct();
910         done = veth_xdp_rcv(rq, budget, &bq, &stats);
911
912         if (done < budget && napi_complete_done(napi, done)) {
913                 /* Write rx_notify_masked before reading ptr_ring */
914                 smp_store_mb(rq->rx_notify_masked, false);
915                 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
916                         if (napi_schedule_prep(&rq->xdp_napi)) {
917                                 WRITE_ONCE(rq->rx_notify_masked, true);
918                                 __napi_schedule(&rq->xdp_napi);
919                         }
920                 }
921         }
922
923         if (stats.xdp_tx > 0)
924                 veth_xdp_flush(rq, &bq);
925         if (stats.xdp_redirect > 0)
926                 xdp_do_flush();
927         xdp_clear_return_frame_no_direct();
928
929         return done;
930 }
931
932 static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
933 {
934         struct veth_priv *priv = netdev_priv(dev);
935         int err, i;
936
937         for (i = start; i < end; i++) {
938                 struct veth_rq *rq = &priv->rq[i];
939
940                 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
941                 if (err)
942                         goto err_xdp_ring;
943         }
944
945         for (i = start; i < end; i++) {
946                 struct veth_rq *rq = &priv->rq[i];
947
948                 napi_enable(&rq->xdp_napi);
949                 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
950         }
951
952         return 0;
953
954 err_xdp_ring:
955         for (i--; i >= start; i--)
956                 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
957
958         return err;
959 }
960
961 static int __veth_napi_enable(struct net_device *dev)
962 {
963         return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
964 }
965
966 static void veth_napi_del_range(struct net_device *dev, int start, int end)
967 {
968         struct veth_priv *priv = netdev_priv(dev);
969         int i;
970
971         for (i = start; i < end; i++) {
972                 struct veth_rq *rq = &priv->rq[i];
973
974                 rcu_assign_pointer(priv->rq[i].napi, NULL);
975                 napi_disable(&rq->xdp_napi);
976                 __netif_napi_del(&rq->xdp_napi);
977         }
978         synchronize_net();
979
980         for (i = start; i < end; i++) {
981                 struct veth_rq *rq = &priv->rq[i];
982
983                 rq->rx_notify_masked = false;
984                 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
985         }
986 }
987
988 static void veth_napi_del(struct net_device *dev)
989 {
990         veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
991 }
992
993 static bool veth_gro_requested(const struct net_device *dev)
994 {
995         return !!(dev->wanted_features & NETIF_F_GRO);
996 }
997
998 static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
999                                  bool napi_already_on)
1000 {
1001         struct veth_priv *priv = netdev_priv(dev);
1002         int err, i;
1003
1004         for (i = start; i < end; i++) {
1005                 struct veth_rq *rq = &priv->rq[i];
1006
1007                 if (!napi_already_on)
1008                         netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
1009                 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
1010                 if (err < 0)
1011                         goto err_rxq_reg;
1012
1013                 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
1014                                                  MEM_TYPE_PAGE_SHARED,
1015                                                  NULL);
1016                 if (err < 0)
1017                         goto err_reg_mem;
1018
1019                 /* Save original mem info as it can be overwritten */
1020                 rq->xdp_mem = rq->xdp_rxq.mem;
1021         }
1022         return 0;
1023
1024 err_reg_mem:
1025         xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
1026 err_rxq_reg:
1027         for (i--; i >= start; i--) {
1028                 struct veth_rq *rq = &priv->rq[i];
1029
1030                 xdp_rxq_info_unreg(&rq->xdp_rxq);
1031                 if (!napi_already_on)
1032                         netif_napi_del(&rq->xdp_napi);
1033         }
1034
1035         return err;
1036 }
1037
1038 static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
1039                                    bool delete_napi)
1040 {
1041         struct veth_priv *priv = netdev_priv(dev);
1042         int i;
1043
1044         for (i = start; i < end; i++) {
1045                 struct veth_rq *rq = &priv->rq[i];
1046
1047                 rq->xdp_rxq.mem = rq->xdp_mem;
1048                 xdp_rxq_info_unreg(&rq->xdp_rxq);
1049
1050                 if (delete_napi)
1051                         netif_napi_del(&rq->xdp_napi);
1052         }
1053 }
1054
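/* Attach the current XDP program to every rx queue, registering the xdp_rxq
 * info and enabling NAPI first when GRO has not already done so.
 */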
1055 static int veth_enable_xdp(struct net_device *dev)
1056 {
1057         bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
1058         struct veth_priv *priv = netdev_priv(dev);
1059         int err, i;
1060
1061         if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
1062                 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
1063                 if (err)
1064                         return err;
1065
1066                 if (!napi_already_on) {
1067                         err = __veth_napi_enable(dev);
1068                         if (err) {
1069                                 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
1070                                 return err;
1071                         }
1072
1073                         if (!veth_gro_requested(dev)) {
1074                                 /* user-space did not require GRO, but adding XDP
1075                                  * is supposed to get GRO working
1076                                  */
1077                                 dev->features |= NETIF_F_GRO;
1078                                 netdev_features_change(dev);
1079                         }
1080                 }
1081         }
1082
1083         for (i = 0; i < dev->real_num_rx_queues; i++) {
1084                 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
1085                 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
1086         }
1087
1088         return 0;
1089 }
1090
1091 static void veth_disable_xdp(struct net_device *dev)
1092 {
1093         struct veth_priv *priv = netdev_priv(dev);
1094         int i;
1095
1096         for (i = 0; i < dev->real_num_rx_queues; i++)
1097                 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
1098
1099         if (!netif_running(dev) || !veth_gro_requested(dev)) {
1100                 veth_napi_del(dev);
1101
1102                 /* if user-space did not require GRO and it was only
1103                  * enabled because of XDP, clear it now
1104                  */
1105                 if (!veth_gro_requested(dev) && netif_running(dev)) {
1106                         dev->features &= ~NETIF_F_GRO;
1107                         netdev_features_change(dev);
1108                 }
1109         }
1110
1111         veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
1112 }
1113
1114 static int veth_napi_enable_range(struct net_device *dev, int start, int end)
1115 {
1116         struct veth_priv *priv = netdev_priv(dev);
1117         int err, i;
1118
1119         for (i = start; i < end; i++) {
1120                 struct veth_rq *rq = &priv->rq[i];
1121
1122                 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
1123         }
1124
1125         err = __veth_napi_enable_range(dev, start, end);
1126         if (err) {
1127                 for (i = start; i < end; i++) {
1128                         struct veth_rq *rq = &priv->rq[i];
1129
1130                         netif_napi_del(&rq->xdp_napi);
1131                 }
1132                 return err;
1133         }
1134         return err;
1135 }
1136
1137 static int veth_napi_enable(struct net_device *dev)
1138 {
1139         return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
1140 }
1141
1142 static void veth_disable_range_safe(struct net_device *dev, int start, int end)
1143 {
1144         struct veth_priv *priv = netdev_priv(dev);
1145
1146         if (start >= end)
1147                 return;
1148
1149         if (priv->_xdp_prog) {
1150                 veth_napi_del_range(dev, start, end);
1151                 veth_disable_xdp_range(dev, start, end, false);
1152         } else if (veth_gro_requested(dev)) {
1153                 veth_napi_del_range(dev, start, end);
1154         }
1155 }
1156
1157 static int veth_enable_range_safe(struct net_device *dev, int start, int end)
1158 {
1159         struct veth_priv *priv = netdev_priv(dev);
1160         int err;
1161
1162         if (start >= end)
1163                 return 0;
1164
1165         if (priv->_xdp_prog) {
1166                 /* these channels are freshly initialized, napi is not running
1167                  * on them even when GRO is requested
1168                  */
1169                 err = veth_enable_xdp_range(dev, start, end, false);
1170                 if (err)
1171                         return err;
1172
1173                 err = __veth_napi_enable_range(dev, start, end);
1174                 if (err) {
1175                         /* on error always delete the newly added napis */
1176                         veth_disable_xdp_range(dev, start, end, true);
1177                         return err;
1178                 }
1179         } else if (veth_gro_requested(dev)) {
1180                 return veth_napi_enable_range(dev, start, end);
1181         }
1182         return 0;
1183 }
1184
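/* ethtool set_channels handler: adjust the real rx/tx queue counts, enabling
 * or disabling the per-queue NAPI/XDP state as needed and rejecting layouts
 * that would break an attached XDP program.
 */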
1185 static int veth_set_channels(struct net_device *dev,
1186                              struct ethtool_channels *ch)
1187 {
1188         struct veth_priv *priv = netdev_priv(dev);
1189         unsigned int old_rx_count, new_rx_count;
1190         struct veth_priv *peer_priv;
1191         struct net_device *peer;
1192         int err;
1193
1194         /* sanity check. Upper bounds are already enforced by the caller */
1195         if (!ch->rx_count || !ch->tx_count)
1196                 return -EINVAL;
1197
1198         /* avoid breaking XDP, if that is enabled */
1199         peer = rtnl_dereference(priv->peer);
1200         peer_priv = peer ? netdev_priv(peer) : NULL;
1201         if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
1202                 return -EINVAL;
1203
1204         if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
1205                 return -EINVAL;
1206
1207         old_rx_count = dev->real_num_rx_queues;
1208         new_rx_count = ch->rx_count;
1209         if (netif_running(dev)) {
1210                 /* turn device off */
1211                 netif_carrier_off(dev);
1212                 if (peer)
1213                         netif_carrier_off(peer);
1214
1215                 /* try to allocate new resources, as needed */
1216                 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
1217                 if (err)
1218                         goto out;
1219         }
1220
1221         err = netif_set_real_num_rx_queues(dev, ch->rx_count);
1222         if (err)
1223                 goto revert;
1224
1225         err = netif_set_real_num_tx_queues(dev, ch->tx_count);
1226         if (err) {
1227                 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);
1228
1229                 /* this error condition could happen only if rx and tx change
1230                  * in opposite directions (e.g. tx nr increases, rx nr decreases)
1231                  * and we can't do anything to fully restore the original
1232                  * status
1233                  */
1234                 if (err2)
1235                         pr_warn("Can't restore rx queues config %d -> %d %d\n",
1236                                 new_rx_count, old_rx_count, err2);
1237                 else
1238                         goto revert;
1239         }
1240
1241 out:
1242         if (netif_running(dev)) {
1243                 /* note that we need to swap the arguments WRT the enable part
1244                  * to identify the range we have to disable
1245                  */
1246                 veth_disable_range_safe(dev, new_rx_count, old_rx_count);
1247                 netif_carrier_on(dev);
1248                 if (peer)
1249                         netif_carrier_on(peer);
1250         }
1251         return err;
1252
1253 revert:
1254         new_rx_count = old_rx_count;
1255         old_rx_count = ch->rx_count;
1256         goto out;
1257 }
1258
1259 static int veth_open(struct net_device *dev)
1260 {
1261         struct veth_priv *priv = netdev_priv(dev);
1262         struct net_device *peer = rtnl_dereference(priv->peer);
1263         int err;
1264
1265         if (!peer)
1266                 return -ENOTCONN;
1267
1268         if (priv->_xdp_prog) {
1269                 err = veth_enable_xdp(dev);
1270                 if (err)
1271                         return err;
1272         } else if (veth_gro_requested(dev)) {
1273                 err = veth_napi_enable(dev);
1274                 if (err)
1275                         return err;
1276         }
1277
1278         if (peer->flags & IFF_UP) {
1279                 netif_carrier_on(dev);
1280                 netif_carrier_on(peer);
1281         }
1282
1283         return 0;
1284 }
1285
1286 static int veth_close(struct net_device *dev)
1287 {
1288         struct veth_priv *priv = netdev_priv(dev);
1289         struct net_device *peer = rtnl_dereference(priv->peer);
1290
1291         netif_carrier_off(dev);
1292         if (peer)
1293                 netif_carrier_off(peer);
1294
1295         if (priv->_xdp_prog)
1296                 veth_disable_xdp(dev);
1297         else if (veth_gro_requested(dev))
1298                 veth_napi_del(dev);
1299
1300         return 0;
1301 }
1302
1303 static int is_valid_veth_mtu(int mtu)
1304 {
1305         return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
1306 }
1307
1308 static int veth_alloc_queues(struct net_device *dev)
1309 {
1310         struct veth_priv *priv = netdev_priv(dev);
1311         int i;
1312
1313         priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
1314         if (!priv->rq)
1315                 return -ENOMEM;
1316
1317         for (i = 0; i < dev->num_rx_queues; i++) {
1318                 priv->rq[i].dev = dev;
1319                 u64_stats_init(&priv->rq[i].stats.syncp);
1320         }
1321
1322         return 0;
1323 }
1324
1325 static void veth_free_queues(struct net_device *dev)
1326 {
1327         struct veth_priv *priv = netdev_priv(dev);
1328
1329         kfree(priv->rq);
1330 }
1331
1332 static int veth_dev_init(struct net_device *dev)
1333 {
1334         int err;
1335
1336         dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
1337         if (!dev->lstats)
1338                 return -ENOMEM;
1339
1340         err = veth_alloc_queues(dev);
1341         if (err) {
1342                 free_percpu(dev->lstats);
1343                 return err;
1344         }
1345
1346         return 0;
1347 }
1348
1349 static void veth_dev_free(struct net_device *dev)
1350 {
1351         veth_free_queues(dev);
1352         free_percpu(dev->lstats);
1353 }
1354
1355 #ifdef CONFIG_NET_POLL_CONTROLLER
1356 static void veth_poll_controller(struct net_device *dev)
1357 {
1358         /* veth only receives frames when its peer sends one
1359          * Since it has nothing to do with disabling irqs, we are guaranteed
1360          * never to have pending data when we poll for it so
1361          * there is nothing to do here.
1362          *
1363          * We need this though so netpoll recognizes us as an interface that
1364          * supports polling, which enables bridge devices in virt setups to
1365          * still use netconsole
1366          */
1367 }
1368 #endif  /* CONFIG_NET_POLL_CONTROLLER */
1369
1370 static int veth_get_iflink(const struct net_device *dev)
1371 {
1372         struct veth_priv *priv = netdev_priv(dev);
1373         struct net_device *peer;
1374         int iflink;
1375
1376         rcu_read_lock();
1377         peer = rcu_dereference(priv->peer);
1378         iflink = peer ? peer->ifindex : 0;
1379         rcu_read_unlock();
1380
1381         return iflink;
1382 }
1383
1384 static netdev_features_t veth_fix_features(struct net_device *dev,
1385                                            netdev_features_t features)
1386 {
1387         struct veth_priv *priv = netdev_priv(dev);
1388         struct net_device *peer;
1389
1390         peer = rtnl_dereference(priv->peer);
1391         if (peer) {
1392                 struct veth_priv *peer_priv = netdev_priv(peer);
1393
1394                 if (peer_priv->_xdp_prog)
1395                         features &= ~NETIF_F_GSO_SOFTWARE;
1396         }
1397         if (priv->_xdp_prog)
1398                 features |= NETIF_F_GRO;
1399
1400         return features;
1401 }
1402
1403 static int veth_set_features(struct net_device *dev,
1404                              netdev_features_t features)
1405 {
1406         netdev_features_t changed = features ^ dev->features;
1407         struct veth_priv *priv = netdev_priv(dev);
1408         int err;
1409
1410         if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
1411                 return 0;
1412
1413         if (features & NETIF_F_GRO) {
1414                 err = veth_napi_enable(dev);
1415                 if (err)
1416                         return err;
1417         } else {
1418                 veth_napi_del(dev);
1419         }
1420         return 0;
1421 }
1422
1423 static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
1424 {
1425         struct veth_priv *peer_priv, *priv = netdev_priv(dev);
1426         struct net_device *peer;
1427
1428         if (new_hr < 0)
1429                 new_hr = 0;
1430
1431         rcu_read_lock();
1432         peer = rcu_dereference(priv->peer);
1433         if (unlikely(!peer))
1434                 goto out;
1435
1436         peer_priv = netdev_priv(peer);
1437         priv->requested_headroom = new_hr;
1438         new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
1439         dev->needed_headroom = new_hr;
1440         peer->needed_headroom = new_hr;
1441
1442 out:
1443         rcu_read_unlock();
1444 }
1445
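/* Install or remove an XDP program (XDP_SETUP_PROG). Validates the peer's MTU
 * and queue layout, enables the per-queue XDP machinery when the device is up,
 * and toggles the peer's GSO features accordingly.
 */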
1446 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
1447                         struct netlink_ext_ack *extack)
1448 {
1449         struct veth_priv *priv = netdev_priv(dev);
1450         struct bpf_prog *old_prog;
1451         struct net_device *peer;
1452         unsigned int max_mtu;
1453         int err;
1454
1455         old_prog = priv->_xdp_prog;
1456         priv->_xdp_prog = prog;
1457         peer = rtnl_dereference(priv->peer);
1458
1459         if (prog) {
1460                 if (!peer) {
1461                         NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
1462                         err = -ENOTCONN;
1463                         goto err;
1464                 }
1465
1466                 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
1467                           peer->hard_header_len -
1468                           SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1469                 if (peer->mtu > max_mtu) {
1470                         NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
1471                         err = -ERANGE;
1472                         goto err;
1473                 }
1474
1475                 if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
1476                         NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
1477                         err = -ENOSPC;
1478                         goto err;
1479                 }
1480
1481                 if (dev->flags & IFF_UP) {
1482                         err = veth_enable_xdp(dev);
1483                         if (err) {
1484                                 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
1485                                 goto err;
1486                         }
1487                 }
1488
1489                 if (!old_prog) {
1490                         peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
1491                         peer->max_mtu = max_mtu;
1492                 }
1493         }
1494
1495         if (old_prog) {
1496                 if (!prog) {
1497                         if (dev->flags & IFF_UP)
1498                                 veth_disable_xdp(dev);
1499
1500                         if (peer) {
1501                                 peer->hw_features |= NETIF_F_GSO_SOFTWARE;
1502                                 peer->max_mtu = ETH_MAX_MTU;
1503                         }
1504                 }
1505                 bpf_prog_put(old_prog);
1506         }
1507
1508         if ((!!old_prog ^ !!prog) && peer)
1509                 netdev_update_features(peer);
1510
1511         return 0;
1512 err:
1513         priv->_xdp_prog = old_prog;
1514
1515         return err;
1516 }
1517
1518 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1519 {
1520         switch (xdp->command) {
1521         case XDP_SETUP_PROG:
1522                 return veth_xdp_set(dev, xdp->prog, xdp->extack);
1523         default:
1524                 return -EINVAL;
1525         }
1526 }
1527
1528 static const struct net_device_ops veth_netdev_ops = {
1529         .ndo_init            = veth_dev_init,
1530         .ndo_open            = veth_open,
1531         .ndo_stop            = veth_close,
1532         .ndo_start_xmit      = veth_xmit,
1533         .ndo_get_stats64     = veth_get_stats64,
1534         .ndo_set_rx_mode     = veth_set_multicast_list,
1535         .ndo_set_mac_address = eth_mac_addr,
1536 #ifdef CONFIG_NET_POLL_CONTROLLER
1537         .ndo_poll_controller    = veth_poll_controller,
1538 #endif
1539         .ndo_get_iflink         = veth_get_iflink,
1540         .ndo_fix_features       = veth_fix_features,
1541         .ndo_set_features       = veth_set_features,
1542         .ndo_features_check     = passthru_features_check,
1543         .ndo_set_rx_headroom    = veth_set_rx_headroom,
1544         .ndo_bpf                = veth_xdp,
1545         .ndo_xdp_xmit           = veth_ndo_xdp_xmit,
1546         .ndo_get_peer_dev       = veth_peer_dev,
1547 };
1548
1549 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
1550                        NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
1551                        NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
1552                        NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
1553                        NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)
1554
1555 static void veth_setup(struct net_device *dev)
1556 {
1557         ether_setup(dev);
1558
1559         dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1560         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1561         dev->priv_flags |= IFF_NO_QUEUE;
1562         dev->priv_flags |= IFF_PHONY_HEADROOM;
1563
1564         dev->netdev_ops = &veth_netdev_ops;
1565         dev->ethtool_ops = &veth_ethtool_ops;
1566         dev->features |= NETIF_F_LLTX;
1567         dev->features |= VETH_FEATURES;
1568         dev->vlan_features = dev->features &
1569                              ~(NETIF_F_HW_VLAN_CTAG_TX |
1570                                NETIF_F_HW_VLAN_STAG_TX |
1571                                NETIF_F_HW_VLAN_CTAG_RX |
1572                                NETIF_F_HW_VLAN_STAG_RX);
1573         dev->needs_free_netdev = true;
1574         dev->priv_destructor = veth_dev_free;
1575         dev->max_mtu = ETH_MAX_MTU;
1576
1577         dev->hw_features = VETH_FEATURES;
1578         dev->hw_enc_features = VETH_FEATURES;
1579         dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
1580 }
1581
1582 /*
1583  * netlink interface
1584  */
1585
1586 static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
1587                          struct netlink_ext_ack *extack)
1588 {
1589         if (tb[IFLA_ADDRESS]) {
1590                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1591                         return -EINVAL;
1592                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1593                         return -EADDRNOTAVAIL;
1594         }
1595         if (tb[IFLA_MTU]) {
1596                 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
1597                         return -EINVAL;
1598         }
1599         return 0;
1600 }
1601
1602 static struct rtnl_link_ops veth_link_ops;
1603
1604 static void veth_disable_gro(struct net_device *dev)
1605 {
1606         dev->features &= ~NETIF_F_GRO;
1607         dev->wanted_features &= ~NETIF_F_GRO;
1608         netdev_update_features(dev);
1609 }
1610
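/*
 * Unless explicit IFLA_NUM_TX_QUEUES / IFLA_NUM_RX_QUEUES were supplied,
 * trim the real queue counts back to a single queue in each direction.
 * The netdev itself is still allocated with up to veth_get_num_queues()
 * queues (one per possible CPU, capped at 4096), presumably so the real
 * counts can be raised later without reallocating the device.
 */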
1611 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
1612 {
1613         int err;
1614
1615         if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
1616                 err = netif_set_real_num_tx_queues(dev, 1);
1617                 if (err)
1618                         return err;
1619         }
1620         if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
1621                 err = netif_set_real_num_rx_queues(dev, 1);
1622                 if (err)
1623                         return err;
1624         }
1625         return 0;
1626 }
1627
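/*
 * Creation entry point, e.g. "ip link add veth0 type veth peer name veth1".
 * The optional VETH_INFO_PEER attribute carries an ifinfomsg followed by
 * nested IFLA_* attributes describing the peer; the peer is created and
 * registered first, then dev itself, and finally the two priv->peer
 * pointers are tied together.
 */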
1628 static int veth_newlink(struct net *src_net, struct net_device *dev,
1629                         struct nlattr *tb[], struct nlattr *data[],
1630                         struct netlink_ext_ack *extack)
1631 {
1632         int err;
1633         struct net_device *peer;
1634         struct veth_priv *priv;
1635         char ifname[IFNAMSIZ];
1636         struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
1637         unsigned char name_assign_type;
1638         struct ifinfomsg *ifmp;
1639         struct net *net;
1640
1641         /*
1642          * create and register peer first
1643          */
1644         if (data != NULL && data[VETH_INFO_PEER] != NULL) {
1645                 struct nlattr *nla_peer;
1646
1647                 nla_peer = data[VETH_INFO_PEER];
1648                 ifmp = nla_data(nla_peer);
1649                 err = rtnl_nla_parse_ifla(peer_tb,
1650                                           nla_data(nla_peer) + sizeof(struct ifinfomsg),
1651                                           nla_len(nla_peer) - sizeof(struct ifinfomsg),
1652                                           NULL);
1653                 if (err < 0)
1654                         return err;
1655
1656                 err = veth_validate(peer_tb, NULL, extack);
1657                 if (err < 0)
1658                         return err;
1659
1660                 tbp = peer_tb;
1661         } else {
1662                 ifmp = NULL;
1663                 tbp = tb;
1664         }
1665
1666         if (ifmp && tbp[IFLA_IFNAME]) {
1667                 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
1668                 name_assign_type = NET_NAME_USER;
1669         } else {
1670                 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
1671                 name_assign_type = NET_NAME_ENUM;
1672         }
1673
1674         net = rtnl_link_get_net(src_net, tbp);
1675         if (IS_ERR(net))
1676                 return PTR_ERR(net);
1677
1678         peer = rtnl_create_link(net, ifname, name_assign_type,
1679                                 &veth_link_ops, tbp, extack);
1680         if (IS_ERR(peer)) {
1681                 put_net(net);
1682                 return PTR_ERR(peer);
1683         }
1684
1685         if (!ifmp || !tbp[IFLA_ADDRESS])
1686                 eth_hw_addr_random(peer);
1687
1688         if (ifmp && (dev->ifindex != 0))
1689                 peer->ifindex = ifmp->ifi_index;
1690
1691         netif_set_gso_max_size(peer, dev->gso_max_size);
1692         netif_set_gso_max_segs(peer, dev->gso_max_segs);
1693
1694         err = register_netdevice(peer);
1695         put_net(net);
1696         net = NULL;
1697         if (err < 0)
1698                 goto err_register_peer;
1699
1700         /* keep GRO disabled by default to be consistent with the established
1701          * veth behavior
1702          */
1703         veth_disable_gro(peer);
1704         netif_carrier_off(peer);
1705
1706         err = rtnl_configure_link(peer, ifmp);
1707         if (err < 0)
1708                 goto err_configure_peer;
1709
1710         /*
1711          * register dev last
1712          *
1713          * note that, since we have just registered the peer device,
1714          * dev's name needs to be re-allocated
1715          */
1716
1717         if (tb[IFLA_ADDRESS] == NULL)
1718                 eth_hw_addr_random(dev);
1719
1720         if (tb[IFLA_IFNAME])
1721                 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
1722         else
1723                 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
1724
1725         err = register_netdevice(dev);
1726         if (err < 0)
1727                 goto err_register_dev;
1728
1729         netif_carrier_off(dev);
1730
1731         /*
1732          * tie the devices together
1733          */
1734
1735         priv = netdev_priv(dev);
1736         rcu_assign_pointer(priv->peer, peer);
1737         err = veth_init_queues(dev, tb);
1738         if (err)
1739                 goto err_queues;
1740
1741         priv = netdev_priv(peer);
1742         rcu_assign_pointer(priv->peer, dev);
1743         err = veth_init_queues(peer, tb);
1744         if (err)
1745                 goto err_queues;
1746
1747         veth_disable_gro(dev);
1748         return 0;
1749
1750 err_queues:
1751         unregister_netdevice(dev);
1752 err_register_dev:
1753         /* nothing to do */
1754 err_configure_peer:
1755         unregister_netdevice(peer);
1756         return err;
1757
1758 err_register_peer:
1759         free_netdev(peer);
1760         return err;
1761 }
1762
1763 static void veth_dellink(struct net_device *dev, struct list_head *head)
1764 {
1765         struct veth_priv *priv;
1766         struct net_device *peer;
1767
1768         priv = netdev_priv(dev);
1769         peer = rtnl_dereference(priv->peer);
1770
1771         /* Note: dellink() is called from default_device_exit_batch(),
1772          * before a synchronize_rcu() point. The devices are guaranteed
1773          * not to be freed before one RCU grace period.
1774          */
1775         RCU_INIT_POINTER(priv->peer, NULL);
1776         unregister_netdevice_queue(dev, head);
1777
1778         if (peer) {
1779                 priv = netdev_priv(peer);
1780                 RCU_INIT_POINTER(priv->peer, NULL);
1781                 unregister_netdevice_queue(peer, head);
1782         }
1783 }
1784
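/*
 * Netlink policy: VETH_INFO_PEER must be at least an ifinfomsg; the IFLA_*
 * attributes nested behind it are parsed by hand in veth_newlink() via
 * rtnl_nla_parse_ifla().
 */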
1785 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
1786         [VETH_INFO_PEER]        = { .len = sizeof(struct ifinfomsg) },
1787 };
1788
1789 static struct net *veth_get_link_net(const struct net_device *dev)
1790 {
1791         struct veth_priv *priv = netdev_priv(dev);
1792         struct net_device *peer = rtnl_dereference(priv->peer);
1793
1794         return peer ? dev_net(peer) : dev_net(dev);
1795 }
1796
1797 static unsigned int veth_get_num_queues(void)
1798 {
1799         /* enforce the same queue limit as rtnl_create_link */
1800         int queues = num_possible_cpus();
1801
1802         if (queues > 4096)
1803                 queues = 4096;
1804         return queues;
1805 }
1806
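/*
 * rtnl_link_ops for the "veth" kind. The .get_num_tx_queues and
 * .get_num_rx_queues hooks let rtnl_create_link() size new devices with one
 * queue pair per possible CPU (capped at 4096, the same limit
 * rtnl_create_link enforces itself); veth_newlink() reuses these ops when it
 * creates the peer device.
 */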
1807 static struct rtnl_link_ops veth_link_ops = {
1808         .kind           = DRV_NAME,
1809         .priv_size      = sizeof(struct veth_priv),
1810         .setup          = veth_setup,
1811         .validate       = veth_validate,
1812         .newlink        = veth_newlink,
1813         .dellink        = veth_dellink,
1814         .policy         = veth_policy,
1815         .maxtype        = VETH_INFO_MAX,
1816         .get_link_net   = veth_get_link_net,
1817         .get_num_tx_queues      = veth_get_num_queues,
1818         .get_num_rx_queues      = veth_get_num_queues,
1819 };
1820
1821 /*
1822  * init/fini
1823  */
1824
1825 static __init int veth_init(void)
1826 {
1827         return rtnl_link_register(&veth_link_ops);
1828 }
1829
1830 static __exit void veth_exit(void)
1831 {
1832         rtnl_link_unregister(&veth_link_ops);
1833 }
1834
1835 module_init(veth_init);
1836 module_exit(veth_exit);
1837
1838 MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
1839 MODULE_LICENSE("GPL v2");
1840 MODULE_ALIAS_RTNL_LINK(DRV_NAME);