Merge branch 'for-5.12/google' into for-linus
[linux-2.6-microblaze.git] / net / ipv4 / ip_tunnel.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55         return hash_32((__force u32)key ^ (__force u32)remote,
56                          IP_TNL_HASH_BITS);
57 }
58
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60                                 __be16 flags, __be32 key)
61 {
62         if (p->i_flags & TUNNEL_KEY) {
63                 if (flags & TUNNEL_KEY)
64                         return key == p->i_key;
65                 else
66                         /* key expected, none present */
67                         return false;
68         } else
69                 return !(flags & TUNNEL_KEY);
70 }
71
72 /* Fallback tunnel: no source, no destination, no key, no options
73
74    Tunnel hash table:
75    We require exact key match i.e. if a key is present in packet
76    it will match only tunnel with the same key; if it is not present,
77    it will match only keyless tunnel.
78
79    All keysless packets, if not matched configured keyless tunnels
80    will match fallback tunnel.
81    Given src, dst and key, find appropriate for input tunnel.
82 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84                                    int link, __be16 flags,
85                                    __be32 remote, __be32 local,
86                                    __be32 key)
87 {
88         struct ip_tunnel *t, *cand = NULL;
89         struct hlist_head *head;
90         struct net_device *ndev;
91         unsigned int hash;
92
93         hash = ip_tunnel_hash(key, remote);
94         head = &itn->tunnels[hash];
95
96         hlist_for_each_entry_rcu(t, head, hash_node) {
97                 if (local != t->parms.iph.saddr ||
98                     remote != t->parms.iph.daddr ||
99                     !(t->dev->flags & IFF_UP))
100                         continue;
101
102                 if (!ip_tunnel_key_match(&t->parms, flags, key))
103                         continue;
104
105                 if (t->parms.link == link)
106                         return t;
107                 else
108                         cand = t;
109         }
110
111         hlist_for_each_entry_rcu(t, head, hash_node) {
112                 if (remote != t->parms.iph.daddr ||
113                     t->parms.iph.saddr != 0 ||
114                     !(t->dev->flags & IFF_UP))
115                         continue;
116
117                 if (!ip_tunnel_key_match(&t->parms, flags, key))
118                         continue;
119
120                 if (t->parms.link == link)
121                         return t;
122                 else if (!cand)
123                         cand = t;
124         }
125
126         hash = ip_tunnel_hash(key, 0);
127         head = &itn->tunnels[hash];
128
129         hlist_for_each_entry_rcu(t, head, hash_node) {
130                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
132                         continue;
133
134                 if (!(t->dev->flags & IFF_UP))
135                         continue;
136
137                 if (!ip_tunnel_key_match(&t->parms, flags, key))
138                         continue;
139
140                 if (t->parms.link == link)
141                         return t;
142                 else if (!cand)
143                         cand = t;
144         }
145
146         hlist_for_each_entry_rcu(t, head, hash_node) {
147                 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148                     t->parms.iph.saddr != 0 ||
149                     t->parms.iph.daddr != 0 ||
150                     !(t->dev->flags & IFF_UP))
151                         continue;
152
153                 if (t->parms.link == link)
154                         return t;
155                 else if (!cand)
156                         cand = t;
157         }
158
159         if (cand)
160                 return cand;
161
162         t = rcu_dereference(itn->collect_md_tun);
163         if (t && t->dev->flags & IFF_UP)
164                 return t;
165
166         ndev = READ_ONCE(itn->fb_tunnel_dev);
167         if (ndev && ndev->flags & IFF_UP)
168                 return netdev_priv(ndev);
169
170         return NULL;
171 }
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
173
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175                                     struct ip_tunnel_parm *parms)
176 {
177         unsigned int h;
178         __be32 remote;
179         __be32 i_key = parms->i_key;
180
181         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182                 remote = parms->iph.daddr;
183         else
184                 remote = 0;
185
186         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
187                 i_key = 0;
188
189         h = ip_tunnel_hash(i_key, remote);
190         return &itn->tunnels[h];
191 }
192
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
194 {
195         struct hlist_head *head = ip_bucket(itn, &t->parms);
196
197         if (t->collect_md)
198                 rcu_assign_pointer(itn->collect_md_tun, t);
199         hlist_add_head_rcu(&t->hash_node, head);
200 }
201
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
203 {
204         if (t->collect_md)
205                 rcu_assign_pointer(itn->collect_md_tun, NULL);
206         hlist_del_init_rcu(&t->hash_node);
207 }
208
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210                                         struct ip_tunnel_parm *parms,
211                                         int type)
212 {
213         __be32 remote = parms->iph.daddr;
214         __be32 local = parms->iph.saddr;
215         __be32 key = parms->i_key;
216         __be16 flags = parms->i_flags;
217         int link = parms->link;
218         struct ip_tunnel *t = NULL;
219         struct hlist_head *head = ip_bucket(itn, parms);
220
221         hlist_for_each_entry_rcu(t, head, hash_node) {
222                 if (local == t->parms.iph.saddr &&
223                     remote == t->parms.iph.daddr &&
224                     link == t->parms.link &&
225                     type == t->dev->type &&
226                     ip_tunnel_key_match(&t->parms, flags, key))
227                         break;
228         }
229         return t;
230 }
231
232 static struct net_device *__ip_tunnel_create(struct net *net,
233                                              const struct rtnl_link_ops *ops,
234                                              struct ip_tunnel_parm *parms)
235 {
236         int err;
237         struct ip_tunnel *tunnel;
238         struct net_device *dev;
239         char name[IFNAMSIZ];
240
241         err = -E2BIG;
242         if (parms->name[0]) {
243                 if (!dev_valid_name(parms->name))
244                         goto failed;
245                 strlcpy(name, parms->name, IFNAMSIZ);
246         } else {
247                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
248                         goto failed;
249                 strcpy(name, ops->kind);
250                 strcat(name, "%d");
251         }
252
253         ASSERT_RTNL();
254         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
255         if (!dev) {
256                 err = -ENOMEM;
257                 goto failed;
258         }
259         dev_net_set(dev, net);
260
261         dev->rtnl_link_ops = ops;
262
263         tunnel = netdev_priv(dev);
264         tunnel->parms = *parms;
265         tunnel->net = net;
266
267         err = register_netdevice(dev);
268         if (err)
269                 goto failed_free;
270
271         return dev;
272
273 failed_free:
274         free_netdev(dev);
275 failed:
276         return ERR_PTR(err);
277 }
278
279 static int ip_tunnel_bind_dev(struct net_device *dev)
280 {
281         struct net_device *tdev = NULL;
282         struct ip_tunnel *tunnel = netdev_priv(dev);
283         const struct iphdr *iph;
284         int hlen = LL_MAX_HEADER;
285         int mtu = ETH_DATA_LEN;
286         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
287
288         iph = &tunnel->parms.iph;
289
290         /* Guess output device to choose reasonable mtu and needed_headroom */
291         if (iph->daddr) {
292                 struct flowi4 fl4;
293                 struct rtable *rt;
294
295                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296                                     iph->saddr, tunnel->parms.o_key,
297                                     RT_TOS(iph->tos), tunnel->parms.link,
298                                     tunnel->fwmark, 0);
299                 rt = ip_route_output_key(tunnel->net, &fl4);
300
301                 if (!IS_ERR(rt)) {
302                         tdev = rt->dst.dev;
303                         ip_rt_put(rt);
304                 }
305                 if (dev->type != ARPHRD_ETHER)
306                         dev->flags |= IFF_POINTOPOINT;
307
308                 dst_cache_reset(&tunnel->dst_cache);
309         }
310
311         if (!tdev && tunnel->parms.link)
312                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
313
314         if (tdev) {
315                 hlen = tdev->hard_header_len + tdev->needed_headroom;
316                 mtu = min(tdev->mtu, IP_MAX_MTU);
317         }
318
319         dev->needed_headroom = t_hlen + hlen;
320         mtu -= (dev->hard_header_len + t_hlen);
321
322         if (mtu < IPV4_MIN_MTU)
323                 mtu = IPV4_MIN_MTU;
324
325         return mtu;
326 }
327
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329                                           struct ip_tunnel_net *itn,
330                                           struct ip_tunnel_parm *parms)
331 {
332         struct ip_tunnel *nt;
333         struct net_device *dev;
334         int t_hlen;
335         int mtu;
336         int err;
337
338         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
339         if (IS_ERR(dev))
340                 return ERR_CAST(dev);
341
342         mtu = ip_tunnel_bind_dev(dev);
343         err = dev_set_mtu(dev, mtu);
344         if (err)
345                 goto err_dev_set_mtu;
346
347         nt = netdev_priv(dev);
348         t_hlen = nt->hlen + sizeof(struct iphdr);
349         dev->min_mtu = ETH_MIN_MTU;
350         dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
351         ip_tunnel_add(itn, nt);
352         return nt;
353
354 err_dev_set_mtu:
355         unregister_netdevice(dev);
356         return ERR_PTR(err);
357 }
358
359 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
360                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
361                   bool log_ecn_error)
362 {
363         const struct iphdr *iph = ip_hdr(skb);
364         int err;
365
366 #ifdef CONFIG_NET_IPGRE_BROADCAST
367         if (ipv4_is_multicast(iph->daddr)) {
368                 tunnel->dev->stats.multicast++;
369                 skb->pkt_type = PACKET_BROADCAST;
370         }
371 #endif
372
373         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
374              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
375                 tunnel->dev->stats.rx_crc_errors++;
376                 tunnel->dev->stats.rx_errors++;
377                 goto drop;
378         }
379
380         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
381                 if (!(tpi->flags&TUNNEL_SEQ) ||
382                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
383                         tunnel->dev->stats.rx_fifo_errors++;
384                         tunnel->dev->stats.rx_errors++;
385                         goto drop;
386                 }
387                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
388         }
389
390         skb_reset_network_header(skb);
391
392         err = IP_ECN_decapsulate(iph, skb);
393         if (unlikely(err)) {
394                 if (log_ecn_error)
395                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
396                                         &iph->saddr, iph->tos);
397                 if (err > 1) {
398                         ++tunnel->dev->stats.rx_frame_errors;
399                         ++tunnel->dev->stats.rx_errors;
400                         goto drop;
401                 }
402         }
403
404         dev_sw_netstats_rx_add(tunnel->dev, skb->len);
405         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
406
407         if (tunnel->dev->type == ARPHRD_ETHER) {
408                 skb->protocol = eth_type_trans(skb, tunnel->dev);
409                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
410         } else {
411                 skb->dev = tunnel->dev;
412         }
413
414         if (tun_dst)
415                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
416
417         gro_cells_receive(&tunnel->gro_cells, skb);
418         return 0;
419
420 drop:
421         if (tun_dst)
422                 dst_release((struct dst_entry *)tun_dst);
423         kfree_skb(skb);
424         return 0;
425 }
426 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
427
428 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
429                             unsigned int num)
430 {
431         if (num >= MAX_IPTUN_ENCAP_OPS)
432                 return -ERANGE;
433
434         return !cmpxchg((const struct ip_tunnel_encap_ops **)
435                         &iptun_encaps[num],
436                         NULL, ops) ? 0 : -1;
437 }
438 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
439
440 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
441                             unsigned int num)
442 {
443         int ret;
444
445         if (num >= MAX_IPTUN_ENCAP_OPS)
446                 return -ERANGE;
447
448         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
449                        &iptun_encaps[num],
450                        ops, NULL) == ops) ? 0 : -1;
451
452         synchronize_net();
453
454         return ret;
455 }
456 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
457
458 int ip_tunnel_encap_setup(struct ip_tunnel *t,
459                           struct ip_tunnel_encap *ipencap)
460 {
461         int hlen;
462
463         memset(&t->encap, 0, sizeof(t->encap));
464
465         hlen = ip_encap_hlen(ipencap);
466         if (hlen < 0)
467                 return hlen;
468
469         t->encap.type = ipencap->type;
470         t->encap.sport = ipencap->sport;
471         t->encap.dport = ipencap->dport;
472         t->encap.flags = ipencap->flags;
473
474         t->encap_hlen = hlen;
475         t->hlen = t->encap_hlen + t->tun_hlen;
476
477         return 0;
478 }
479 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
480
481 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
482                             struct rtable *rt, __be16 df,
483                             const struct iphdr *inner_iph,
484                             int tunnel_hlen, __be32 dst, bool md)
485 {
486         struct ip_tunnel *tunnel = netdev_priv(dev);
487         int pkt_size;
488         int mtu;
489
490         tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
491         pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;
492
493         if (df)
494                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
495                                         - sizeof(struct iphdr) - tunnel_hlen;
496         else
497                 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
498
499         if (skb_valid_dst(skb))
500                 skb_dst_update_pmtu_no_confirm(skb, mtu);
501
502         if (skb->protocol == htons(ETH_P_IP)) {
503                 if (!skb_is_gso(skb) &&
504                     (inner_iph->frag_off & htons(IP_DF)) &&
505                     mtu < pkt_size) {
506                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
507                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
508                         return -E2BIG;
509                 }
510         }
511 #if IS_ENABLED(CONFIG_IPV6)
512         else if (skb->protocol == htons(ETH_P_IPV6)) {
513                 struct rt6_info *rt6;
514                 __be32 daddr;
515
516                 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
517                                            NULL;
518                 daddr = md ? dst : tunnel->parms.iph.daddr;
519
520                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
521                            mtu >= IPV6_MIN_MTU) {
522                         if ((daddr && !ipv4_is_multicast(daddr)) ||
523                             rt6->rt6i_dst.plen == 128) {
524                                 rt6->rt6i_flags |= RTF_MODIFIED;
525                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
526                         }
527                 }
528
529                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
530                                         mtu < pkt_size) {
531                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
532                         return -E2BIG;
533                 }
534         }
535 #endif
536         return 0;
537 }
538
539 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
540                        u8 proto, int tunnel_hlen)
541 {
542         struct ip_tunnel *tunnel = netdev_priv(dev);
543         u32 headroom = sizeof(struct iphdr);
544         struct ip_tunnel_info *tun_info;
545         const struct ip_tunnel_key *key;
546         const struct iphdr *inner_iph;
547         struct rtable *rt = NULL;
548         struct flowi4 fl4;
549         __be16 df = 0;
550         u8 tos, ttl;
551         bool use_cache;
552
553         tun_info = skb_tunnel_info(skb);
554         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
555                      ip_tunnel_info_af(tun_info) != AF_INET))
556                 goto tx_error;
557         key = &tun_info->key;
558         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
559         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
560         tos = key->tos;
561         if (tos == 1) {
562                 if (skb->protocol == htons(ETH_P_IP))
563                         tos = inner_iph->tos;
564                 else if (skb->protocol == htons(ETH_P_IPV6))
565                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
566         }
567         ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
568                             tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
569                             0, skb->mark, skb_get_hash(skb));
570         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
571                 goto tx_error;
572
573         use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
574         if (use_cache)
575                 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
576         if (!rt) {
577                 rt = ip_route_output_key(tunnel->net, &fl4);
578                 if (IS_ERR(rt)) {
579                         dev->stats.tx_carrier_errors++;
580                         goto tx_error;
581                 }
582                 if (use_cache)
583                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
584                                           fl4.saddr);
585         }
586         if (rt->dst.dev == dev) {
587                 ip_rt_put(rt);
588                 dev->stats.collisions++;
589                 goto tx_error;
590         }
591
592         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
593                 df = htons(IP_DF);
594         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
595                             key->u.ipv4.dst, true)) {
596                 ip_rt_put(rt);
597                 goto tx_error;
598         }
599
600         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
601         ttl = key->ttl;
602         if (ttl == 0) {
603                 if (skb->protocol == htons(ETH_P_IP))
604                         ttl = inner_iph->ttl;
605                 else if (skb->protocol == htons(ETH_P_IPV6))
606                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
607                 else
608                         ttl = ip4_dst_hoplimit(&rt->dst);
609         }
610
611         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
612         if (headroom > dev->needed_headroom)
613                 dev->needed_headroom = headroom;
614
615         if (skb_cow_head(skb, dev->needed_headroom)) {
616                 ip_rt_put(rt);
617                 goto tx_dropped;
618         }
619         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
620                       df, !net_eq(tunnel->net, dev_net(dev)));
621         return;
622 tx_error:
623         dev->stats.tx_errors++;
624         goto kfree;
625 tx_dropped:
626         dev->stats.tx_dropped++;
627 kfree:
628         kfree_skb(skb);
629 }
630 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
631
/* Transmit @skb through tunnel device @dev.
 *
 * @tnl_params: template for the outer IPv4 header (daddr, saddr, tos, ttl
 *              and frag_off are read from it)
 * @protocol:   protocol number for the outer IPv4 header; may be rewritten
 *              by ip_tunnel_encap()
 *
 * When the template has no destination (NBMA tunnel) the destination is
 * derived per packet.  On success the packet is handed to iptunnel_xmit().
 * On failure the skb is freed here and tx_errors (or tx_dropped when
 * headroom cannot be made) is incremented.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;               /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        /* "connected" gates use of the dst caches further down */
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                /* Destination sources, in order of preference:
                 * tunnel metadata, next hop of the inner IPv4 route,
                 * IPv4-compatible address of the inner IPv6 neighbour.
                 */
                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                }
                else if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        /* unspecified neighbour key: use the packet's own
                         * destination address instead
                         */
                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        /* drop the neighbour reference before any jump */
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                /* without metadata the destination is per-packet, so the
                 * tunnel-wide dst cache is bypassed below; this also makes
                 * the lookup below discard the rt borrowed from the skb
                 */
                if (!md)
                        connected = false;
        }

        tos = tnl_params->tos;
        /* low bit set in the template: inherit tos from the inner header;
         * clearing "connected" then bypasses the dst caches below
         */
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        /* Try a cached route first: the metadata's cache for metadata
         * destinations, the tunnel's own cache for connected tunnels,
         * none otherwise.
         */
        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                /* remember the fresh route in whichever cache applies */
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        /* the route leads back into this very device */
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        /* outer DF: template value, plus the inner DF bit for IPv4
         * payloads unless the tunnel is set to ignore it
         */
        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off & htons(IP_DF));

        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        /* While errors are pending and the last one is more recent than
         * IPTUNNEL_ERR_TIMEO, consume one count and signal link failure
         * for this packet; otherwise forget the pending errors.
         */
        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        /* template ttl of 0: inherit from the inner header, or use the
         * route's hop limit for other protocols
         */
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        /* dev->needed_headroom is only ever raised here, never lowered */
        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
819
820 static void ip_tunnel_update(struct ip_tunnel_net *itn,
821                              struct ip_tunnel *t,
822                              struct net_device *dev,
823                              struct ip_tunnel_parm *p,
824                              bool set_mtu,
825                              __u32 fwmark)
826 {
827         ip_tunnel_del(itn, t);
828         t->parms.iph.saddr = p->iph.saddr;
829         t->parms.iph.daddr = p->iph.daddr;
830         t->parms.i_key = p->i_key;
831         t->parms.o_key = p->o_key;
832         if (dev->type != ARPHRD_ETHER) {
833                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
834                 memcpy(dev->broadcast, &p->iph.daddr, 4);
835         }
836         ip_tunnel_add(itn, t);
837
838         t->parms.iph.ttl = p->iph.ttl;
839         t->parms.iph.tos = p->iph.tos;
840         t->parms.iph.frag_off = p->iph.frag_off;
841
842         if (t->parms.link != p->link || t->fwmark != fwmark) {
843                 int mtu;
844
845                 t->parms.link = p->link;
846                 t->fwmark = fwmark;
847                 mtu = ip_tunnel_bind_dev(dev);
848                 if (set_mtu)
849                         dev->mtu = mtu;
850         }
851         dst_cache_reset(&t->dst_cache);
852         netdev_state_change(dev);
853 }
854
855 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
856 {
857         int err = 0;
858         struct ip_tunnel *t = netdev_priv(dev);
859         struct net *net = t->net;
860         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
861
862         switch (cmd) {
863         case SIOCGETTUNNEL:
864                 if (dev == itn->fb_tunnel_dev) {
865                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
866                         if (!t)
867                                 t = netdev_priv(dev);
868                 }
869                 memcpy(p, &t->parms, sizeof(*p));
870                 break;
871
872         case SIOCADDTUNNEL:
873         case SIOCCHGTUNNEL:
874                 err = -EPERM;
875                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
876                         goto done;
877                 if (p->iph.ttl)
878                         p->iph.frag_off |= htons(IP_DF);
879                 if (!(p->i_flags & VTI_ISVTI)) {
880                         if (!(p->i_flags & TUNNEL_KEY))
881                                 p->i_key = 0;
882                         if (!(p->o_flags & TUNNEL_KEY))
883                                 p->o_key = 0;
884                 }
885
886                 t = ip_tunnel_find(itn, p, itn->type);
887
888                 if (cmd == SIOCADDTUNNEL) {
889                         if (!t) {
890                                 t = ip_tunnel_create(net, itn, p);
891                                 err = PTR_ERR_OR_ZERO(t);
892                                 break;
893                         }
894
895                         err = -EEXIST;
896                         break;
897                 }
898                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
899                         if (t) {
900                                 if (t->dev != dev) {
901                                         err = -EEXIST;
902                                         break;
903                                 }
904                         } else {
905                                 unsigned int nflags = 0;
906
907                                 if (ipv4_is_multicast(p->iph.daddr))
908                                         nflags = IFF_BROADCAST;
909                                 else if (p->iph.daddr)
910                                         nflags = IFF_POINTOPOINT;
911
912                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
913                                         err = -EINVAL;
914                                         break;
915                                 }
916
917                                 t = netdev_priv(dev);
918                         }
919                 }
920
921                 if (t) {
922                         err = 0;
923                         ip_tunnel_update(itn, t, dev, p, true, 0);
924                 } else {
925                         err = -ENOENT;
926                 }
927                 break;
928
929         case SIOCDELTUNNEL:
930                 err = -EPERM;
931                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
932                         goto done;
933
934                 if (dev == itn->fb_tunnel_dev) {
935                         err = -ENOENT;
936                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
937                         if (!t)
938                                 goto done;
939                         err = -EPERM;
940                         if (t == netdev_priv(itn->fb_tunnel_dev))
941                                 goto done;
942                         dev = t->dev;
943                 }
944                 unregister_netdevice(dev);
945                 err = 0;
946                 break;
947
948         default:
949                 err = -EINVAL;
950         }
951
952 done:
953         return err;
954 }
955 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
956
957 int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
958 {
959         struct ip_tunnel_parm p;
960         int err;
961
962         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
963                 return -EFAULT;
964         err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
965         if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
966                 return -EFAULT;
967         return err;
968 }
969 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
970
971 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
972 {
973         struct ip_tunnel *tunnel = netdev_priv(dev);
974         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
975         int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
976
977         if (new_mtu < ETH_MIN_MTU)
978                 return -EINVAL;
979
980         if (new_mtu > max_mtu) {
981                 if (strict)
982                         return -EINVAL;
983
984                 new_mtu = max_mtu;
985         }
986
987         dev->mtu = new_mtu;
988         return 0;
989 }
990 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
991
992 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
993 {
994         return __ip_tunnel_change_mtu(dev, new_mtu, true);
995 }
996 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
997
998 static void ip_tunnel_dev_free(struct net_device *dev)
999 {
1000         struct ip_tunnel *tunnel = netdev_priv(dev);
1001
1002         gro_cells_destroy(&tunnel->gro_cells);
1003         dst_cache_destroy(&tunnel->dst_cache);
1004         free_percpu(dev->tstats);
1005 }
1006
1007 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1008 {
1009         struct ip_tunnel *tunnel = netdev_priv(dev);
1010         struct ip_tunnel_net *itn;
1011
1012         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1013
1014         if (itn->fb_tunnel_dev != dev) {
1015                 ip_tunnel_del(itn, netdev_priv(dev));
1016                 unregister_netdevice_queue(dev, head);
1017         }
1018 }
1019 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1020
1021 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1022 {
1023         struct ip_tunnel *tunnel = netdev_priv(dev);
1024
1025         return tunnel->net;
1026 }
1027 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1028
1029 int ip_tunnel_get_iflink(const struct net_device *dev)
1030 {
1031         struct ip_tunnel *tunnel = netdev_priv(dev);
1032
1033         return tunnel->parms.link;
1034 }
1035 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1036
1037 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1038                                   struct rtnl_link_ops *ops, char *devname)
1039 {
1040         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1041         struct ip_tunnel_parm parms;
1042         unsigned int i;
1043
1044         itn->rtnl_link_ops = ops;
1045         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1046                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1047
1048         if (!ops || !net_has_fallback_tunnels(net)) {
1049                 struct ip_tunnel_net *it_init_net;
1050
1051                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1052                 itn->type = it_init_net->type;
1053                 itn->fb_tunnel_dev = NULL;
1054                 return 0;
1055         }
1056
1057         memset(&parms, 0, sizeof(parms));
1058         if (devname)
1059                 strlcpy(parms.name, devname, IFNAMSIZ);
1060
1061         rtnl_lock();
1062         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1063         /* FB netdevice is special: we have one, and only one per netns.
1064          * Allowing to move it to another netns is clearly unsafe.
1065          */
1066         if (!IS_ERR(itn->fb_tunnel_dev)) {
1067                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1068                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1069                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1070                 itn->type = itn->fb_tunnel_dev->type;
1071         }
1072         rtnl_unlock();
1073
1074         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1075 }
1076 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1077
1078 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1079                               struct list_head *head,
1080                               struct rtnl_link_ops *ops)
1081 {
1082         struct net_device *dev, *aux;
1083         int h;
1084
1085         for_each_netdev_safe(net, dev, aux)
1086                 if (dev->rtnl_link_ops == ops)
1087                         unregister_netdevice_queue(dev, head);
1088
1089         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1090                 struct ip_tunnel *t;
1091                 struct hlist_node *n;
1092                 struct hlist_head *thead = &itn->tunnels[h];
1093
1094                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1095                         /* If dev is in the same netns, it has already
1096                          * been added to the list by the previous loop.
1097                          */
1098                         if (!net_eq(dev_net(t->dev), net))
1099                                 unregister_netdevice_queue(t->dev, head);
1100         }
1101 }
1102
1103 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1104                            struct rtnl_link_ops *ops)
1105 {
1106         struct ip_tunnel_net *itn;
1107         struct net *net;
1108         LIST_HEAD(list);
1109
1110         rtnl_lock();
1111         list_for_each_entry(net, net_list, exit_list) {
1112                 itn = net_generic(net, id);
1113                 ip_tunnel_destroy(net, itn, &list, ops);
1114         }
1115         unregister_netdevice_many(&list);
1116         rtnl_unlock();
1117 }
1118 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1119
1120 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1121                       struct ip_tunnel_parm *p, __u32 fwmark)
1122 {
1123         struct ip_tunnel *nt;
1124         struct net *net = dev_net(dev);
1125         struct ip_tunnel_net *itn;
1126         int mtu;
1127         int err;
1128
1129         nt = netdev_priv(dev);
1130         itn = net_generic(net, nt->ip_tnl_net_id);
1131
1132         if (nt->collect_md) {
1133                 if (rtnl_dereference(itn->collect_md_tun))
1134                         return -EEXIST;
1135         } else {
1136                 if (ip_tunnel_find(itn, p, dev->type))
1137                         return -EEXIST;
1138         }
1139
1140         nt->net = net;
1141         nt->parms = *p;
1142         nt->fwmark = fwmark;
1143         err = register_netdevice(dev);
1144         if (err)
1145                 goto err_register_netdevice;
1146
1147         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1148                 eth_hw_addr_random(dev);
1149
1150         mtu = ip_tunnel_bind_dev(dev);
1151         if (tb[IFLA_MTU]) {
1152                 unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1153
1154                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1155                             (unsigned int)(max - sizeof(struct iphdr)));
1156         }
1157
1158         err = dev_set_mtu(dev, mtu);
1159         if (err)
1160                 goto err_dev_set_mtu;
1161
1162         ip_tunnel_add(itn, nt);
1163         return 0;
1164
1165 err_dev_set_mtu:
1166         unregister_netdevice(dev);
1167 err_register_netdevice:
1168         return err;
1169 }
1170 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1171
1172 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1173                          struct ip_tunnel_parm *p, __u32 fwmark)
1174 {
1175         struct ip_tunnel *t;
1176         struct ip_tunnel *tunnel = netdev_priv(dev);
1177         struct net *net = tunnel->net;
1178         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1179
1180         if (dev == itn->fb_tunnel_dev)
1181                 return -EINVAL;
1182
1183         t = ip_tunnel_find(itn, p, dev->type);
1184
1185         if (t) {
1186                 if (t->dev != dev)
1187                         return -EEXIST;
1188         } else {
1189                 t = tunnel;
1190
1191                 if (dev->type != ARPHRD_ETHER) {
1192                         unsigned int nflags = 0;
1193
1194                         if (ipv4_is_multicast(p->iph.daddr))
1195                                 nflags = IFF_BROADCAST;
1196                         else if (p->iph.daddr)
1197                                 nflags = IFF_POINTOPOINT;
1198
1199                         if ((dev->flags ^ nflags) &
1200                             (IFF_POINTOPOINT | IFF_BROADCAST))
1201                                 return -EINVAL;
1202                 }
1203         }
1204
1205         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1206         return 0;
1207 }
1208 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1209
1210 int ip_tunnel_init(struct net_device *dev)
1211 {
1212         struct ip_tunnel *tunnel = netdev_priv(dev);
1213         struct iphdr *iph = &tunnel->parms.iph;
1214         int err;
1215
1216         dev->needs_free_netdev = true;
1217         dev->priv_destructor = ip_tunnel_dev_free;
1218         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1219         if (!dev->tstats)
1220                 return -ENOMEM;
1221
1222         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1223         if (err) {
1224                 free_percpu(dev->tstats);
1225                 return err;
1226         }
1227
1228         err = gro_cells_init(&tunnel->gro_cells, dev);
1229         if (err) {
1230                 dst_cache_destroy(&tunnel->dst_cache);
1231                 free_percpu(dev->tstats);
1232                 return err;
1233         }
1234
1235         tunnel->dev = dev;
1236         tunnel->net = dev_net(dev);
1237         strcpy(tunnel->parms.name, dev->name);
1238         iph->version            = 4;
1239         iph->ihl                = 5;
1240
1241         if (tunnel->collect_md)
1242                 netif_keep_dst(dev);
1243         return 0;
1244 }
1245 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1246
1247 void ip_tunnel_uninit(struct net_device *dev)
1248 {
1249         struct ip_tunnel *tunnel = netdev_priv(dev);
1250         struct net *net = tunnel->net;
1251         struct ip_tunnel_net *itn;
1252
1253         itn = net_generic(net, tunnel->ip_tnl_net_id);
1254         ip_tunnel_del(itn, netdev_priv(dev));
1255         if (itn->fb_tunnel_dev == dev)
1256                 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1257
1258         dst_cache_reset(&tunnel->dst_cache);
1259 }
1260 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1261
1262 /* Do least required initialization, rest of init is done in tunnel_init call */
1263 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1264 {
1265         struct ip_tunnel *tunnel = netdev_priv(dev);
1266         tunnel->ip_tnl_net_id = net_id;
1267 }
1268 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1269
1270 MODULE_LICENSE("GPL");