net/ipv4/ip_tunnel.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}
/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets, if not matched against a configured keyless
 * tunnel, will match the fallback tunnel.
 * Given src, dst and key, find the appropriate tunnel for input.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;
        struct net_device *ndev;
        unsigned int hash;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t && t->dev->flags & IFF_UP)
                return t;

        ndev = READ_ONCE(itn->fb_tunnel_dev);
        if (ndev && ndev->flags & IFF_UP)
                return netdev_priv(ndev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
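
/* Usage sketch (illustrative only, not part of this file): a GRE-style
 * receive path typically resolves the tunnel from the outer IP header
 * and the parsed tunnel metadata, roughly:
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *				  iph->saddr, iph->daddr, tpi->key);
 *	if (tunnel)
 *		return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
 *
 * Here itn, tpi and log_ecn_error are assumed to come from the caller's
 * context; see net/ipv4/ip_gre.c for a real caller.
 */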

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, t);
        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        __be16 flags = parms->i_flags;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }
        return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        err = -E2BIG;
        if (parms->name[0]) {
                if (!dev_valid_name(parms->name))
                        goto failed;
                strlcpy(name, parms->name, IFNAMSIZ);
        } else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3))
                        goto failed;
                strcpy(name, ops->kind);
                strcat(name, "%d");
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
                                    iph->saddr, tunnel->parms.o_key,
                                    RT_TOS(iph->tos), tunnel->parms.link,
                                    tunnel->fwmark, 0);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                dst_cache_reset(&tunnel->dst_cache);
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = min(tdev->mtu, IP_MAX_MTU);
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= t_hlen;

        if (mtu < IPV4_MIN_MTU)
                mtu = IPV4_MIN_MTU;

        return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt;
        struct net_device *dev;
        int t_hlen;
        int mtu;
        int err;

        dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        mtu = ip_tunnel_bind_dev(dev);
        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        nt = netdev_priv(dev);
        t_hlen = nt->hlen + sizeof(struct iphdr);
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU - t_hlen;
        ip_tunnel_add(itn, nt);
        return nt;

err_dev_set_mtu:
        unregister_netdevice(dev);
        return ERR_PTR(err);
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        dev_sw_netstats_rx_add(tunnel->dev, skb->len);
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        if (tun_dst)
                dst_release((struct dst_entry *)tun_dst);
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
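
/* Registration sketch (illustrative): an encapsulation module such as
 * FOU registers its ops for a fixed slot at init time and removes them
 * on exit, roughly:
 *
 *	err = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 *	...
 *	ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 *
 * fou_iptun_ops is an assumed name here; see net/ipv4/fou.c for the
 * real caller.
 */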

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
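
/* Caller sketch (illustrative): tunnel drivers fill a struct
 * ip_tunnel_encap from netlink attributes and hand it to this helper,
 * roughly:
 *
 *	if (ipgre_netlink_encap_parms(data, &ipencap)) {
 *		err = ip_tunnel_encap_setup(netdev_priv(dev), &ipencap);
 *		if (err < 0)
 *			return err;
 *	}
 *
 * ipgre_netlink_encap_parms is an assumed helper name; see
 * net/ipv4/ip_gre.c for a real example.
 */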

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                           struct rtable *rt, __be16 df,
                           const struct iphdr *inner_iph,
                           int tunnel_hlen, __be32 dst, bool md)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size;
        int mtu;

        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen;

        if (df)
                mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
        else
                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_valid_dst(skb))
                skb_dst_update_pmtu_no_confirm(skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6;
                __be32 daddr;

                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
                                           NULL;
                daddr = md ? dst : tunnel->parms.iph.daddr;

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((daddr && !ipv4_is_multicast(daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       u8 proto, int tunnel_hlen)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;
        bool use_cache;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        tos = key->tos;
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
                            0, skb->mark, skb_get_hash(skb));
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;

        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);
                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
        }
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
                            key->u.ipv4.dst, true)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = key->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > dev->needed_headroom)
                dev->needed_headroom = headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;               /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                }
                else if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                if (!md)
                        connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off & htons(IP_DF));

        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu,
                             __u32 fwmark)
{
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;

                t->parms.link = p->link;
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true, 0);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
        struct ip_tunnel_parm p;
        int err;

        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                return -EFAULT;
        err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
        if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                return -EFAULT;
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
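
/* Wiring sketch (illustrative): a tunnel driver routes the classic
 * ioctl interface through this helper and supplies its own
 * ndo_tunnel_ctl implementation, roughly:
 *
 *	static const struct net_device_ops ipip_netdev_ops = {
 *		...
 *		.ndo_do_ioctl	= ip_tunnel_ioctl,
 *		.ndo_tunnel_ctl	= ipip_tunnel_ctl,
 *	};
 *
 * ipip_tunnel_ctl is an assumed name; see net/ipv4/ipip.c for a real
 * example.
 */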

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = IP_MAX_MTU - t_hlen;

        if (new_mtu < ETH_MIN_MTU)
                return -EINVAL;

        if (new_mtu > max_mtu) {
                if (strict)
                        return -EINVAL;

                new_mtu = max_mtu;
        }

        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        dst_cache_destroy(&tunnel->dst_cache);
        free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(itn, netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        itn->rtnl_link_ops = ops;
        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops || !net_has_fallback_tunnels(net)) {
                struct ip_tunnel_net *it_init_net;

                it_init_net = net_generic(&init_net, ip_tnl_net_id);
                itn->type = it_init_net->type;
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing to move it to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
                itn->type = itn->fb_tunnel_dev->type;
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
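
/* Pernet sketch (illustrative): a driver's pernet init typically
 * delegates to this helper with its own net id, link ops and fallback
 * device name, e.g.
 *
 *	static int __net_init ipip_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipip_net_id,
 *					  &ipip_link_ops, "tunl0");
 *	}
 *
 * ipip_net_id and ipip_link_ops are assumed names; see net/ipv4/ipip.c.
 */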

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
                              struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
                           struct rtnl_link_ops *ops)
{
        struct ip_tunnel_net *itn;
        struct net *net;
        LIST_HEAD(list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                itn = net_generic(net, id);
                ip_tunnel_destroy(net, itn, &list, ops);
        }
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
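
/* Teardown sketch (illustrative): the matching pernet exit_batch hook
 * usually reduces to a single call, e.g.
 *
 *	static void __net_exit ipip_exit_batch_net(struct list_head *list_net)
 *	{
 *		ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops);
 *	}
 *
 * Again, ipip_net_id and ipip_link_ops are assumed names.
 */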

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (nt->collect_md) {
                if (rtnl_dereference(itn->collect_md_tun))
                        return -EEXIST;
        } else {
                if (ip_tunnel_find(itn, p, dev->type))
                        return -EEXIST;
        }

        nt->net = net;
        nt->parms = *p;
        nt->fwmark = fwmark;
        err = register_netdevice(dev);
        if (err)
                goto err_register_netdevice;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (tb[IFLA_MTU]) {
                unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
        }

        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        ip_tunnel_add(itn, nt);
        return 0;

err_dev_set_mtu:
        unregister_netdevice(dev);
err_register_netdevice:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
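
/* Newlink sketch (illustrative): an rtnl_link_ops->newlink handler
 * usually parses its attributes into an ip_tunnel_parm and delegates,
 * roughly:
 *
 *	ipip_netlink_parms(data, &p, &t->collect_md, &fwmark);
 *	return ip_tunnel_newlink(dev, tb, &p, fwmark);
 *
 * ipip_netlink_parms is an assumed helper name; see net/ipv4/ipip.c.
 */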

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->needs_free_netdev = true;
        dev->priv_destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
        if (err) {
                free_percpu(dev->tstats);
                return err;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                dst_cache_destroy(&tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        if (tunnel->collect_md)
                netif_keep_dst(dev);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        ip_tunnel_del(itn, netdev_priv(dev));
        if (itn->fb_tunnel_dev == dev)
                WRITE_ONCE(itn->fb_tunnel_dev, NULL);

        dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
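
/* Setup sketch (illustrative): rtnl_link_ops->setup callbacks call this
 * early so the net id is available before ip_tunnel_init() runs, e.g.
 *
 *	static void ipip_tunnel_setup(struct net_device *dev)
 *	{
 *		...
 *		ip_tunnel_setup(dev, ipip_net_id);
 *	}
 *
 * ipip_net_id is an assumed name; see net/ipv4/ipip.c.
 */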

MODULE_LICENSE("GPL");