net/ipv4/ip_tunnel.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

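/* Check whether the key configured on a tunnel matches the key (or
 * absence of a key) carried by a packet: a keyed tunnel matches only
 * packets presenting the same key, and a keyless tunnel matches only
 * keyless packets.
 */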
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;
        struct net_device *ndev;
        unsigned int hash;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t && t->dev->flags & IFF_UP)
                return t;

        ndev = READ_ONCE(itn->fb_tunnel_dev);
        if (ndev && ndev->flags & IFF_UP)
                return netdev_priv(ndev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

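/* Select the hash bucket for a tunnel from its key and (unicast)
 * destination address.  Multicast destinations hash as if no
 * destination were configured, and VTI tunnels without TUNNEL_KEY
 * ignore i_key entirely.
 */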
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, t);
        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}

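/* Find a tunnel whose configuration exactly matches @parms (addresses,
 * link, device type and key); used to detect duplicates before
 * creating or changing a tunnel.  Returns NULL when no match exists.
 */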
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        __be16 flags = parms->i_flags;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }
        return t;
}

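/* Allocate and register a tunnel net_device.  An explicit name in
 * @parms is validated and used as-is; otherwise one is generated from
 * the rtnl_link_ops kind (e.g. "gre%d").  Must be called under RTNL.
 */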
static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        err = -E2BIG;
        if (parms->name[0]) {
                if (!dev_valid_name(parms->name))
                        goto failed;
                strlcpy(name, parms->name, IFNAMSIZ);
        } else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3))
                        goto failed;
                strcpy(name, ops->kind);
                strcat(name, "%d");
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

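/* Probe the route towards the tunnel destination to pick an underlying
 * device, and derive needed_headroom and a suitable MTU from it.
 * Returns the MTU the tunnel device should use, at least IPV4_MIN_MTU.
 */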
static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
                                    iph->saddr, tunnel->parms.o_key,
                                    RT_TOS(iph->tos), tunnel->parms.link,
                                    tunnel->fwmark, 0);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                dst_cache_reset(&tunnel->dst_cache);
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = min(tdev->mtu, IP_MAX_MTU);
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= (dev->hard_header_len + t_hlen);

        if (mtu < IPV4_MIN_MTU)
                mtu = IPV4_MIN_MTU;

        return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt;
        struct net_device *dev;
        int t_hlen;
        int mtu;
        int err;

        dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        mtu = ip_tunnel_bind_dev(dev);
        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        nt = netdev_priv(dev);
        t_hlen = nt->hlen + sizeof(struct iphdr);
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
        ip_tunnel_add(itn, nt);
        return nt;

err_dev_set_mtu:
        unregister_netdevice(dev);
        return ERR_PTR(err);
}

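/* Common receive path for IP tunnels: validate checksum and sequence
 * number expectations against the tunnel flags, decapsulate ECN,
 * update stats and hand the inner packet to GRO.  Consumes @skb and
 * @tun_dst in all cases and returns 0.
 */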
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        dev_sw_netstats_rx_add(tunnel->dev, skb->len);
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        if (tun_dst)
                dst_release((struct dst_entry *)tun_dst);
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

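/* Copy encapsulation parameters from @ipencap into the tunnel and
 * recompute the cached header lengths.  Returns a negative errno if
 * the encap type is unsupported.
 */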
int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

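/* Enforce path MTU on the inner packet: update the cached route MTU
 * and, if a DF-marked packet does not fit, emit ICMP fragmentation
 * needed (or ICMPv6 packet too big) and return -E2BIG.
 */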
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph,
                            int tunnel_hlen, __be32 dst, bool md)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size;
        int mtu;

        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel_hlen;
        else
                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_valid_dst(skb))
                skb_dst_update_pmtu_no_confirm(skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6;
                __be32 daddr;

                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
                                           NULL;
                daddr = md ? dst : tunnel->parms.iph.daddr;

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((daddr && !ipv4_is_multicast(daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

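/* Transmit path for collect_md tunnels: addressing, key, TOS and TTL
 * come from the per-packet tunnel metadata rather than from the
 * device configuration.
 */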
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       u8 proto, int tunnel_hlen)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;
        bool use_cache;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        tos = key->tos;
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
                            0, skb->mark, skb_get_hash(skb));
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;

        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);
                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
        }
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
                            key->u.ipv4.dst, true)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = key->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        if (!df && skb->protocol == htons(ETH_P_IP))
                df = inner_iph->frag_off & htons(IP_DF);

        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > dev->needed_headroom)
                dev->needed_headroom = headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

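/* Main transmit path for configured tunnels.  Resolves the outer
 * destination (including the NBMA case, where it is recovered from the
 * inner headers or the neighbour entry), routes the outer packet using
 * the cached dst when possible, applies PMTU and ECN handling, and
 * hands the result to iptunnel_xmit().
 */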
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;               /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                } else if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                if (!md)
                        connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
                            0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off&htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

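/* Apply new parameters to an existing tunnel, rehashing it since the
 * addresses and key that determine its bucket may change, and rebind
 * the underlying device if the link or fwmark changed.
 */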
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu,
                             __u32 fwmark)
{
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;

                t->parms.link = p->link;
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}

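/* Shared SIOC{GET,ADD,CHG,DEL}TUNNEL handler used as ndo_tunnel_ctl by
 * the IPv4 tunnel drivers.  Add, change and delete require
 * CAP_NET_ADMIN in the tunnel's user namespace; the fallback device
 * itself cannot be deleted.
 */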
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true, 0);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
        struct ip_tunnel_parm p;
        int err;

        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                return -EFAULT;
        err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
        if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                return -EFAULT;
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

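/* Validate and set a new MTU.  If @strict, an MTU above what the outer
 * headers allow is rejected with -EINVAL; otherwise it is clamped to
 * the maximum.
 */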
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

        if (new_mtu < ETH_MIN_MTU)
                return -EINVAL;

        if (new_mtu > max_mtu) {
                if (strict)
                        return -EINVAL;

                new_mtu = max_mtu;
        }

        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        dst_cache_destroy(&tunnel->dst_cache);
        free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(itn, netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

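/* Per-netns initialisation: set up the hash table and, where fallback
 * tunnels are enabled, create the netns-local fallback device.
 */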
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        itn->rtnl_link_ops = ops;
        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops || !net_has_fallback_tunnels(net)) {
                struct ip_tunnel_net *it_init_net;

                it_init_net = net_generic(&init_net, ip_tnl_net_id);
                itn->type = it_init_net->type;
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing to move it to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
                itn->type = itn->fb_tunnel_dev->type;
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
                              struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
                           struct rtnl_link_ops *ops)
{
        struct ip_tunnel_net *itn;
        struct net *net;
        LIST_HEAD(list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                itn = net_generic(net, id);
                ip_tunnel_destroy(net, itn, &list, ops);
        }
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

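/* rtnl_link newlink helper: reject duplicates (only one collect_md
 * tunnel is allowed per netns), register the device, derive or clamp
 * the MTU and add the tunnel to the hash table.
 */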
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (nt->collect_md) {
                if (rtnl_dereference(itn->collect_md_tun))
                        return -EEXIST;
        } else {
                if (ip_tunnel_find(itn, p, dev->type))
                        return -EEXIST;
        }

        nt->net = net;
        nt->parms = *p;
        nt->fwmark = fwmark;
        err = register_netdevice(dev);
        if (err)
                goto err_register_netdevice;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (tb[IFLA_MTU]) {
                unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
                            (unsigned int)(max - sizeof(struct iphdr)));
        }

        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        ip_tunnel_add(itn, nt);
        return 0;

err_dev_set_mtu:
        unregister_netdevice(dev);
err_register_netdevice:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

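/* ndo_init helper for tunnel devices: allocate per-cpu stats, the dst
 * cache and GRO cells, and initialise the outer IP header template.
 */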
int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->needs_free_netdev = true;
        dev->priv_destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
        if (err) {
                free_percpu(dev->tstats);
                return err;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                dst_cache_destroy(&tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        if (tunnel->collect_md)
                netif_keep_dst(dev);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        ip_tunnel_del(itn, netdev_priv(dev));
        if (itn->fb_tunnel_dev == dev)
                WRITE_ONCE(itn->fb_tunnel_dev, NULL);

        dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");