x86/hyperv: Remove aliases with X64 in their name
[linux-2.6-microblaze.git] / net / ipv4 / ip_tunnel.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55         return hash_32((__force u32)key ^ (__force u32)remote,
56                          IP_TNL_HASH_BITS);
57 }
58
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60                                 __be16 flags, __be32 key)
61 {
62         if (p->i_flags & TUNNEL_KEY) {
63                 if (flags & TUNNEL_KEY)
64                         return key == p->i_key;
65                 else
66                         /* key expected, none present */
67                         return false;
68         } else
69                 return !(flags & TUNNEL_KEY);
70 }
71
72 /* Fallback tunnel: no source, no destination, no key, no options
73
74    Tunnel hash table:
75    We require exact key match i.e. if a key is present in packet
76    it will match only tunnel with the same key; if it is not present,
77    it will match only keyless tunnel.
78
79    All keysless packets, if not matched configured keyless tunnels
80    will match fallback tunnel.
81    Given src, dst and key, find appropriate for input tunnel.
82 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84                                    int link, __be16 flags,
85                                    __be32 remote, __be32 local,
86                                    __be32 key)
87 {
88         struct ip_tunnel *t, *cand = NULL;
89         struct hlist_head *head;
90         struct net_device *ndev;
91         unsigned int hash;
92
93         hash = ip_tunnel_hash(key, remote);
94         head = &itn->tunnels[hash];
95
96         hlist_for_each_entry_rcu(t, head, hash_node) {
97                 if (local != t->parms.iph.saddr ||
98                     remote != t->parms.iph.daddr ||
99                     !(t->dev->flags & IFF_UP))
100                         continue;
101
102                 if (!ip_tunnel_key_match(&t->parms, flags, key))
103                         continue;
104
105                 if (t->parms.link == link)
106                         return t;
107                 else
108                         cand = t;
109         }
110
111         hlist_for_each_entry_rcu(t, head, hash_node) {
112                 if (remote != t->parms.iph.daddr ||
113                     t->parms.iph.saddr != 0 ||
114                     !(t->dev->flags & IFF_UP))
115                         continue;
116
117                 if (!ip_tunnel_key_match(&t->parms, flags, key))
118                         continue;
119
120                 if (t->parms.link == link)
121                         return t;
122                 else if (!cand)
123                         cand = t;
124         }
125
126         hash = ip_tunnel_hash(key, 0);
127         head = &itn->tunnels[hash];
128
129         hlist_for_each_entry_rcu(t, head, hash_node) {
130                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
132                         continue;
133
134                 if (!(t->dev->flags & IFF_UP))
135                         continue;
136
137                 if (!ip_tunnel_key_match(&t->parms, flags, key))
138                         continue;
139
140                 if (t->parms.link == link)
141                         return t;
142                 else if (!cand)
143                         cand = t;
144         }
145
146         hlist_for_each_entry_rcu(t, head, hash_node) {
147                 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148                     t->parms.iph.saddr != 0 ||
149                     t->parms.iph.daddr != 0 ||
150                     !(t->dev->flags & IFF_UP))
151                         continue;
152
153                 if (t->parms.link == link)
154                         return t;
155                 else if (!cand)
156                         cand = t;
157         }
158
159         if (cand)
160                 return cand;
161
162         t = rcu_dereference(itn->collect_md_tun);
163         if (t && t->dev->flags & IFF_UP)
164                 return t;
165
166         ndev = READ_ONCE(itn->fb_tunnel_dev);
167         if (ndev && ndev->flags & IFF_UP)
168                 return netdev_priv(ndev);
169
170         return NULL;
171 }
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
173
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175                                     struct ip_tunnel_parm *parms)
176 {
177         unsigned int h;
178         __be32 remote;
179         __be32 i_key = parms->i_key;
180
181         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182                 remote = parms->iph.daddr;
183         else
184                 remote = 0;
185
186         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
187                 i_key = 0;
188
189         h = ip_tunnel_hash(i_key, remote);
190         return &itn->tunnels[h];
191 }
192
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
194 {
195         struct hlist_head *head = ip_bucket(itn, &t->parms);
196
197         if (t->collect_md)
198                 rcu_assign_pointer(itn->collect_md_tun, t);
199         hlist_add_head_rcu(&t->hash_node, head);
200 }
201
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
203 {
204         if (t->collect_md)
205                 rcu_assign_pointer(itn->collect_md_tun, NULL);
206         hlist_del_init_rcu(&t->hash_node);
207 }
208
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210                                         struct ip_tunnel_parm *parms,
211                                         int type)
212 {
213         __be32 remote = parms->iph.daddr;
214         __be32 local = parms->iph.saddr;
215         __be32 key = parms->i_key;
216         __be16 flags = parms->i_flags;
217         int link = parms->link;
218         struct ip_tunnel *t = NULL;
219         struct hlist_head *head = ip_bucket(itn, parms);
220
221         hlist_for_each_entry_rcu(t, head, hash_node) {
222                 if (local == t->parms.iph.saddr &&
223                     remote == t->parms.iph.daddr &&
224                     link == t->parms.link &&
225                     type == t->dev->type &&
226                     ip_tunnel_key_match(&t->parms, flags, key))
227                         break;
228         }
229         return t;
230 }
231
232 static struct net_device *__ip_tunnel_create(struct net *net,
233                                              const struct rtnl_link_ops *ops,
234                                              struct ip_tunnel_parm *parms)
235 {
236         int err;
237         struct ip_tunnel *tunnel;
238         struct net_device *dev;
239         char name[IFNAMSIZ];
240
241         err = -E2BIG;
242         if (parms->name[0]) {
243                 if (!dev_valid_name(parms->name))
244                         goto failed;
245                 strlcpy(name, parms->name, IFNAMSIZ);
246         } else {
247                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
248                         goto failed;
249                 strcpy(name, ops->kind);
250                 strcat(name, "%d");
251         }
252
253         ASSERT_RTNL();
254         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
255         if (!dev) {
256                 err = -ENOMEM;
257                 goto failed;
258         }
259         dev_net_set(dev, net);
260
261         dev->rtnl_link_ops = ops;
262
263         tunnel = netdev_priv(dev);
264         tunnel->parms = *parms;
265         tunnel->net = net;
266
267         err = register_netdevice(dev);
268         if (err)
269                 goto failed_free;
270
271         return dev;
272
273 failed_free:
274         free_netdev(dev);
275 failed:
276         return ERR_PTR(err);
277 }
278
279 static int ip_tunnel_bind_dev(struct net_device *dev)
280 {
281         struct net_device *tdev = NULL;
282         struct ip_tunnel *tunnel = netdev_priv(dev);
283         const struct iphdr *iph;
284         int hlen = LL_MAX_HEADER;
285         int mtu = ETH_DATA_LEN;
286         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
287
288         iph = &tunnel->parms.iph;
289
290         /* Guess output device to choose reasonable mtu and needed_headroom */
291         if (iph->daddr) {
292                 struct flowi4 fl4;
293                 struct rtable *rt;
294
295                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296                                     iph->saddr, tunnel->parms.o_key,
297                                     RT_TOS(iph->tos), tunnel->parms.link,
298                                     tunnel->fwmark, 0);
299                 rt = ip_route_output_key(tunnel->net, &fl4);
300
301                 if (!IS_ERR(rt)) {
302                         tdev = rt->dst.dev;
303                         ip_rt_put(rt);
304                 }
305                 if (dev->type != ARPHRD_ETHER)
306                         dev->flags |= IFF_POINTOPOINT;
307
308                 dst_cache_reset(&tunnel->dst_cache);
309         }
310
311         if (!tdev && tunnel->parms.link)
312                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
313
314         if (tdev) {
315                 hlen = tdev->hard_header_len + tdev->needed_headroom;
316                 mtu = min(tdev->mtu, IP_MAX_MTU);
317         }
318
319         dev->needed_headroom = t_hlen + hlen;
320         mtu -= (dev->hard_header_len + t_hlen);
321
322         if (mtu < IPV4_MIN_MTU)
323                 mtu = IPV4_MIN_MTU;
324
325         return mtu;
326 }
327
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329                                           struct ip_tunnel_net *itn,
330                                           struct ip_tunnel_parm *parms)
331 {
332         struct ip_tunnel *nt;
333         struct net_device *dev;
334         int t_hlen;
335         int mtu;
336         int err;
337
338         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
339         if (IS_ERR(dev))
340                 return ERR_CAST(dev);
341
342         mtu = ip_tunnel_bind_dev(dev);
343         err = dev_set_mtu(dev, mtu);
344         if (err)
345                 goto err_dev_set_mtu;
346
347         nt = netdev_priv(dev);
348         t_hlen = nt->hlen + sizeof(struct iphdr);
349         dev->min_mtu = ETH_MIN_MTU;
350         dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
351         ip_tunnel_add(itn, nt);
352         return nt;
353
354 err_dev_set_mtu:
355         unregister_netdevice(dev);
356         return ERR_PTR(err);
357 }
358
359 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
360                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
361                   bool log_ecn_error)
362 {
363         struct pcpu_sw_netstats *tstats;
364         const struct iphdr *iph = ip_hdr(skb);
365         int err;
366
367 #ifdef CONFIG_NET_IPGRE_BROADCAST
368         if (ipv4_is_multicast(iph->daddr)) {
369                 tunnel->dev->stats.multicast++;
370                 skb->pkt_type = PACKET_BROADCAST;
371         }
372 #endif
373
374         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
375              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
376                 tunnel->dev->stats.rx_crc_errors++;
377                 tunnel->dev->stats.rx_errors++;
378                 goto drop;
379         }
380
381         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
382                 if (!(tpi->flags&TUNNEL_SEQ) ||
383                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
384                         tunnel->dev->stats.rx_fifo_errors++;
385                         tunnel->dev->stats.rx_errors++;
386                         goto drop;
387                 }
388                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
389         }
390
391         skb_reset_network_header(skb);
392
393         err = IP_ECN_decapsulate(iph, skb);
394         if (unlikely(err)) {
395                 if (log_ecn_error)
396                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
397                                         &iph->saddr, iph->tos);
398                 if (err > 1) {
399                         ++tunnel->dev->stats.rx_frame_errors;
400                         ++tunnel->dev->stats.rx_errors;
401                         goto drop;
402                 }
403         }
404
405         tstats = this_cpu_ptr(tunnel->dev->tstats);
406         u64_stats_update_begin(&tstats->syncp);
407         tstats->rx_packets++;
408         tstats->rx_bytes += skb->len;
409         u64_stats_update_end(&tstats->syncp);
410
411         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
412
413         if (tunnel->dev->type == ARPHRD_ETHER) {
414                 skb->protocol = eth_type_trans(skb, tunnel->dev);
415                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
416         } else {
417                 skb->dev = tunnel->dev;
418         }
419
420         if (tun_dst)
421                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
422
423         gro_cells_receive(&tunnel->gro_cells, skb);
424         return 0;
425
426 drop:
427         if (tun_dst)
428                 dst_release((struct dst_entry *)tun_dst);
429         kfree_skb(skb);
430         return 0;
431 }
432 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
433
434 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
435                             unsigned int num)
436 {
437         if (num >= MAX_IPTUN_ENCAP_OPS)
438                 return -ERANGE;
439
440         return !cmpxchg((const struct ip_tunnel_encap_ops **)
441                         &iptun_encaps[num],
442                         NULL, ops) ? 0 : -1;
443 }
444 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
445
446 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
447                             unsigned int num)
448 {
449         int ret;
450
451         if (num >= MAX_IPTUN_ENCAP_OPS)
452                 return -ERANGE;
453
454         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
455                        &iptun_encaps[num],
456                        ops, NULL) == ops) ? 0 : -1;
457
458         synchronize_net();
459
460         return ret;
461 }
462 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
463
464 int ip_tunnel_encap_setup(struct ip_tunnel *t,
465                           struct ip_tunnel_encap *ipencap)
466 {
467         int hlen;
468
469         memset(&t->encap, 0, sizeof(t->encap));
470
471         hlen = ip_encap_hlen(ipencap);
472         if (hlen < 0)
473                 return hlen;
474
475         t->encap.type = ipencap->type;
476         t->encap.sport = ipencap->sport;
477         t->encap.dport = ipencap->dport;
478         t->encap.flags = ipencap->flags;
479
480         t->encap_hlen = hlen;
481         t->hlen = t->encap_hlen + t->tun_hlen;
482
483         return 0;
484 }
485 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
486
487 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
488                             struct rtable *rt, __be16 df,
489                             const struct iphdr *inner_iph,
490                             int tunnel_hlen, __be32 dst, bool md)
491 {
492         struct ip_tunnel *tunnel = netdev_priv(dev);
493         int pkt_size;
494         int mtu;
495
496         tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
497         pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;
498
499         if (df)
500                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
501                                         - sizeof(struct iphdr) - tunnel_hlen;
502         else
503                 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
504
505         if (skb_valid_dst(skb))
506                 skb_dst_update_pmtu_no_confirm(skb, mtu);
507
508         if (skb->protocol == htons(ETH_P_IP)) {
509                 if (!skb_is_gso(skb) &&
510                     (inner_iph->frag_off & htons(IP_DF)) &&
511                     mtu < pkt_size) {
512                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
513                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
514                         return -E2BIG;
515                 }
516         }
517 #if IS_ENABLED(CONFIG_IPV6)
518         else if (skb->protocol == htons(ETH_P_IPV6)) {
519                 struct rt6_info *rt6;
520                 __be32 daddr;
521
522                 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
523                                            NULL;
524                 daddr = md ? dst : tunnel->parms.iph.daddr;
525
526                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
527                            mtu >= IPV6_MIN_MTU) {
528                         if ((daddr && !ipv4_is_multicast(daddr)) ||
529                             rt6->rt6i_dst.plen == 128) {
530                                 rt6->rt6i_flags |= RTF_MODIFIED;
531                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
532                         }
533                 }
534
535                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
536                                         mtu < pkt_size) {
537                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
538                         return -E2BIG;
539                 }
540         }
541 #endif
542         return 0;
543 }
544
545 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
546                        u8 proto, int tunnel_hlen)
547 {
548         struct ip_tunnel *tunnel = netdev_priv(dev);
549         u32 headroom = sizeof(struct iphdr);
550         struct ip_tunnel_info *tun_info;
551         const struct ip_tunnel_key *key;
552         const struct iphdr *inner_iph;
553         struct rtable *rt = NULL;
554         struct flowi4 fl4;
555         __be16 df = 0;
556         u8 tos, ttl;
557         bool use_cache;
558
559         tun_info = skb_tunnel_info(skb);
560         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
561                      ip_tunnel_info_af(tun_info) != AF_INET))
562                 goto tx_error;
563         key = &tun_info->key;
564         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
565         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
566         tos = key->tos;
567         if (tos == 1) {
568                 if (skb->protocol == htons(ETH_P_IP))
569                         tos = inner_iph->tos;
570                 else if (skb->protocol == htons(ETH_P_IPV6))
571                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
572         }
573         ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
574                             tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
575                             0, skb->mark, skb_get_hash(skb));
576         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
577                 goto tx_error;
578
579         use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
580         if (use_cache)
581                 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
582         if (!rt) {
583                 rt = ip_route_output_key(tunnel->net, &fl4);
584                 if (IS_ERR(rt)) {
585                         dev->stats.tx_carrier_errors++;
586                         goto tx_error;
587                 }
588                 if (use_cache)
589                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
590                                           fl4.saddr);
591         }
592         if (rt->dst.dev == dev) {
593                 ip_rt_put(rt);
594                 dev->stats.collisions++;
595                 goto tx_error;
596         }
597
598         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
599                 df = htons(IP_DF);
600         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
601                             key->u.ipv4.dst, true)) {
602                 ip_rt_put(rt);
603                 goto tx_error;
604         }
605
606         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
607         ttl = key->ttl;
608         if (ttl == 0) {
609                 if (skb->protocol == htons(ETH_P_IP))
610                         ttl = inner_iph->ttl;
611                 else if (skb->protocol == htons(ETH_P_IPV6))
612                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
613                 else
614                         ttl = ip4_dst_hoplimit(&rt->dst);
615         }
616
617         if (!df && skb->protocol == htons(ETH_P_IP))
618                 df = inner_iph->frag_off & htons(IP_DF);
619
620         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
621         if (headroom > dev->needed_headroom)
622                 dev->needed_headroom = headroom;
623
624         if (skb_cow_head(skb, dev->needed_headroom)) {
625                 ip_rt_put(rt);
626                 goto tx_dropped;
627         }
628         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
629                       df, !net_eq(tunnel->net, dev_net(dev)));
630         return;
631 tx_error:
632         dev->stats.tx_errors++;
633         goto kfree;
634 tx_dropped:
635         dev->stats.tx_dropped++;
636 kfree:
637         kfree_skb(skb);
638 }
639 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
640
641 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
642                     const struct iphdr *tnl_params, u8 protocol)
643 {
644         struct ip_tunnel *tunnel = netdev_priv(dev);
645         struct ip_tunnel_info *tun_info = NULL;
646         const struct iphdr *inner_iph;
647         unsigned int max_headroom;      /* The extra header space needed */
648         struct rtable *rt = NULL;               /* Route to the other host */
649         bool use_cache = false;
650         struct flowi4 fl4;
651         bool md = false;
652         bool connected;
653         u8 tos, ttl;
654         __be32 dst;
655         __be16 df;
656
657         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
658         connected = (tunnel->parms.iph.daddr != 0);
659
660         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
661
662         dst = tnl_params->daddr;
663         if (dst == 0) {
664                 /* NBMA tunnel */
665
666                 if (!skb_dst(skb)) {
667                         dev->stats.tx_fifo_errors++;
668                         goto tx_error;
669                 }
670
671                 tun_info = skb_tunnel_info(skb);
672                 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
673                     ip_tunnel_info_af(tun_info) == AF_INET &&
674                     tun_info->key.u.ipv4.dst) {
675                         dst = tun_info->key.u.ipv4.dst;
676                         md = true;
677                         connected = true;
678                 }
679                 else if (skb->protocol == htons(ETH_P_IP)) {
680                         rt = skb_rtable(skb);
681                         dst = rt_nexthop(rt, inner_iph->daddr);
682                 }
683 #if IS_ENABLED(CONFIG_IPV6)
684                 else if (skb->protocol == htons(ETH_P_IPV6)) {
685                         const struct in6_addr *addr6;
686                         struct neighbour *neigh;
687                         bool do_tx_error_icmp;
688                         int addr_type;
689
690                         neigh = dst_neigh_lookup(skb_dst(skb),
691                                                  &ipv6_hdr(skb)->daddr);
692                         if (!neigh)
693                                 goto tx_error;
694
695                         addr6 = (const struct in6_addr *)&neigh->primary_key;
696                         addr_type = ipv6_addr_type(addr6);
697
698                         if (addr_type == IPV6_ADDR_ANY) {
699                                 addr6 = &ipv6_hdr(skb)->daddr;
700                                 addr_type = ipv6_addr_type(addr6);
701                         }
702
703                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
704                                 do_tx_error_icmp = true;
705                         else {
706                                 do_tx_error_icmp = false;
707                                 dst = addr6->s6_addr32[3];
708                         }
709                         neigh_release(neigh);
710                         if (do_tx_error_icmp)
711                                 goto tx_error_icmp;
712                 }
713 #endif
714                 else
715                         goto tx_error;
716
717                 if (!md)
718                         connected = false;
719         }
720
721         tos = tnl_params->tos;
722         if (tos & 0x1) {
723                 tos &= ~0x1;
724                 if (skb->protocol == htons(ETH_P_IP)) {
725                         tos = inner_iph->tos;
726                         connected = false;
727                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
728                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
729                         connected = false;
730                 }
731         }
732
733         ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
734                             tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
735                             tunnel->fwmark, skb_get_hash(skb));
736
737         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
738                 goto tx_error;
739
740         if (connected && md) {
741                 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
742                 if (use_cache)
743                         rt = dst_cache_get_ip4(&tun_info->dst_cache,
744                                                &fl4.saddr);
745         } else {
746                 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
747                                                 &fl4.saddr) : NULL;
748         }
749
750         if (!rt) {
751                 rt = ip_route_output_key(tunnel->net, &fl4);
752
753                 if (IS_ERR(rt)) {
754                         dev->stats.tx_carrier_errors++;
755                         goto tx_error;
756                 }
757                 if (use_cache)
758                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
759                                           fl4.saddr);
760                 else if (!md && connected)
761                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
762                                           fl4.saddr);
763         }
764
765         if (rt->dst.dev == dev) {
766                 ip_rt_put(rt);
767                 dev->stats.collisions++;
768                 goto tx_error;
769         }
770
771         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
772                             0, 0, false)) {
773                 ip_rt_put(rt);
774                 goto tx_error;
775         }
776
777         if (tunnel->err_count > 0) {
778                 if (time_before(jiffies,
779                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
780                         tunnel->err_count--;
781
782                         dst_link_failure(skb);
783                 } else
784                         tunnel->err_count = 0;
785         }
786
787         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
788         ttl = tnl_params->ttl;
789         if (ttl == 0) {
790                 if (skb->protocol == htons(ETH_P_IP))
791                         ttl = inner_iph->ttl;
792 #if IS_ENABLED(CONFIG_IPV6)
793                 else if (skb->protocol == htons(ETH_P_IPV6))
794                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
795 #endif
796                 else
797                         ttl = ip4_dst_hoplimit(&rt->dst);
798         }
799
800         df = tnl_params->frag_off;
801         if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
802                 df |= (inner_iph->frag_off&htons(IP_DF));
803
804         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
805                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
806         if (max_headroom > dev->needed_headroom)
807                 dev->needed_headroom = max_headroom;
808
809         if (skb_cow_head(skb, dev->needed_headroom)) {
810                 ip_rt_put(rt);
811                 dev->stats.tx_dropped++;
812                 kfree_skb(skb);
813                 return;
814         }
815
816         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
817                       df, !net_eq(tunnel->net, dev_net(dev)));
818         return;
819
820 #if IS_ENABLED(CONFIG_IPV6)
821 tx_error_icmp:
822         dst_link_failure(skb);
823 #endif
824 tx_error:
825         dev->stats.tx_errors++;
826         kfree_skb(skb);
827 }
828 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
829
830 static void ip_tunnel_update(struct ip_tunnel_net *itn,
831                              struct ip_tunnel *t,
832                              struct net_device *dev,
833                              struct ip_tunnel_parm *p,
834                              bool set_mtu,
835                              __u32 fwmark)
836 {
837         ip_tunnel_del(itn, t);
838         t->parms.iph.saddr = p->iph.saddr;
839         t->parms.iph.daddr = p->iph.daddr;
840         t->parms.i_key = p->i_key;
841         t->parms.o_key = p->o_key;
842         if (dev->type != ARPHRD_ETHER) {
843                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
844                 memcpy(dev->broadcast, &p->iph.daddr, 4);
845         }
846         ip_tunnel_add(itn, t);
847
848         t->parms.iph.ttl = p->iph.ttl;
849         t->parms.iph.tos = p->iph.tos;
850         t->parms.iph.frag_off = p->iph.frag_off;
851
852         if (t->parms.link != p->link || t->fwmark != fwmark) {
853                 int mtu;
854
855                 t->parms.link = p->link;
856                 t->fwmark = fwmark;
857                 mtu = ip_tunnel_bind_dev(dev);
858                 if (set_mtu)
859                         dev->mtu = mtu;
860         }
861         dst_cache_reset(&t->dst_cache);
862         netdev_state_change(dev);
863 }
864
865 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
866 {
867         int err = 0;
868         struct ip_tunnel *t = netdev_priv(dev);
869         struct net *net = t->net;
870         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
871
872         switch (cmd) {
873         case SIOCGETTUNNEL:
874                 if (dev == itn->fb_tunnel_dev) {
875                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
876                         if (!t)
877                                 t = netdev_priv(dev);
878                 }
879                 memcpy(p, &t->parms, sizeof(*p));
880                 break;
881
882         case SIOCADDTUNNEL:
883         case SIOCCHGTUNNEL:
884                 err = -EPERM;
885                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
886                         goto done;
887                 if (p->iph.ttl)
888                         p->iph.frag_off |= htons(IP_DF);
889                 if (!(p->i_flags & VTI_ISVTI)) {
890                         if (!(p->i_flags & TUNNEL_KEY))
891                                 p->i_key = 0;
892                         if (!(p->o_flags & TUNNEL_KEY))
893                                 p->o_key = 0;
894                 }
895
896                 t = ip_tunnel_find(itn, p, itn->type);
897
898                 if (cmd == SIOCADDTUNNEL) {
899                         if (!t) {
900                                 t = ip_tunnel_create(net, itn, p);
901                                 err = PTR_ERR_OR_ZERO(t);
902                                 break;
903                         }
904
905                         err = -EEXIST;
906                         break;
907                 }
908                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
909                         if (t) {
910                                 if (t->dev != dev) {
911                                         err = -EEXIST;
912                                         break;
913                                 }
914                         } else {
915                                 unsigned int nflags = 0;
916
917                                 if (ipv4_is_multicast(p->iph.daddr))
918                                         nflags = IFF_BROADCAST;
919                                 else if (p->iph.daddr)
920                                         nflags = IFF_POINTOPOINT;
921
922                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
923                                         err = -EINVAL;
924                                         break;
925                                 }
926
927                                 t = netdev_priv(dev);
928                         }
929                 }
930
931                 if (t) {
932                         err = 0;
933                         ip_tunnel_update(itn, t, dev, p, true, 0);
934                 } else {
935                         err = -ENOENT;
936                 }
937                 break;
938
939         case SIOCDELTUNNEL:
940                 err = -EPERM;
941                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
942                         goto done;
943
944                 if (dev == itn->fb_tunnel_dev) {
945                         err = -ENOENT;
946                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
947                         if (!t)
948                                 goto done;
949                         err = -EPERM;
950                         if (t == netdev_priv(itn->fb_tunnel_dev))
951                                 goto done;
952                         dev = t->dev;
953                 }
954                 unregister_netdevice(dev);
955                 err = 0;
956                 break;
957
958         default:
959                 err = -EINVAL;
960         }
961
962 done:
963         return err;
964 }
965 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
966
967 int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
968 {
969         struct ip_tunnel_parm p;
970         int err;
971
972         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
973                 return -EFAULT;
974         err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
975         if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
976                 return -EFAULT;
977         return err;
978 }
979 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
980
981 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
982 {
983         struct ip_tunnel *tunnel = netdev_priv(dev);
984         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
985         int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
986
987         if (new_mtu < ETH_MIN_MTU)
988                 return -EINVAL;
989
990         if (new_mtu > max_mtu) {
991                 if (strict)
992                         return -EINVAL;
993
994                 new_mtu = max_mtu;
995         }
996
997         dev->mtu = new_mtu;
998         return 0;
999 }
1000 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
1001
1002 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1003 {
1004         return __ip_tunnel_change_mtu(dev, new_mtu, true);
1005 }
1006 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1007
1008 static void ip_tunnel_dev_free(struct net_device *dev)
1009 {
1010         struct ip_tunnel *tunnel = netdev_priv(dev);
1011
1012         gro_cells_destroy(&tunnel->gro_cells);
1013         dst_cache_destroy(&tunnel->dst_cache);
1014         free_percpu(dev->tstats);
1015 }
1016
1017 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1018 {
1019         struct ip_tunnel *tunnel = netdev_priv(dev);
1020         struct ip_tunnel_net *itn;
1021
1022         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1023
1024         if (itn->fb_tunnel_dev != dev) {
1025                 ip_tunnel_del(itn, netdev_priv(dev));
1026                 unregister_netdevice_queue(dev, head);
1027         }
1028 }
1029 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1030
1031 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1032 {
1033         struct ip_tunnel *tunnel = netdev_priv(dev);
1034
1035         return tunnel->net;
1036 }
1037 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1038
1039 int ip_tunnel_get_iflink(const struct net_device *dev)
1040 {
1041         struct ip_tunnel *tunnel = netdev_priv(dev);
1042
1043         return tunnel->parms.link;
1044 }
1045 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1046
1047 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1048                                   struct rtnl_link_ops *ops, char *devname)
1049 {
1050         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1051         struct ip_tunnel_parm parms;
1052         unsigned int i;
1053
1054         itn->rtnl_link_ops = ops;
1055         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1056                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1057
1058         if (!ops || !net_has_fallback_tunnels(net)) {
1059                 struct ip_tunnel_net *it_init_net;
1060
1061                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1062                 itn->type = it_init_net->type;
1063                 itn->fb_tunnel_dev = NULL;
1064                 return 0;
1065         }
1066
1067         memset(&parms, 0, sizeof(parms));
1068         if (devname)
1069                 strlcpy(parms.name, devname, IFNAMSIZ);
1070
1071         rtnl_lock();
1072         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1073         /* FB netdevice is special: we have one, and only one per netns.
1074          * Allowing to move it to another netns is clearly unsafe.
1075          */
1076         if (!IS_ERR(itn->fb_tunnel_dev)) {
1077                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1078                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1079                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1080                 itn->type = itn->fb_tunnel_dev->type;
1081         }
1082         rtnl_unlock();
1083
1084         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1085 }
1086 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1087
1088 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1089                               struct list_head *head,
1090                               struct rtnl_link_ops *ops)
1091 {
1092         struct net_device *dev, *aux;
1093         int h;
1094
1095         for_each_netdev_safe(net, dev, aux)
1096                 if (dev->rtnl_link_ops == ops)
1097                         unregister_netdevice_queue(dev, head);
1098
1099         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1100                 struct ip_tunnel *t;
1101                 struct hlist_node *n;
1102                 struct hlist_head *thead = &itn->tunnels[h];
1103
1104                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1105                         /* If dev is in the same netns, it has already
1106                          * been added to the list by the previous loop.
1107                          */
1108                         if (!net_eq(dev_net(t->dev), net))
1109                                 unregister_netdevice_queue(t->dev, head);
1110         }
1111 }
1112
1113 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1114                            struct rtnl_link_ops *ops)
1115 {
1116         struct ip_tunnel_net *itn;
1117         struct net *net;
1118         LIST_HEAD(list);
1119
1120         rtnl_lock();
1121         list_for_each_entry(net, net_list, exit_list) {
1122                 itn = net_generic(net, id);
1123                 ip_tunnel_destroy(net, itn, &list, ops);
1124         }
1125         unregister_netdevice_many(&list);
1126         rtnl_unlock();
1127 }
1128 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1129
1130 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1131                       struct ip_tunnel_parm *p, __u32 fwmark)
1132 {
1133         struct ip_tunnel *nt;
1134         struct net *net = dev_net(dev);
1135         struct ip_tunnel_net *itn;
1136         int mtu;
1137         int err;
1138
1139         nt = netdev_priv(dev);
1140         itn = net_generic(net, nt->ip_tnl_net_id);
1141
1142         if (nt->collect_md) {
1143                 if (rtnl_dereference(itn->collect_md_tun))
1144                         return -EEXIST;
1145         } else {
1146                 if (ip_tunnel_find(itn, p, dev->type))
1147                         return -EEXIST;
1148         }
1149
1150         nt->net = net;
1151         nt->parms = *p;
1152         nt->fwmark = fwmark;
1153         err = register_netdevice(dev);
1154         if (err)
1155                 goto err_register_netdevice;
1156
1157         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1158                 eth_hw_addr_random(dev);
1159
1160         mtu = ip_tunnel_bind_dev(dev);
1161         if (tb[IFLA_MTU]) {
1162                 unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1163
1164                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1165                             (unsigned int)(max - sizeof(struct iphdr)));
1166         }
1167
1168         err = dev_set_mtu(dev, mtu);
1169         if (err)
1170                 goto err_dev_set_mtu;
1171
1172         ip_tunnel_add(itn, nt);
1173         return 0;
1174
1175 err_dev_set_mtu:
1176         unregister_netdevice(dev);
1177 err_register_netdevice:
1178         return err;
1179 }
1180 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1181
1182 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1183                          struct ip_tunnel_parm *p, __u32 fwmark)
1184 {
1185         struct ip_tunnel *t;
1186         struct ip_tunnel *tunnel = netdev_priv(dev);
1187         struct net *net = tunnel->net;
1188         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1189
1190         if (dev == itn->fb_tunnel_dev)
1191                 return -EINVAL;
1192
1193         t = ip_tunnel_find(itn, p, dev->type);
1194
1195         if (t) {
1196                 if (t->dev != dev)
1197                         return -EEXIST;
1198         } else {
1199                 t = tunnel;
1200
1201                 if (dev->type != ARPHRD_ETHER) {
1202                         unsigned int nflags = 0;
1203
1204                         if (ipv4_is_multicast(p->iph.daddr))
1205                                 nflags = IFF_BROADCAST;
1206                         else if (p->iph.daddr)
1207                                 nflags = IFF_POINTOPOINT;
1208
1209                         if ((dev->flags ^ nflags) &
1210                             (IFF_POINTOPOINT | IFF_BROADCAST))
1211                                 return -EINVAL;
1212                 }
1213         }
1214
1215         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1216         return 0;
1217 }
1218 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1219
1220 int ip_tunnel_init(struct net_device *dev)
1221 {
1222         struct ip_tunnel *tunnel = netdev_priv(dev);
1223         struct iphdr *iph = &tunnel->parms.iph;
1224         int err;
1225
1226         dev->needs_free_netdev = true;
1227         dev->priv_destructor = ip_tunnel_dev_free;
1228         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1229         if (!dev->tstats)
1230                 return -ENOMEM;
1231
1232         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1233         if (err) {
1234                 free_percpu(dev->tstats);
1235                 return err;
1236         }
1237
1238         err = gro_cells_init(&tunnel->gro_cells, dev);
1239         if (err) {
1240                 dst_cache_destroy(&tunnel->dst_cache);
1241                 free_percpu(dev->tstats);
1242                 return err;
1243         }
1244
1245         tunnel->dev = dev;
1246         tunnel->net = dev_net(dev);
1247         strcpy(tunnel->parms.name, dev->name);
1248         iph->version            = 4;
1249         iph->ihl                = 5;
1250
1251         if (tunnel->collect_md)
1252                 netif_keep_dst(dev);
1253         return 0;
1254 }
1255 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1256
1257 void ip_tunnel_uninit(struct net_device *dev)
1258 {
1259         struct ip_tunnel *tunnel = netdev_priv(dev);
1260         struct net *net = tunnel->net;
1261         struct ip_tunnel_net *itn;
1262
1263         itn = net_generic(net, tunnel->ip_tnl_net_id);
1264         ip_tunnel_del(itn, netdev_priv(dev));
1265         if (itn->fb_tunnel_dev == dev)
1266                 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1267
1268         dst_cache_reset(&tunnel->dst_cache);
1269 }
1270 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1271
1272 /* Do least required initialization, rest of init is done in tunnel_init call */
1273 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1274 {
1275         struct ip_tunnel *tunnel = netdev_priv(dev);
1276         tunnel->ip_tnl_net_id = net_id;
1277 }
1278 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1279
1280 MODULE_LICENSE("GPL");