Merge tag 'for-v5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux...
[linux-2.6-microblaze.git] / net / ipv4 / ip_tunnel.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55         return hash_32((__force u32)key ^ (__force u32)remote,
56                          IP_TNL_HASH_BITS);
57 }
58
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60                                 __be16 flags, __be32 key)
61 {
62         if (p->i_flags & TUNNEL_KEY) {
63                 if (flags & TUNNEL_KEY)
64                         return key == p->i_key;
65                 else
66                         /* key expected, none present */
67                         return false;
68         } else
69                 return !(flags & TUNNEL_KEY);
70 }
71
72 /* Fallback tunnel: no source, no destination, no key, no options
73
74    Tunnel hash table:
75    We require exact key match i.e. if a key is present in packet
76    it will match only tunnel with the same key; if it is not present,
77    it will match only keyless tunnel.
78
79    All keysless packets, if not matched configured keyless tunnels
80    will match fallback tunnel.
81    Given src, dst and key, find appropriate for input tunnel.
82 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84                                    int link, __be16 flags,
85                                    __be32 remote, __be32 local,
86                                    __be32 key)
87 {
88         struct ip_tunnel *t, *cand = NULL;
89         struct hlist_head *head;
90         struct net_device *ndev;
91         unsigned int hash;
92
93         hash = ip_tunnel_hash(key, remote);
94         head = &itn->tunnels[hash];
95
96         hlist_for_each_entry_rcu(t, head, hash_node) {
97                 if (local != t->parms.iph.saddr ||
98                     remote != t->parms.iph.daddr ||
99                     !(t->dev->flags & IFF_UP))
100                         continue;
101
102                 if (!ip_tunnel_key_match(&t->parms, flags, key))
103                         continue;
104
105                 if (t->parms.link == link)
106                         return t;
107                 else
108                         cand = t;
109         }
110
111         hlist_for_each_entry_rcu(t, head, hash_node) {
112                 if (remote != t->parms.iph.daddr ||
113                     t->parms.iph.saddr != 0 ||
114                     !(t->dev->flags & IFF_UP))
115                         continue;
116
117                 if (!ip_tunnel_key_match(&t->parms, flags, key))
118                         continue;
119
120                 if (t->parms.link == link)
121                         return t;
122                 else if (!cand)
123                         cand = t;
124         }
125
126         hash = ip_tunnel_hash(key, 0);
127         head = &itn->tunnels[hash];
128
129         hlist_for_each_entry_rcu(t, head, hash_node) {
130                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
132                         continue;
133
134                 if (!(t->dev->flags & IFF_UP))
135                         continue;
136
137                 if (!ip_tunnel_key_match(&t->parms, flags, key))
138                         continue;
139
140                 if (t->parms.link == link)
141                         return t;
142                 else if (!cand)
143                         cand = t;
144         }
145
146         hlist_for_each_entry_rcu(t, head, hash_node) {
147                 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148                     t->parms.iph.saddr != 0 ||
149                     t->parms.iph.daddr != 0 ||
150                     !(t->dev->flags & IFF_UP))
151                         continue;
152
153                 if (t->parms.link == link)
154                         return t;
155                 else if (!cand)
156                         cand = t;
157         }
158
159         if (cand)
160                 return cand;
161
162         t = rcu_dereference(itn->collect_md_tun);
163         if (t && t->dev->flags & IFF_UP)
164                 return t;
165
166         ndev = READ_ONCE(itn->fb_tunnel_dev);
167         if (ndev && ndev->flags & IFF_UP)
168                 return netdev_priv(ndev);
169
170         return NULL;
171 }
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
173
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175                                     struct ip_tunnel_parm *parms)
176 {
177         unsigned int h;
178         __be32 remote;
179         __be32 i_key = parms->i_key;
180
181         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182                 remote = parms->iph.daddr;
183         else
184                 remote = 0;
185
186         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
187                 i_key = 0;
188
189         h = ip_tunnel_hash(i_key, remote);
190         return &itn->tunnels[h];
191 }
192
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
194 {
195         struct hlist_head *head = ip_bucket(itn, &t->parms);
196
197         if (t->collect_md)
198                 rcu_assign_pointer(itn->collect_md_tun, t);
199         hlist_add_head_rcu(&t->hash_node, head);
200 }
201
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
203 {
204         if (t->collect_md)
205                 rcu_assign_pointer(itn->collect_md_tun, NULL);
206         hlist_del_init_rcu(&t->hash_node);
207 }
208
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210                                         struct ip_tunnel_parm *parms,
211                                         int type)
212 {
213         __be32 remote = parms->iph.daddr;
214         __be32 local = parms->iph.saddr;
215         __be32 key = parms->i_key;
216         __be16 flags = parms->i_flags;
217         int link = parms->link;
218         struct ip_tunnel *t = NULL;
219         struct hlist_head *head = ip_bucket(itn, parms);
220
221         hlist_for_each_entry_rcu(t, head, hash_node) {
222                 if (local == t->parms.iph.saddr &&
223                     remote == t->parms.iph.daddr &&
224                     link == t->parms.link &&
225                     type == t->dev->type &&
226                     ip_tunnel_key_match(&t->parms, flags, key))
227                         break;
228         }
229         return t;
230 }
231
232 static struct net_device *__ip_tunnel_create(struct net *net,
233                                              const struct rtnl_link_ops *ops,
234                                              struct ip_tunnel_parm *parms)
235 {
236         int err;
237         struct ip_tunnel *tunnel;
238         struct net_device *dev;
239         char name[IFNAMSIZ];
240
241         err = -E2BIG;
242         if (parms->name[0]) {
243                 if (!dev_valid_name(parms->name))
244                         goto failed;
245                 strlcpy(name, parms->name, IFNAMSIZ);
246         } else {
247                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
248                         goto failed;
249                 strcpy(name, ops->kind);
250                 strcat(name, "%d");
251         }
252
253         ASSERT_RTNL();
254         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
255         if (!dev) {
256                 err = -ENOMEM;
257                 goto failed;
258         }
259         dev_net_set(dev, net);
260
261         dev->rtnl_link_ops = ops;
262
263         tunnel = netdev_priv(dev);
264         tunnel->parms = *parms;
265         tunnel->net = net;
266
267         err = register_netdevice(dev);
268         if (err)
269                 goto failed_free;
270
271         return dev;
272
273 failed_free:
274         free_netdev(dev);
275 failed:
276         return ERR_PTR(err);
277 }
278
279 static int ip_tunnel_bind_dev(struct net_device *dev)
280 {
281         struct net_device *tdev = NULL;
282         struct ip_tunnel *tunnel = netdev_priv(dev);
283         const struct iphdr *iph;
284         int hlen = LL_MAX_HEADER;
285         int mtu = ETH_DATA_LEN;
286         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
287
288         iph = &tunnel->parms.iph;
289
290         /* Guess output device to choose reasonable mtu and needed_headroom */
291         if (iph->daddr) {
292                 struct flowi4 fl4;
293                 struct rtable *rt;
294
295                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296                                     iph->saddr, tunnel->parms.o_key,
297                                     RT_TOS(iph->tos), tunnel->parms.link,
298                                     tunnel->fwmark, 0);
299                 rt = ip_route_output_key(tunnel->net, &fl4);
300
301                 if (!IS_ERR(rt)) {
302                         tdev = rt->dst.dev;
303                         ip_rt_put(rt);
304                 }
305                 if (dev->type != ARPHRD_ETHER)
306                         dev->flags |= IFF_POINTOPOINT;
307
308                 dst_cache_reset(&tunnel->dst_cache);
309         }
310
311         if (!tdev && tunnel->parms.link)
312                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
313
314         if (tdev) {
315                 hlen = tdev->hard_header_len + tdev->needed_headroom;
316                 mtu = min(tdev->mtu, IP_MAX_MTU);
317         }
318
319         dev->needed_headroom = t_hlen + hlen;
320         mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
321
322         if (mtu < IPV4_MIN_MTU)
323                 mtu = IPV4_MIN_MTU;
324
325         return mtu;
326 }
327
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329                                           struct ip_tunnel_net *itn,
330                                           struct ip_tunnel_parm *parms)
331 {
332         struct ip_tunnel *nt;
333         struct net_device *dev;
334         int t_hlen;
335         int mtu;
336         int err;
337
338         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
339         if (IS_ERR(dev))
340                 return ERR_CAST(dev);
341
342         mtu = ip_tunnel_bind_dev(dev);
343         err = dev_set_mtu(dev, mtu);
344         if (err)
345                 goto err_dev_set_mtu;
346
347         nt = netdev_priv(dev);
348         t_hlen = nt->hlen + sizeof(struct iphdr);
349         dev->min_mtu = ETH_MIN_MTU;
350         dev->max_mtu = IP_MAX_MTU - t_hlen;
351         if (dev->type == ARPHRD_ETHER)
352                 dev->max_mtu -= dev->hard_header_len;
353
354         ip_tunnel_add(itn, nt);
355         return nt;
356
357 err_dev_set_mtu:
358         unregister_netdevice(dev);
359         return ERR_PTR(err);
360 }
361
362 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
363                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
364                   bool log_ecn_error)
365 {
366         const struct iphdr *iph = ip_hdr(skb);
367         int err;
368
369 #ifdef CONFIG_NET_IPGRE_BROADCAST
370         if (ipv4_is_multicast(iph->daddr)) {
371                 tunnel->dev->stats.multicast++;
372                 skb->pkt_type = PACKET_BROADCAST;
373         }
374 #endif
375
376         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
377              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
378                 tunnel->dev->stats.rx_crc_errors++;
379                 tunnel->dev->stats.rx_errors++;
380                 goto drop;
381         }
382
383         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
384                 if (!(tpi->flags&TUNNEL_SEQ) ||
385                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
386                         tunnel->dev->stats.rx_fifo_errors++;
387                         tunnel->dev->stats.rx_errors++;
388                         goto drop;
389                 }
390                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
391         }
392
393         skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
394
395         err = IP_ECN_decapsulate(iph, skb);
396         if (unlikely(err)) {
397                 if (log_ecn_error)
398                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
399                                         &iph->saddr, iph->tos);
400                 if (err > 1) {
401                         ++tunnel->dev->stats.rx_frame_errors;
402                         ++tunnel->dev->stats.rx_errors;
403                         goto drop;
404                 }
405         }
406
407         dev_sw_netstats_rx_add(tunnel->dev, skb->len);
408         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
409
410         if (tunnel->dev->type == ARPHRD_ETHER) {
411                 skb->protocol = eth_type_trans(skb, tunnel->dev);
412                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
413         } else {
414                 skb->dev = tunnel->dev;
415         }
416
417         if (tun_dst)
418                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
419
420         gro_cells_receive(&tunnel->gro_cells, skb);
421         return 0;
422
423 drop:
424         if (tun_dst)
425                 dst_release((struct dst_entry *)tun_dst);
426         kfree_skb(skb);
427         return 0;
428 }
429 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
430
431 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
432                             unsigned int num)
433 {
434         if (num >= MAX_IPTUN_ENCAP_OPS)
435                 return -ERANGE;
436
437         return !cmpxchg((const struct ip_tunnel_encap_ops **)
438                         &iptun_encaps[num],
439                         NULL, ops) ? 0 : -1;
440 }
441 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
442
443 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
444                             unsigned int num)
445 {
446         int ret;
447
448         if (num >= MAX_IPTUN_ENCAP_OPS)
449                 return -ERANGE;
450
451         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
452                        &iptun_encaps[num],
453                        ops, NULL) == ops) ? 0 : -1;
454
455         synchronize_net();
456
457         return ret;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
460
461 int ip_tunnel_encap_setup(struct ip_tunnel *t,
462                           struct ip_tunnel_encap *ipencap)
463 {
464         int hlen;
465
466         memset(&t->encap, 0, sizeof(t->encap));
467
468         hlen = ip_encap_hlen(ipencap);
469         if (hlen < 0)
470                 return hlen;
471
472         t->encap.type = ipencap->type;
473         t->encap.sport = ipencap->sport;
474         t->encap.dport = ipencap->dport;
475         t->encap.flags = ipencap->flags;
476
477         t->encap_hlen = hlen;
478         t->hlen = t->encap_hlen + t->tun_hlen;
479
480         return 0;
481 }
482 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
483
484 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
485                             struct rtable *rt, __be16 df,
486                             const struct iphdr *inner_iph,
487                             int tunnel_hlen, __be32 dst, bool md)
488 {
489         struct ip_tunnel *tunnel = netdev_priv(dev);
490         int pkt_size;
491         int mtu;
492
493         tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
494         pkt_size = skb->len - tunnel_hlen;
495         pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
496
497         if (df) {
498                 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
499                 mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
500         } else {
501                 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
502         }
503
504         if (skb_valid_dst(skb))
505                 skb_dst_update_pmtu_no_confirm(skb, mtu);
506
507         if (skb->protocol == htons(ETH_P_IP)) {
508                 if (!skb_is_gso(skb) &&
509                     (inner_iph->frag_off & htons(IP_DF)) &&
510                     mtu < pkt_size) {
511                         icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
512                         return -E2BIG;
513                 }
514         }
515 #if IS_ENABLED(CONFIG_IPV6)
516         else if (skb->protocol == htons(ETH_P_IPV6)) {
517                 struct rt6_info *rt6;
518                 __be32 daddr;
519
520                 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
521                                            NULL;
522                 daddr = md ? dst : tunnel->parms.iph.daddr;
523
524                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
525                            mtu >= IPV6_MIN_MTU) {
526                         if ((daddr && !ipv4_is_multicast(daddr)) ||
527                             rt6->rt6i_dst.plen == 128) {
528                                 rt6->rt6i_flags |= RTF_MODIFIED;
529                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
530                         }
531                 }
532
533                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
534                                         mtu < pkt_size) {
535                         icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
536                         return -E2BIG;
537                 }
538         }
539 #endif
540         return 0;
541 }
542
543 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
544                        u8 proto, int tunnel_hlen)
545 {
546         struct ip_tunnel *tunnel = netdev_priv(dev);
547         u32 headroom = sizeof(struct iphdr);
548         struct ip_tunnel_info *tun_info;
549         const struct ip_tunnel_key *key;
550         const struct iphdr *inner_iph;
551         struct rtable *rt = NULL;
552         struct flowi4 fl4;
553         __be16 df = 0;
554         u8 tos, ttl;
555         bool use_cache;
556
557         tun_info = skb_tunnel_info(skb);
558         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
559                      ip_tunnel_info_af(tun_info) != AF_INET))
560                 goto tx_error;
561         key = &tun_info->key;
562         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
563         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
564         tos = key->tos;
565         if (tos == 1) {
566                 if (skb->protocol == htons(ETH_P_IP))
567                         tos = inner_iph->tos;
568                 else if (skb->protocol == htons(ETH_P_IPV6))
569                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
570         }
571         ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
572                             tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
573                             0, skb->mark, skb_get_hash(skb));
574         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
575                 goto tx_error;
576
577         use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
578         if (use_cache)
579                 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
580         if (!rt) {
581                 rt = ip_route_output_key(tunnel->net, &fl4);
582                 if (IS_ERR(rt)) {
583                         dev->stats.tx_carrier_errors++;
584                         goto tx_error;
585                 }
586                 if (use_cache)
587                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
588                                           fl4.saddr);
589         }
590         if (rt->dst.dev == dev) {
591                 ip_rt_put(rt);
592                 dev->stats.collisions++;
593                 goto tx_error;
594         }
595
596         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
597                 df = htons(IP_DF);
598         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
599                             key->u.ipv4.dst, true)) {
600                 ip_rt_put(rt);
601                 goto tx_error;
602         }
603
604         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
605         ttl = key->ttl;
606         if (ttl == 0) {
607                 if (skb->protocol == htons(ETH_P_IP))
608                         ttl = inner_iph->ttl;
609                 else if (skb->protocol == htons(ETH_P_IPV6))
610                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
611                 else
612                         ttl = ip4_dst_hoplimit(&rt->dst);
613         }
614
615         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
616         if (headroom > dev->needed_headroom)
617                 dev->needed_headroom = headroom;
618
619         if (skb_cow_head(skb, dev->needed_headroom)) {
620                 ip_rt_put(rt);
621                 goto tx_dropped;
622         }
623         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
624                       df, !net_eq(tunnel->net, dev_net(dev)));
625         return;
626 tx_error:
627         dev->stats.tx_errors++;
628         goto kfree;
629 tx_dropped:
630         dev->stats.tx_dropped++;
631 kfree:
632         kfree_skb(skb);
633 }
634 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
635
636 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
637                     const struct iphdr *tnl_params, u8 protocol)
638 {
639         struct ip_tunnel *tunnel = netdev_priv(dev);
640         struct ip_tunnel_info *tun_info = NULL;
641         const struct iphdr *inner_iph;
642         unsigned int max_headroom;      /* The extra header space needed */
643         struct rtable *rt = NULL;               /* Route to the other host */
644         bool use_cache = false;
645         struct flowi4 fl4;
646         bool md = false;
647         bool connected;
648         u8 tos, ttl;
649         __be32 dst;
650         __be16 df;
651
652         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
653         connected = (tunnel->parms.iph.daddr != 0);
654
655         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
656
657         dst = tnl_params->daddr;
658         if (dst == 0) {
659                 /* NBMA tunnel */
660
661                 if (!skb_dst(skb)) {
662                         dev->stats.tx_fifo_errors++;
663                         goto tx_error;
664                 }
665
666                 tun_info = skb_tunnel_info(skb);
667                 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
668                     ip_tunnel_info_af(tun_info) == AF_INET &&
669                     tun_info->key.u.ipv4.dst) {
670                         dst = tun_info->key.u.ipv4.dst;
671                         md = true;
672                         connected = true;
673                 }
674                 else if (skb->protocol == htons(ETH_P_IP)) {
675                         rt = skb_rtable(skb);
676                         dst = rt_nexthop(rt, inner_iph->daddr);
677                 }
678 #if IS_ENABLED(CONFIG_IPV6)
679                 else if (skb->protocol == htons(ETH_P_IPV6)) {
680                         const struct in6_addr *addr6;
681                         struct neighbour *neigh;
682                         bool do_tx_error_icmp;
683                         int addr_type;
684
685                         neigh = dst_neigh_lookup(skb_dst(skb),
686                                                  &ipv6_hdr(skb)->daddr);
687                         if (!neigh)
688                                 goto tx_error;
689
690                         addr6 = (const struct in6_addr *)&neigh->primary_key;
691                         addr_type = ipv6_addr_type(addr6);
692
693                         if (addr_type == IPV6_ADDR_ANY) {
694                                 addr6 = &ipv6_hdr(skb)->daddr;
695                                 addr_type = ipv6_addr_type(addr6);
696                         }
697
698                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
699                                 do_tx_error_icmp = true;
700                         else {
701                                 do_tx_error_icmp = false;
702                                 dst = addr6->s6_addr32[3];
703                         }
704                         neigh_release(neigh);
705                         if (do_tx_error_icmp)
706                                 goto tx_error_icmp;
707                 }
708 #endif
709                 else
710                         goto tx_error;
711
712                 if (!md)
713                         connected = false;
714         }
715
716         tos = tnl_params->tos;
717         if (tos & 0x1) {
718                 tos &= ~0x1;
719                 if (skb->protocol == htons(ETH_P_IP)) {
720                         tos = inner_iph->tos;
721                         connected = false;
722                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
723                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
724                         connected = false;
725                 }
726         }
727
728         ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
729                             tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
730                             tunnel->fwmark, skb_get_hash(skb));
731
732         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
733                 goto tx_error;
734
735         if (connected && md) {
736                 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
737                 if (use_cache)
738                         rt = dst_cache_get_ip4(&tun_info->dst_cache,
739                                                &fl4.saddr);
740         } else {
741                 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
742                                                 &fl4.saddr) : NULL;
743         }
744
745         if (!rt) {
746                 rt = ip_route_output_key(tunnel->net, &fl4);
747
748                 if (IS_ERR(rt)) {
749                         dev->stats.tx_carrier_errors++;
750                         goto tx_error;
751                 }
752                 if (use_cache)
753                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
754                                           fl4.saddr);
755                 else if (!md && connected)
756                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
757                                           fl4.saddr);
758         }
759
760         if (rt->dst.dev == dev) {
761                 ip_rt_put(rt);
762                 dev->stats.collisions++;
763                 goto tx_error;
764         }
765
766         df = tnl_params->frag_off;
767         if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
768                 df |= (inner_iph->frag_off & htons(IP_DF));
769
770         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
771                 ip_rt_put(rt);
772                 goto tx_error;
773         }
774
775         if (tunnel->err_count > 0) {
776                 if (time_before(jiffies,
777                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
778                         tunnel->err_count--;
779
780                         dst_link_failure(skb);
781                 } else
782                         tunnel->err_count = 0;
783         }
784
785         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
786         ttl = tnl_params->ttl;
787         if (ttl == 0) {
788                 if (skb->protocol == htons(ETH_P_IP))
789                         ttl = inner_iph->ttl;
790 #if IS_ENABLED(CONFIG_IPV6)
791                 else if (skb->protocol == htons(ETH_P_IPV6))
792                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
793 #endif
794                 else
795                         ttl = ip4_dst_hoplimit(&rt->dst);
796         }
797
798         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
799                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
800         if (max_headroom > dev->needed_headroom)
801                 dev->needed_headroom = max_headroom;
802
803         if (skb_cow_head(skb, dev->needed_headroom)) {
804                 ip_rt_put(rt);
805                 dev->stats.tx_dropped++;
806                 kfree_skb(skb);
807                 return;
808         }
809
810         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
811                       df, !net_eq(tunnel->net, dev_net(dev)));
812         return;
813
814 #if IS_ENABLED(CONFIG_IPV6)
815 tx_error_icmp:
816         dst_link_failure(skb);
817 #endif
818 tx_error:
819         dev->stats.tx_errors++;
820         kfree_skb(skb);
821 }
822 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
823
824 static void ip_tunnel_update(struct ip_tunnel_net *itn,
825                              struct ip_tunnel *t,
826                              struct net_device *dev,
827                              struct ip_tunnel_parm *p,
828                              bool set_mtu,
829                              __u32 fwmark)
830 {
831         ip_tunnel_del(itn, t);
832         t->parms.iph.saddr = p->iph.saddr;
833         t->parms.iph.daddr = p->iph.daddr;
834         t->parms.i_key = p->i_key;
835         t->parms.o_key = p->o_key;
836         if (dev->type != ARPHRD_ETHER) {
837                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
838                 memcpy(dev->broadcast, &p->iph.daddr, 4);
839         }
840         ip_tunnel_add(itn, t);
841
842         t->parms.iph.ttl = p->iph.ttl;
843         t->parms.iph.tos = p->iph.tos;
844         t->parms.iph.frag_off = p->iph.frag_off;
845
846         if (t->parms.link != p->link || t->fwmark != fwmark) {
847                 int mtu;
848
849                 t->parms.link = p->link;
850                 t->fwmark = fwmark;
851                 mtu = ip_tunnel_bind_dev(dev);
852                 if (set_mtu)
853                         dev->mtu = mtu;
854         }
855         dst_cache_reset(&t->dst_cache);
856         netdev_state_change(dev);
857 }
858
859 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
860 {
861         int err = 0;
862         struct ip_tunnel *t = netdev_priv(dev);
863         struct net *net = t->net;
864         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
865
866         switch (cmd) {
867         case SIOCGETTUNNEL:
868                 if (dev == itn->fb_tunnel_dev) {
869                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
870                         if (!t)
871                                 t = netdev_priv(dev);
872                 }
873                 memcpy(p, &t->parms, sizeof(*p));
874                 break;
875
876         case SIOCADDTUNNEL:
877         case SIOCCHGTUNNEL:
878                 err = -EPERM;
879                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
880                         goto done;
881                 if (p->iph.ttl)
882                         p->iph.frag_off |= htons(IP_DF);
883                 if (!(p->i_flags & VTI_ISVTI)) {
884                         if (!(p->i_flags & TUNNEL_KEY))
885                                 p->i_key = 0;
886                         if (!(p->o_flags & TUNNEL_KEY))
887                                 p->o_key = 0;
888                 }
889
890                 t = ip_tunnel_find(itn, p, itn->type);
891
892                 if (cmd == SIOCADDTUNNEL) {
893                         if (!t) {
894                                 t = ip_tunnel_create(net, itn, p);
895                                 err = PTR_ERR_OR_ZERO(t);
896                                 break;
897                         }
898
899                         err = -EEXIST;
900                         break;
901                 }
902                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
903                         if (t) {
904                                 if (t->dev != dev) {
905                                         err = -EEXIST;
906                                         break;
907                                 }
908                         } else {
909                                 unsigned int nflags = 0;
910
911                                 if (ipv4_is_multicast(p->iph.daddr))
912                                         nflags = IFF_BROADCAST;
913                                 else if (p->iph.daddr)
914                                         nflags = IFF_POINTOPOINT;
915
916                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
917                                         err = -EINVAL;
918                                         break;
919                                 }
920
921                                 t = netdev_priv(dev);
922                         }
923                 }
924
925                 if (t) {
926                         err = 0;
927                         ip_tunnel_update(itn, t, dev, p, true, 0);
928                 } else {
929                         err = -ENOENT;
930                 }
931                 break;
932
933         case SIOCDELTUNNEL:
934                 err = -EPERM;
935                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
936                         goto done;
937
938                 if (dev == itn->fb_tunnel_dev) {
939                         err = -ENOENT;
940                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
941                         if (!t)
942                                 goto done;
943                         err = -EPERM;
944                         if (t == netdev_priv(itn->fb_tunnel_dev))
945                                 goto done;
946                         dev = t->dev;
947                 }
948                 unregister_netdevice(dev);
949                 err = 0;
950                 break;
951
952         default:
953                 err = -EINVAL;
954         }
955
956 done:
957         return err;
958 }
959 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
960
961 int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
962 {
963         struct ip_tunnel_parm p;
964         int err;
965
966         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
967                 return -EFAULT;
968         err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
969         if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
970                 return -EFAULT;
971         return err;
972 }
973 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
974
975 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
976 {
977         struct ip_tunnel *tunnel = netdev_priv(dev);
978         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
979         int max_mtu = IP_MAX_MTU - t_hlen;
980
981         if (dev->type == ARPHRD_ETHER)
982                 max_mtu -= dev->hard_header_len;
983
984         if (new_mtu < ETH_MIN_MTU)
985                 return -EINVAL;
986
987         if (new_mtu > max_mtu) {
988                 if (strict)
989                         return -EINVAL;
990
991                 new_mtu = max_mtu;
992         }
993
994         dev->mtu = new_mtu;
995         return 0;
996 }
997 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
998
999 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1000 {
1001         return __ip_tunnel_change_mtu(dev, new_mtu, true);
1002 }
1003 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1004
1005 static void ip_tunnel_dev_free(struct net_device *dev)
1006 {
1007         struct ip_tunnel *tunnel = netdev_priv(dev);
1008
1009         gro_cells_destroy(&tunnel->gro_cells);
1010         dst_cache_destroy(&tunnel->dst_cache);
1011         free_percpu(dev->tstats);
1012 }
1013
1014 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1015 {
1016         struct ip_tunnel *tunnel = netdev_priv(dev);
1017         struct ip_tunnel_net *itn;
1018
1019         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1020
1021         if (itn->fb_tunnel_dev != dev) {
1022                 ip_tunnel_del(itn, netdev_priv(dev));
1023                 unregister_netdevice_queue(dev, head);
1024         }
1025 }
1026 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1027
1028 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1029 {
1030         struct ip_tunnel *tunnel = netdev_priv(dev);
1031
1032         return tunnel->net;
1033 }
1034 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1035
1036 int ip_tunnel_get_iflink(const struct net_device *dev)
1037 {
1038         struct ip_tunnel *tunnel = netdev_priv(dev);
1039
1040         return tunnel->parms.link;
1041 }
1042 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1043
1044 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1045                                   struct rtnl_link_ops *ops, char *devname)
1046 {
1047         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1048         struct ip_tunnel_parm parms;
1049         unsigned int i;
1050
1051         itn->rtnl_link_ops = ops;
1052         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1053                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1054
1055         if (!ops || !net_has_fallback_tunnels(net)) {
1056                 struct ip_tunnel_net *it_init_net;
1057
1058                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1059                 itn->type = it_init_net->type;
1060                 itn->fb_tunnel_dev = NULL;
1061                 return 0;
1062         }
1063
1064         memset(&parms, 0, sizeof(parms));
1065         if (devname)
1066                 strlcpy(parms.name, devname, IFNAMSIZ);
1067
1068         rtnl_lock();
1069         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1070         /* FB netdevice is special: we have one, and only one per netns.
1071          * Allowing to move it to another netns is clearly unsafe.
1072          */
1073         if (!IS_ERR(itn->fb_tunnel_dev)) {
1074                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1075                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1076                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1077                 itn->type = itn->fb_tunnel_dev->type;
1078         }
1079         rtnl_unlock();
1080
1081         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1082 }
1083 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1084
1085 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1086                               struct list_head *head,
1087                               struct rtnl_link_ops *ops)
1088 {
1089         struct net_device *dev, *aux;
1090         int h;
1091
1092         for_each_netdev_safe(net, dev, aux)
1093                 if (dev->rtnl_link_ops == ops)
1094                         unregister_netdevice_queue(dev, head);
1095
1096         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1097                 struct ip_tunnel *t;
1098                 struct hlist_node *n;
1099                 struct hlist_head *thead = &itn->tunnels[h];
1100
1101                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1102                         /* If dev is in the same netns, it has already
1103                          * been added to the list by the previous loop.
1104                          */
1105                         if (!net_eq(dev_net(t->dev), net))
1106                                 unregister_netdevice_queue(t->dev, head);
1107         }
1108 }
1109
1110 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1111                            struct rtnl_link_ops *ops)
1112 {
1113         struct ip_tunnel_net *itn;
1114         struct net *net;
1115         LIST_HEAD(list);
1116
1117         rtnl_lock();
1118         list_for_each_entry(net, net_list, exit_list) {
1119                 itn = net_generic(net, id);
1120                 ip_tunnel_destroy(net, itn, &list, ops);
1121         }
1122         unregister_netdevice_many(&list);
1123         rtnl_unlock();
1124 }
1125 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1126
1127 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1128                       struct ip_tunnel_parm *p, __u32 fwmark)
1129 {
1130         struct ip_tunnel *nt;
1131         struct net *net = dev_net(dev);
1132         struct ip_tunnel_net *itn;
1133         int mtu;
1134         int err;
1135
1136         nt = netdev_priv(dev);
1137         itn = net_generic(net, nt->ip_tnl_net_id);
1138
1139         if (nt->collect_md) {
1140                 if (rtnl_dereference(itn->collect_md_tun))
1141                         return -EEXIST;
1142         } else {
1143                 if (ip_tunnel_find(itn, p, dev->type))
1144                         return -EEXIST;
1145         }
1146
1147         nt->net = net;
1148         nt->parms = *p;
1149         nt->fwmark = fwmark;
1150         err = register_netdevice(dev);
1151         if (err)
1152                 goto err_register_netdevice;
1153
1154         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1155                 eth_hw_addr_random(dev);
1156
1157         mtu = ip_tunnel_bind_dev(dev);
1158         if (tb[IFLA_MTU]) {
1159                 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1160
1161                 if (dev->type == ARPHRD_ETHER)
1162                         max -= dev->hard_header_len;
1163
1164                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1165         }
1166
1167         err = dev_set_mtu(dev, mtu);
1168         if (err)
1169                 goto err_dev_set_mtu;
1170
1171         ip_tunnel_add(itn, nt);
1172         return 0;
1173
1174 err_dev_set_mtu:
1175         unregister_netdevice(dev);
1176 err_register_netdevice:
1177         return err;
1178 }
1179 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1180
1181 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1182                          struct ip_tunnel_parm *p, __u32 fwmark)
1183 {
1184         struct ip_tunnel *t;
1185         struct ip_tunnel *tunnel = netdev_priv(dev);
1186         struct net *net = tunnel->net;
1187         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1188
1189         if (dev == itn->fb_tunnel_dev)
1190                 return -EINVAL;
1191
1192         t = ip_tunnel_find(itn, p, dev->type);
1193
1194         if (t) {
1195                 if (t->dev != dev)
1196                         return -EEXIST;
1197         } else {
1198                 t = tunnel;
1199
1200                 if (dev->type != ARPHRD_ETHER) {
1201                         unsigned int nflags = 0;
1202
1203                         if (ipv4_is_multicast(p->iph.daddr))
1204                                 nflags = IFF_BROADCAST;
1205                         else if (p->iph.daddr)
1206                                 nflags = IFF_POINTOPOINT;
1207
1208                         if ((dev->flags ^ nflags) &
1209                             (IFF_POINTOPOINT | IFF_BROADCAST))
1210                                 return -EINVAL;
1211                 }
1212         }
1213
1214         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1215         return 0;
1216 }
1217 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1218
1219 int ip_tunnel_init(struct net_device *dev)
1220 {
1221         struct ip_tunnel *tunnel = netdev_priv(dev);
1222         struct iphdr *iph = &tunnel->parms.iph;
1223         int err;
1224
1225         dev->needs_free_netdev = true;
1226         dev->priv_destructor = ip_tunnel_dev_free;
1227         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1228         if (!dev->tstats)
1229                 return -ENOMEM;
1230
1231         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1232         if (err) {
1233                 free_percpu(dev->tstats);
1234                 return err;
1235         }
1236
1237         err = gro_cells_init(&tunnel->gro_cells, dev);
1238         if (err) {
1239                 dst_cache_destroy(&tunnel->dst_cache);
1240                 free_percpu(dev->tstats);
1241                 return err;
1242         }
1243
1244         tunnel->dev = dev;
1245         tunnel->net = dev_net(dev);
1246         strcpy(tunnel->parms.name, dev->name);
1247         iph->version            = 4;
1248         iph->ihl                = 5;
1249
1250         if (tunnel->collect_md)
1251                 netif_keep_dst(dev);
1252         return 0;
1253 }
1254 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1255
1256 void ip_tunnel_uninit(struct net_device *dev)
1257 {
1258         struct ip_tunnel *tunnel = netdev_priv(dev);
1259         struct net *net = tunnel->net;
1260         struct ip_tunnel_net *itn;
1261
1262         itn = net_generic(net, tunnel->ip_tnl_net_id);
1263         ip_tunnel_del(itn, netdev_priv(dev));
1264         if (itn->fb_tunnel_dev == dev)
1265                 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1266
1267         dst_cache_reset(&tunnel->dst_cache);
1268 }
1269 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1270
1271 /* Do least required initialization, rest of init is done in tunnel_init call */
1272 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1273 {
1274         struct ip_tunnel *tunnel = netdev_priv(dev);
1275         tunnel->ip_tnl_net_id = net_id;
1276 }
1277 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1278
1279 MODULE_LICENSE("GPL");