net/ipv4/ip_tunnel.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

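/* Fold the tunnel key and the remote (outer destination) address into an
 * IP_TNL_HASH_BITS-wide bucket index.
 */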
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

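/* A keyed tunnel matches only packets carrying the same key; a keyless
 * tunnel matches only packets that carry no key at all.
 */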
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;
        struct net_device *ndev;
        unsigned int hash;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

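        /* First pass: exact match on both local and remote addresses. */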
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

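        /* Second pass: match on remote address only (wildcard source). */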
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

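        /* Third pass (rehashed with remote == 0): match on local address
         * only, or on a multicast destination address.
         */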
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

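        /* Fourth pass: wildcard tunnels with neither address set, matched
         * on key alone.
         */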
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t && t->dev->flags & IFF_UP)
                return t;

        ndev = READ_ONCE(itn->fb_tunnel_dev);
        if (ndev && ndev->flags & IFF_UP)
                return netdev_priv(ndev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

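/* Pick the hash bucket a tunnel with these parameters lives in.  A
 * multicast destination hashes like a wildcard one, and VTI tunnels
 * without TUNNEL_KEY ignore i_key.
 */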
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, t);
        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        __be16 flags = parms->i_flags;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }
        return t;
}

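/* Allocate and register a tunnel netdevice.  If no name was supplied,
 * derive one from the rtnl_link_ops kind plus a "%d" template.
 */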
static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        err = -E2BIG;
        if (parms->name[0]) {
                if (!dev_valid_name(parms->name))
                        goto failed;
                strlcpy(name, parms->name, IFNAMSIZ);
        } else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3))
                        goto failed;
                strcpy(name, ops->kind);
                strcat(name, "%d");
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

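/* Resolve the underlying output device for a configured destination and
 * derive the tunnel's needed_headroom and a usable MTU from it.
 */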
static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
                                    iph->saddr, tunnel->parms.o_key,
                                    RT_TOS(iph->tos), tunnel->parms.link,
                                    tunnel->fwmark, 0);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                dst_cache_reset(&tunnel->dst_cache);
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = min(tdev->mtu, IP_MAX_MTU);
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= (dev->hard_header_len + t_hlen);

        if (mtu < IPV4_MIN_MTU)
                mtu = IPV4_MIN_MTU;

        return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt;
        struct net_device *dev;
        int t_hlen;
        int mtu;
        int err;

        dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        mtu = ip_tunnel_bind_dev(dev);
        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        nt = netdev_priv(dev);
        t_hlen = nt->hlen + sizeof(struct iphdr);
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
        ip_tunnel_add(itn, nt);
        return nt;

err_dev_set_mtu:
        unregister_netdevice(dev);
        return ERR_PTR(err);
}

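/* Common receive path: validate checksum and sequence-number flags
 * against the tunnel configuration, decapsulate ECN, update stats and
 * hand the inner packet to the GRO cell.
 */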
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        dev_sw_netstats_rx_add(tunnel->dev, skb->len);
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        if (tun_dst)
                dst_release((struct dst_entry *)tun_dst);
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

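/* Check the (non-GSO) packet against the path MTU of the outer route,
 * update the skb's dst PMTU, and when the packet does not fit report
 * ICMP "fragmentation needed" or ICMPv6 "packet too big" to the sender.
 */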
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph,
                            int tunnel_hlen, __be32 dst, bool md)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size;
        int mtu;

        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel_hlen;
        else
                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_valid_dst(skb))
                skb_dst_update_pmtu_no_confirm(skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6;
                __be32 daddr;

                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
                                           NULL;
                daddr = md ? dst : tunnel->parms.iph.daddr;

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((daddr && !ipv4_is_multicast(daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

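/* Transmit path for metadata-based (collect_md) tunnels: all addressing
 * comes from the per-skb tunnel info rather than from tunnel->parms.
 * Tunnels with an encapsulation configured are rejected here.
 */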
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       u8 proto, int tunnel_hlen)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;
        bool use_cache;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        tos = key->tos;
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
                            0, skb->mark, skb_get_hash(skb));
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;

        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);
                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
        }
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
                            key->u.ipv4.dst, true)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = key->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > dev->needed_headroom)
                dev->needed_headroom = headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

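/* Transmit path for classical tunnels.  An unset destination (NBMA mode)
 * is resolved from tunnel metadata, the skb's IPv4 route nexthop, or an
 * IPv6 neighbour entry before the outer route lookup.
 */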
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;               /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                }
                else if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                if (!md)
                        connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
                            0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off&htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

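/* Apply new parameters to an existing tunnel, rehashing it since the
 * addresses or key may have changed, and rebind if link or fwmark moved.
 */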
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu,
                             __u32 fwmark)
{
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;

                t->parms.link = p->link;
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}

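/* Backend for the SIOC{GET,ADD,CHG,DEL}TUNNEL ioctls, shared by tunnel
 * drivers (typically wired up as their ndo_tunnel_ctl).
 */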
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true, 0);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
        struct ip_tunnel_parm p;
        int err;

        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                return -EFAULT;
        err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
        if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                return -EFAULT;
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

        if (new_mtu < ETH_MIN_MTU)
                return -EINVAL;

        if (new_mtu > max_mtu) {
                if (strict)
                        return -EINVAL;

                new_mtu = max_mtu;
        }

        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        dst_cache_destroy(&tunnel->dst_cache);
        free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(itn, netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

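/* Per-netns init: set up the hash table and, unless fallback tunnels are
 * disabled for this netns, create the netns-local fallback device.
 */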
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        itn->rtnl_link_ops = ops;
        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops || !net_has_fallback_tunnels(net)) {
                struct ip_tunnel_net *it_init_net;

                it_init_net = net_generic(&init_net, ip_tnl_net_id);
                itn->type = it_init_net->type;
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing to move it to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
                itn->type = itn->fb_tunnel_dev->type;
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
                              struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
                           struct rtnl_link_ops *ops)
{
        struct ip_tunnel_net *itn;
        struct net *net;
        LIST_HEAD(list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                itn = net_generic(net, id);
                ip_tunnel_destroy(net, itn, &list, ops);
        }
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (nt->collect_md) {
                if (rtnl_dereference(itn->collect_md_tun))
                        return -EEXIST;
        } else {
                if (ip_tunnel_find(itn, p, dev->type))
                        return -EEXIST;
        }

        nt->net = net;
        nt->parms = *p;
        nt->fwmark = fwmark;
        err = register_netdevice(dev);
        if (err)
                goto err_register_netdevice;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (tb[IFLA_MTU]) {
                unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
                            (unsigned int)(max - sizeof(struct iphdr)));
        }

        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        ip_tunnel_add(itn, nt);
        return 0;

err_dev_set_mtu:
        unregister_netdevice(dev);
err_register_netdevice:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->needs_free_netdev = true;
        dev->priv_destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
        if (err) {
                free_percpu(dev->tstats);
                return err;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                dst_cache_destroy(&tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        if (tunnel->collect_md)
                netif_keep_dst(dev);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        ip_tunnel_del(itn, netdev_priv(dev));
        if (itn->fb_tunnel_dev == dev)
                WRITE_ONCE(itn->fb_tunnel_dev, NULL);

        dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");