Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
[linux-2.6-microblaze.git] / net / ipv4 / ip_tunnel.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55         return hash_32((__force u32)key ^ (__force u32)remote,
56                          IP_TNL_HASH_BITS);
57 }
58
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60                                 __be16 flags, __be32 key)
61 {
62         if (p->i_flags & TUNNEL_KEY) {
63                 if (flags & TUNNEL_KEY)
64                         return key == p->i_key;
65                 else
66                         /* key expected, none present */
67                         return false;
68         } else
69                 return !(flags & TUNNEL_KEY);
70 }
71
72 /* Fallback tunnel: no source, no destination, no key, no options
73
74    Tunnel hash table:
75    We require exact key match i.e. if a key is present in packet
76    it will match only tunnel with the same key; if it is not present,
77    it will match only keyless tunnel.
78
79    All keysless packets, if not matched configured keyless tunnels
80    will match fallback tunnel.
81    Given src, dst and key, find appropriate for input tunnel.
82 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84                                    int link, __be16 flags,
85                                    __be32 remote, __be32 local,
86                                    __be32 key)
87 {
88         struct ip_tunnel *t, *cand = NULL;
89         struct hlist_head *head;
90         struct net_device *ndev;
91         unsigned int hash;
92
93         hash = ip_tunnel_hash(key, remote);
94         head = &itn->tunnels[hash];
95
96         hlist_for_each_entry_rcu(t, head, hash_node) {
97                 if (local != t->parms.iph.saddr ||
98                     remote != t->parms.iph.daddr ||
99                     !(t->dev->flags & IFF_UP))
100                         continue;
101
102                 if (!ip_tunnel_key_match(&t->parms, flags, key))
103                         continue;
104
105                 if (t->parms.link == link)
106                         return t;
107                 else
108                         cand = t;
109         }
110
111         hlist_for_each_entry_rcu(t, head, hash_node) {
112                 if (remote != t->parms.iph.daddr ||
113                     t->parms.iph.saddr != 0 ||
114                     !(t->dev->flags & IFF_UP))
115                         continue;
116
117                 if (!ip_tunnel_key_match(&t->parms, flags, key))
118                         continue;
119
120                 if (t->parms.link == link)
121                         return t;
122                 else if (!cand)
123                         cand = t;
124         }
125
126         hash = ip_tunnel_hash(key, 0);
127         head = &itn->tunnels[hash];
128
129         hlist_for_each_entry_rcu(t, head, hash_node) {
130                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
132                         continue;
133
134                 if (!(t->dev->flags & IFF_UP))
135                         continue;
136
137                 if (!ip_tunnel_key_match(&t->parms, flags, key))
138                         continue;
139
140                 if (t->parms.link == link)
141                         return t;
142                 else if (!cand)
143                         cand = t;
144         }
145
146         hlist_for_each_entry_rcu(t, head, hash_node) {
147                 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148                     t->parms.iph.saddr != 0 ||
149                     t->parms.iph.daddr != 0 ||
150                     !(t->dev->flags & IFF_UP))
151                         continue;
152
153                 if (t->parms.link == link)
154                         return t;
155                 else if (!cand)
156                         cand = t;
157         }
158
159         if (cand)
160                 return cand;
161
162         t = rcu_dereference(itn->collect_md_tun);
163         if (t && t->dev->flags & IFF_UP)
164                 return t;
165
166         ndev = READ_ONCE(itn->fb_tunnel_dev);
167         if (ndev && ndev->flags & IFF_UP)
168                 return netdev_priv(ndev);
169
170         return NULL;
171 }
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
173
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175                                     struct ip_tunnel_parm *parms)
176 {
177         unsigned int h;
178         __be32 remote;
179         __be32 i_key = parms->i_key;
180
181         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182                 remote = parms->iph.daddr;
183         else
184                 remote = 0;
185
186         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
187                 i_key = 0;
188
189         h = ip_tunnel_hash(i_key, remote);
190         return &itn->tunnels[h];
191 }
192
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
194 {
195         struct hlist_head *head = ip_bucket(itn, &t->parms);
196
197         if (t->collect_md)
198                 rcu_assign_pointer(itn->collect_md_tun, t);
199         hlist_add_head_rcu(&t->hash_node, head);
200 }
201
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
203 {
204         if (t->collect_md)
205                 rcu_assign_pointer(itn->collect_md_tun, NULL);
206         hlist_del_init_rcu(&t->hash_node);
207 }
208
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210                                         struct ip_tunnel_parm *parms,
211                                         int type)
212 {
213         __be32 remote = parms->iph.daddr;
214         __be32 local = parms->iph.saddr;
215         __be32 key = parms->i_key;
216         __be16 flags = parms->i_flags;
217         int link = parms->link;
218         struct ip_tunnel *t = NULL;
219         struct hlist_head *head = ip_bucket(itn, parms);
220
221         hlist_for_each_entry_rcu(t, head, hash_node) {
222                 if (local == t->parms.iph.saddr &&
223                     remote == t->parms.iph.daddr &&
224                     link == t->parms.link &&
225                     type == t->dev->type &&
226                     ip_tunnel_key_match(&t->parms, flags, key))
227                         break;
228         }
229         return t;
230 }
231
232 static struct net_device *__ip_tunnel_create(struct net *net,
233                                              const struct rtnl_link_ops *ops,
234                                              struct ip_tunnel_parm *parms)
235 {
236         int err;
237         struct ip_tunnel *tunnel;
238         struct net_device *dev;
239         char name[IFNAMSIZ];
240
241         err = -E2BIG;
242         if (parms->name[0]) {
243                 if (!dev_valid_name(parms->name))
244                         goto failed;
245                 strlcpy(name, parms->name, IFNAMSIZ);
246         } else {
247                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
248                         goto failed;
249                 strcpy(name, ops->kind);
250                 strcat(name, "%d");
251         }
252
253         ASSERT_RTNL();
254         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
255         if (!dev) {
256                 err = -ENOMEM;
257                 goto failed;
258         }
259         dev_net_set(dev, net);
260
261         dev->rtnl_link_ops = ops;
262
263         tunnel = netdev_priv(dev);
264         tunnel->parms = *parms;
265         tunnel->net = net;
266
267         err = register_netdevice(dev);
268         if (err)
269                 goto failed_free;
270
271         return dev;
272
273 failed_free:
274         free_netdev(dev);
275 failed:
276         return ERR_PTR(err);
277 }
278
279 static int ip_tunnel_bind_dev(struct net_device *dev)
280 {
281         struct net_device *tdev = NULL;
282         struct ip_tunnel *tunnel = netdev_priv(dev);
283         const struct iphdr *iph;
284         int hlen = LL_MAX_HEADER;
285         int mtu = ETH_DATA_LEN;
286         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
287
288         iph = &tunnel->parms.iph;
289
290         /* Guess output device to choose reasonable mtu and needed_headroom */
291         if (iph->daddr) {
292                 struct flowi4 fl4;
293                 struct rtable *rt;
294
295                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296                                     iph->saddr, tunnel->parms.o_key,
297                                     RT_TOS(iph->tos), tunnel->parms.link,
298                                     tunnel->fwmark, 0);
299                 rt = ip_route_output_key(tunnel->net, &fl4);
300
301                 if (!IS_ERR(rt)) {
302                         tdev = rt->dst.dev;
303                         ip_rt_put(rt);
304                 }
305                 if (dev->type != ARPHRD_ETHER)
306                         dev->flags |= IFF_POINTOPOINT;
307
308                 dst_cache_reset(&tunnel->dst_cache);
309         }
310
311         if (!tdev && tunnel->parms.link)
312                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
313
314         if (tdev) {
315                 hlen = tdev->hard_header_len + tdev->needed_headroom;
316                 mtu = min(tdev->mtu, IP_MAX_MTU);
317         }
318
319         dev->needed_headroom = t_hlen + hlen;
320         mtu -= t_hlen;
321
322         if (mtu < IPV4_MIN_MTU)
323                 mtu = IPV4_MIN_MTU;
324
325         return mtu;
326 }
327
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329                                           struct ip_tunnel_net *itn,
330                                           struct ip_tunnel_parm *parms)
331 {
332         struct ip_tunnel *nt;
333         struct net_device *dev;
334         int t_hlen;
335         int mtu;
336         int err;
337
338         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
339         if (IS_ERR(dev))
340                 return ERR_CAST(dev);
341
342         mtu = ip_tunnel_bind_dev(dev);
343         err = dev_set_mtu(dev, mtu);
344         if (err)
345                 goto err_dev_set_mtu;
346
347         nt = netdev_priv(dev);
348         t_hlen = nt->hlen + sizeof(struct iphdr);
349         dev->min_mtu = ETH_MIN_MTU;
350         dev->max_mtu = IP_MAX_MTU - t_hlen;
351         ip_tunnel_add(itn, nt);
352         return nt;
353
354 err_dev_set_mtu:
355         unregister_netdevice(dev);
356         return ERR_PTR(err);
357 }
358
359 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
360                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
361                   bool log_ecn_error)
362 {
363         const struct iphdr *iph = ip_hdr(skb);
364         int err;
365
366 #ifdef CONFIG_NET_IPGRE_BROADCAST
367         if (ipv4_is_multicast(iph->daddr)) {
368                 tunnel->dev->stats.multicast++;
369                 skb->pkt_type = PACKET_BROADCAST;
370         }
371 #endif
372
373         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
374              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
375                 tunnel->dev->stats.rx_crc_errors++;
376                 tunnel->dev->stats.rx_errors++;
377                 goto drop;
378         }
379
380         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
381                 if (!(tpi->flags&TUNNEL_SEQ) ||
382                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
383                         tunnel->dev->stats.rx_fifo_errors++;
384                         tunnel->dev->stats.rx_errors++;
385                         goto drop;
386                 }
387                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
388         }
389
390         skb_reset_network_header(skb);
391
392         err = IP_ECN_decapsulate(iph, skb);
393         if (unlikely(err)) {
394                 if (log_ecn_error)
395                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
396                                         &iph->saddr, iph->tos);
397                 if (err > 1) {
398                         ++tunnel->dev->stats.rx_frame_errors;
399                         ++tunnel->dev->stats.rx_errors;
400                         goto drop;
401                 }
402         }
403
404         dev_sw_netstats_rx_add(tunnel->dev, skb->len);
405         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
406
407         if (tunnel->dev->type == ARPHRD_ETHER) {
408                 skb->protocol = eth_type_trans(skb, tunnel->dev);
409                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
410         } else {
411                 skb->dev = tunnel->dev;
412         }
413
414         if (tun_dst)
415                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
416
417         gro_cells_receive(&tunnel->gro_cells, skb);
418         return 0;
419
420 drop:
421         if (tun_dst)
422                 dst_release((struct dst_entry *)tun_dst);
423         kfree_skb(skb);
424         return 0;
425 }
426 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
427
428 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
429                             unsigned int num)
430 {
431         if (num >= MAX_IPTUN_ENCAP_OPS)
432                 return -ERANGE;
433
434         return !cmpxchg((const struct ip_tunnel_encap_ops **)
435                         &iptun_encaps[num],
436                         NULL, ops) ? 0 : -1;
437 }
438 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
439
440 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
441                             unsigned int num)
442 {
443         int ret;
444
445         if (num >= MAX_IPTUN_ENCAP_OPS)
446                 return -ERANGE;
447
448         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
449                        &iptun_encaps[num],
450                        ops, NULL) == ops) ? 0 : -1;
451
452         synchronize_net();
453
454         return ret;
455 }
456 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
457
458 int ip_tunnel_encap_setup(struct ip_tunnel *t,
459                           struct ip_tunnel_encap *ipencap)
460 {
461         int hlen;
462
463         memset(&t->encap, 0, sizeof(t->encap));
464
465         hlen = ip_encap_hlen(ipencap);
466         if (hlen < 0)
467                 return hlen;
468
469         t->encap.type = ipencap->type;
470         t->encap.sport = ipencap->sport;
471         t->encap.dport = ipencap->dport;
472         t->encap.flags = ipencap->flags;
473
474         t->encap_hlen = hlen;
475         t->hlen = t->encap_hlen + t->tun_hlen;
476
477         return 0;
478 }
479 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
480
481 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
482                             struct rtable *rt, __be16 df,
483                             const struct iphdr *inner_iph,
484                             int tunnel_hlen, __be32 dst, bool md)
485 {
486         struct ip_tunnel *tunnel = netdev_priv(dev);
487         int pkt_size;
488         int mtu;
489
490         tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
491         pkt_size = skb->len - tunnel_hlen;
492
493         if (df)
494                 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
495         else
496                 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
497
498         if (skb_valid_dst(skb))
499                 skb_dst_update_pmtu_no_confirm(skb, mtu);
500
501         if (skb->protocol == htons(ETH_P_IP)) {
502                 if (!skb_is_gso(skb) &&
503                     (inner_iph->frag_off & htons(IP_DF)) &&
504                     mtu < pkt_size) {
505                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
506                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
507                         return -E2BIG;
508                 }
509         }
510 #if IS_ENABLED(CONFIG_IPV6)
511         else if (skb->protocol == htons(ETH_P_IPV6)) {
512                 struct rt6_info *rt6;
513                 __be32 daddr;
514
515                 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
516                                            NULL;
517                 daddr = md ? dst : tunnel->parms.iph.daddr;
518
519                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
520                            mtu >= IPV6_MIN_MTU) {
521                         if ((daddr && !ipv4_is_multicast(daddr)) ||
522                             rt6->rt6i_dst.plen == 128) {
523                                 rt6->rt6i_flags |= RTF_MODIFIED;
524                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
525                         }
526                 }
527
528                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
529                                         mtu < pkt_size) {
530                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
531                         return -E2BIG;
532                 }
533         }
534 #endif
535         return 0;
536 }
537
/* Transmit @skb through tunnel device @dev using the per-packet tunnel
 * metadata attached to the skb (skb_tunnel_info()) for the outer
 * addresses, TOS, TTL and DF, rather than parameters stored in the
 * device.
 *
 * @proto:       protocol value passed to the route lookup and to
 *               iptunnel_xmit().
 * @tunnel_hlen: tunnel header length, forwarded to tnl_update_pmtu().
 *
 * The skb is always consumed: it is either handed to iptunnel_xmit() or
 * freed after dev->stats.tx_errors / tx_dropped has been incremented.
 */
538 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
539                        u8 proto, int tunnel_hlen)
540 {
541         struct ip_tunnel *tunnel = netdev_priv(dev);
542         u32 headroom = sizeof(struct iphdr);
543         struct ip_tunnel_info *tun_info;
544         const struct ip_tunnel_key *key;
545         const struct iphdr *inner_iph;
546         struct rtable *rt = NULL;
547         struct flowi4 fl4;
548         __be16 df = 0;
549         u8 tos, ttl;
550         bool use_cache;
551
            /* Usable metadata must exist, be marked for TX and be IPv4. */
552         tun_info = skb_tunnel_info(skb);
553         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
554                      ip_tunnel_info_af(tun_info) != AF_INET))
555                 goto tx_error;
556         key = &tun_info->key;
557         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
558         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
559         tos = key->tos;
            /* A key TOS of exactly 1 selects the TOS/dsfield of the inner header. */
560         if (tos == 1) {
561                 if (skb->protocol == htons(ETH_P_IP))
562                         tos = inner_iph->tos;
563                 else if (skb->protocol == htons(ETH_P_IPV6))
564                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
565         }
566         ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
567                             tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
568                             0, skb->mark, skb_get_hash(skb));
            /* Tunnels with an encapsulation type configured are rejected here. */
569         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
570                 goto tx_error;
571
            /* Try the metadata's dst cache first, then fall back to a route lookup. */
572         use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
573         if (use_cache)
574                 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
575         if (!rt) {
576                 rt = ip_route_output_key(tunnel->net, &fl4);
577                 if (IS_ERR(rt)) {
578                         dev->stats.tx_carrier_errors++;
579                         goto tx_error;
580                 }
581                 if (use_cache)
582                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
583                                           fl4.saddr);
584         }
            /* The route leads back to this very device: count a collision and drop. */
585         if (rt->dst.dev == dev) {
586                 ip_rt_put(rt);
587                 dev->stats.collisions++;
588                 goto tx_error;
589         }
590
591         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
592                 df = htons(IP_DF);
593         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
594                             key->u.ipv4.dst, true)) {
595                 ip_rt_put(rt);
596                 goto tx_error;
597         }
598
599         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
600         ttl = key->ttl;
            /* A zero key TTL means: copy it from the inner header, or use the
             * route's hop limit for other payload protocols.
             */
601         if (ttl == 0) {
602                 if (skb->protocol == htons(ETH_P_IP))
603                         ttl = inner_iph->ttl;
604                 else if (skb->protocol == htons(ETH_P_IPV6))
605                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
606                 else
607                         ttl = ip4_dst_hoplimit(&rt->dst);
608         }
609
            /* Record the largest headroom requirement seen so far on the device. */
610         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
611         if (headroom > dev->needed_headroom)
612                 dev->needed_headroom = headroom;
613
614         if (skb_cow_head(skb, dev->needed_headroom)) {
615                 ip_rt_put(rt);
616                 goto tx_dropped;
617         }
618         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
619                       df, !net_eq(tunnel->net, dev_net(dev)));
620         return;
        /* Error exits: bump the matching counter, then free the skb. */
621 tx_error:
622         dev->stats.tx_errors++;
623         goto kfree;
624 tx_dropped:
625         dev->stats.tx_dropped++;
626 kfree:
627         kfree_skb(skb);
628 }
629 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
630
/* Transmit @skb through tunnel device @dev.
 *
 * @tnl_params: outer IPv4 header template (saddr, daddr, tos, ttl,
 *              frag_off).
 * @protocol:   protocol value for the outer header; may be changed by
 *              ip_tunnel_encap().
 *
 * When tnl_params->daddr is zero the destination is chosen per packet:
 * from TX tunnel metadata on the skb, from the next hop of the skb's
 * IPv4 route, or from the last 32 bits of an IPv4-compatible IPv6
 * neighbour/destination address.
 *
 * The tunnel's dst cache is only used while "connected" stays true,
 * i.e. the destination and TOS do not depend on the individual packet.
 *
 * The skb is always consumed: it is either handed to iptunnel_xmit() or
 * freed after the relevant dev->stats error counter was incremented.
 */
631 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
632                     const struct iphdr *tnl_params, u8 protocol)
633 {
634         struct ip_tunnel *tunnel = netdev_priv(dev);
635         struct ip_tunnel_info *tun_info = NULL;
636         const struct iphdr *inner_iph;
637         unsigned int max_headroom;      /* The extra header space needed */
638         struct rtable *rt = NULL;               /* Route to the other host */
639         bool use_cache = false;
640         struct flowi4 fl4;
641         bool md = false;
642         bool connected;
643         u8 tos, ttl;
644         __be32 dst;
645         __be16 df;
646
647         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
648         connected = (tunnel->parms.iph.daddr != 0);
649
650         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
651
652         dst = tnl_params->daddr;
653         if (dst == 0) {
654                 /* NBMA tunnel */
655
656                 if (!skb_dst(skb)) {
657                         dev->stats.tx_fifo_errors++;
658                         goto tx_error;
659                 }
660
                    /* Prefer a destination supplied through TX tunnel metadata. */
661                 tun_info = skb_tunnel_info(skb);
662                 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
663                     ip_tunnel_info_af(tun_info) == AF_INET &&
664                     tun_info->key.u.ipv4.dst) {
665                         dst = tun_info->key.u.ipv4.dst;
666                         md = true;
667                         connected = true;
668                 }
669                 else if (skb->protocol == htons(ETH_P_IP)) {
                            /* rt is only borrowed here to find the next hop; it is
                             * reassigned before the route lookup below.
                             */
670                         rt = skb_rtable(skb);
671                         dst = rt_nexthop(rt, inner_iph->daddr);
672                 }
673 #if IS_ENABLED(CONFIG_IPV6)
674                 else if (skb->protocol == htons(ETH_P_IPV6)) {
675                         const struct in6_addr *addr6;
676                         struct neighbour *neigh;
677                         bool do_tx_error_icmp;
678                         int addr_type;
679
680                         neigh = dst_neigh_lookup(skb_dst(skb),
681                                                  &ipv6_hdr(skb)->daddr);
682                         if (!neigh)
683                                 goto tx_error;
684
685                         addr6 = (const struct in6_addr *)&neigh->primary_key;
686                         addr_type = ipv6_addr_type(addr6);
687
                            /* Unspecified neighbour address: use the packet's own destination. */
688                         if (addr_type == IPV6_ADDR_ANY) {
689                                 addr6 = &ipv6_hdr(skb)->daddr;
690                                 addr_type = ipv6_addr_type(addr6);
691                         }
692
                            /* Only an IPv4-compatible address yields an IPv4
                             * destination (its last 32 bits).
                             */
693                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
694                                 do_tx_error_icmp = true;
695                         else {
696                                 do_tx_error_icmp = false;
697                                 dst = addr6->s6_addr32[3];
698                         }
                            /* Drop the neighbour reference before any error exit. */
699                         neigh_release(neigh);
700                         if (do_tx_error_icmp)
701                                 goto tx_error_icmp;
702                 }
703 #endif
704                 else
705                         goto tx_error;
706
707                 if (!md)
708                         connected = false;
709         }
710
711         tos = tnl_params->tos;
            /* Lowest TOS bit set: take TOS/dsfield from the inner header.  The
             * result is then per packet, so the tunnel is no longer "connected".
             */
712         if (tos & 0x1) {
713                 tos &= ~0x1;
714                 if (skb->protocol == htons(ETH_P_IP)) {
715                         tos = inner_iph->tos;
716                         connected = false;
717                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
718                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
719                         connected = false;
720                 }
721         }
722
723         ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
724                             tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
725                             tunnel->fwmark, skb_get_hash(skb));
726
727         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
728                 goto tx_error;
729
            /* Cached route: from the metadata's cache for metadata destinations,
             * from the tunnel's own cache for connected tunnels, otherwise none.
             */
730         if (connected && md) {
731                 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
732                 if (use_cache)
733                         rt = dst_cache_get_ip4(&tun_info->dst_cache,
734                                                &fl4.saddr);
735         } else {
736                 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
737                                                 &fl4.saddr) : NULL;
738         }
739
740         if (!rt) {
741                 rt = ip_route_output_key(tunnel->net, &fl4);
742
743                 if (IS_ERR(rt)) {
744                         dev->stats.tx_carrier_errors++;
745                         goto tx_error;
746                 }
747                 if (use_cache)
748                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
749                                           fl4.saddr);
750                 else if (!md && connected)
751                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
752                                           fl4.saddr);
753         }
754
            /* The route leads back to this very device: count a collision and drop. */
755         if (rt->dst.dev == dev) {
756                 ip_rt_put(rt);
757                 dev->stats.collisions++;
758                 goto tx_error;
759         }
760
            /* Outer DF: from the template, plus the inner IPv4 DF bit unless
             * the tunnel is set to ignore it.
             */
761         df = tnl_params->frag_off;
762         if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
763                 df |= (inner_iph->frag_off & htons(IP_DF));
764
765         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
766                 ip_rt_put(rt);
767                 goto tx_error;
768         }
769
            /* While errors are pending and err_time is less than
             * IPTUNNEL_ERR_TIMEO old, signal a link failure for this packet and
             * count one error down; otherwise forget the pending errors.
             */
770         if (tunnel->err_count > 0) {
771                 if (time_before(jiffies,
772                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
773                         tunnel->err_count--;
774
775                         dst_link_failure(skb);
776                 } else
777                         tunnel->err_count = 0;
778         }
779
780         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
781         ttl = tnl_params->ttl;
            /* A zero template TTL means: copy it from the inner header, or use
             * the route's hop limit for other payload protocols.
             */
782         if (ttl == 0) {
783                 if (skb->protocol == htons(ETH_P_IP))
784                         ttl = inner_iph->ttl;
785 #if IS_ENABLED(CONFIG_IPV6)
786                 else if (skb->protocol == htons(ETH_P_IPV6))
787                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
788 #endif
789                 else
790                         ttl = ip4_dst_hoplimit(&rt->dst);
791         }
792
            /* Record the largest headroom requirement seen so far on the device. */
793         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
794                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
795         if (max_headroom > dev->needed_headroom)
796                 dev->needed_headroom = max_headroom;
797
798         if (skb_cow_head(skb, dev->needed_headroom)) {
799                 ip_rt_put(rt);
800                 dev->stats.tx_dropped++;
801                 kfree_skb(skb);
802                 return;
803         }
804
805         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
806                       df, !net_eq(tunnel->net, dev_net(dev)));
807         return;
808
        /* tx_error_icmp additionally signals a link failure, then falls
         * through to the common error exit.
         */
809 #if IS_ENABLED(CONFIG_IPV6)
810 tx_error_icmp:
811         dst_link_failure(skb);
812 #endif
813 tx_error:
814         dev->stats.tx_errors++;
815         kfree_skb(skb);
816 }
817 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
818
819 static void ip_tunnel_update(struct ip_tunnel_net *itn,
820                              struct ip_tunnel *t,
821                              struct net_device *dev,
822                              struct ip_tunnel_parm *p,
823                              bool set_mtu,
824                              __u32 fwmark)
825 {
826         ip_tunnel_del(itn, t);
827         t->parms.iph.saddr = p->iph.saddr;
828         t->parms.iph.daddr = p->iph.daddr;
829         t->parms.i_key = p->i_key;
830         t->parms.o_key = p->o_key;
831         if (dev->type != ARPHRD_ETHER) {
832                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
833                 memcpy(dev->broadcast, &p->iph.daddr, 4);
834         }
835         ip_tunnel_add(itn, t);
836
837         t->parms.iph.ttl = p->iph.ttl;
838         t->parms.iph.tos = p->iph.tos;
839         t->parms.iph.frag_off = p->iph.frag_off;
840
841         if (t->parms.link != p->link || t->fwmark != fwmark) {
842                 int mtu;
843
844                 t->parms.link = p->link;
845                 t->fwmark = fwmark;
846                 mtu = ip_tunnel_bind_dev(dev);
847                 if (set_mtu)
848                         dev->mtu = mtu;
849         }
850         dst_cache_reset(&t->dst_cache);
851         netdev_state_change(dev);
852 }
853
854 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
855 {
856         int err = 0;
857         struct ip_tunnel *t = netdev_priv(dev);
858         struct net *net = t->net;
859         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
860
861         switch (cmd) {
862         case SIOCGETTUNNEL:
863                 if (dev == itn->fb_tunnel_dev) {
864                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
865                         if (!t)
866                                 t = netdev_priv(dev);
867                 }
868                 memcpy(p, &t->parms, sizeof(*p));
869                 break;
870
871         case SIOCADDTUNNEL:
872         case SIOCCHGTUNNEL:
873                 err = -EPERM;
874                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
875                         goto done;
876                 if (p->iph.ttl)
877                         p->iph.frag_off |= htons(IP_DF);
878                 if (!(p->i_flags & VTI_ISVTI)) {
879                         if (!(p->i_flags & TUNNEL_KEY))
880                                 p->i_key = 0;
881                         if (!(p->o_flags & TUNNEL_KEY))
882                                 p->o_key = 0;
883                 }
884
885                 t = ip_tunnel_find(itn, p, itn->type);
886
887                 if (cmd == SIOCADDTUNNEL) {
888                         if (!t) {
889                                 t = ip_tunnel_create(net, itn, p);
890                                 err = PTR_ERR_OR_ZERO(t);
891                                 break;
892                         }
893
894                         err = -EEXIST;
895                         break;
896                 }
897                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
898                         if (t) {
899                                 if (t->dev != dev) {
900                                         err = -EEXIST;
901                                         break;
902                                 }
903                         } else {
904                                 unsigned int nflags = 0;
905
906                                 if (ipv4_is_multicast(p->iph.daddr))
907                                         nflags = IFF_BROADCAST;
908                                 else if (p->iph.daddr)
909                                         nflags = IFF_POINTOPOINT;
910
911                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
912                                         err = -EINVAL;
913                                         break;
914                                 }
915
916                                 t = netdev_priv(dev);
917                         }
918                 }
919
920                 if (t) {
921                         err = 0;
922                         ip_tunnel_update(itn, t, dev, p, true, 0);
923                 } else {
924                         err = -ENOENT;
925                 }
926                 break;
927
928         case SIOCDELTUNNEL:
929                 err = -EPERM;
930                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
931                         goto done;
932
933                 if (dev == itn->fb_tunnel_dev) {
934                         err = -ENOENT;
935                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
936                         if (!t)
937                                 goto done;
938                         err = -EPERM;
939                         if (t == netdev_priv(itn->fb_tunnel_dev))
940                                 goto done;
941                         dev = t->dev;
942                 }
943                 unregister_netdevice(dev);
944                 err = 0;
945                 break;
946
947         default:
948                 err = -EINVAL;
949         }
950
951 done:
952         return err;
953 }
954 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
955
956 int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
957 {
958         struct ip_tunnel_parm p;
959         int err;
960
961         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
962                 return -EFAULT;
963         err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
964         if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
965                 return -EFAULT;
966         return err;
967 }
968 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
969
970 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
971 {
972         struct ip_tunnel *tunnel = netdev_priv(dev);
973         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
974         int max_mtu = IP_MAX_MTU - t_hlen;
975
976         if (new_mtu < ETH_MIN_MTU)
977                 return -EINVAL;
978
979         if (new_mtu > max_mtu) {
980                 if (strict)
981                         return -EINVAL;
982
983                 new_mtu = max_mtu;
984         }
985
986         dev->mtu = new_mtu;
987         return 0;
988 }
989 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
990
991 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
992 {
993         return __ip_tunnel_change_mtu(dev, new_mtu, true);
994 }
995 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
996
997 static void ip_tunnel_dev_free(struct net_device *dev)
998 {
999         struct ip_tunnel *tunnel = netdev_priv(dev);
1000
1001         gro_cells_destroy(&tunnel->gro_cells);
1002         dst_cache_destroy(&tunnel->dst_cache);
1003         free_percpu(dev->tstats);
1004 }
1005
1006 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1007 {
1008         struct ip_tunnel *tunnel = netdev_priv(dev);
1009         struct ip_tunnel_net *itn;
1010
1011         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1012
1013         if (itn->fb_tunnel_dev != dev) {
1014                 ip_tunnel_del(itn, netdev_priv(dev));
1015                 unregister_netdevice_queue(dev, head);
1016         }
1017 }
1018 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1019
1020 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1021 {
1022         struct ip_tunnel *tunnel = netdev_priv(dev);
1023
1024         return tunnel->net;
1025 }
1026 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1027
1028 int ip_tunnel_get_iflink(const struct net_device *dev)
1029 {
1030         struct ip_tunnel *tunnel = netdev_priv(dev);
1031
1032         return tunnel->parms.link;
1033 }
1034 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1035
1036 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1037                                   struct rtnl_link_ops *ops, char *devname)
1038 {
1039         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1040         struct ip_tunnel_parm parms;
1041         unsigned int i;
1042
1043         itn->rtnl_link_ops = ops;
1044         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1045                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1046
1047         if (!ops || !net_has_fallback_tunnels(net)) {
1048                 struct ip_tunnel_net *it_init_net;
1049
1050                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1051                 itn->type = it_init_net->type;
1052                 itn->fb_tunnel_dev = NULL;
1053                 return 0;
1054         }
1055
1056         memset(&parms, 0, sizeof(parms));
1057         if (devname)
1058                 strlcpy(parms.name, devname, IFNAMSIZ);
1059
1060         rtnl_lock();
1061         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1062         /* FB netdevice is special: we have one, and only one per netns.
1063          * Allowing to move it to another netns is clearly unsafe.
1064          */
1065         if (!IS_ERR(itn->fb_tunnel_dev)) {
1066                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1067                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1068                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1069                 itn->type = itn->fb_tunnel_dev->type;
1070         }
1071         rtnl_unlock();
1072
1073         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1074 }
1075 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1076
1077 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1078                               struct list_head *head,
1079                               struct rtnl_link_ops *ops)
1080 {
1081         struct net_device *dev, *aux;
1082         int h;
1083
1084         for_each_netdev_safe(net, dev, aux)
1085                 if (dev->rtnl_link_ops == ops)
1086                         unregister_netdevice_queue(dev, head);
1087
1088         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1089                 struct ip_tunnel *t;
1090                 struct hlist_node *n;
1091                 struct hlist_head *thead = &itn->tunnels[h];
1092
1093                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1094                         /* If dev is in the same netns, it has already
1095                          * been added to the list by the previous loop.
1096                          */
1097                         if (!net_eq(dev_net(t->dev), net))
1098                                 unregister_netdevice_queue(t->dev, head);
1099         }
1100 }
1101
1102 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1103                            struct rtnl_link_ops *ops)
1104 {
1105         struct ip_tunnel_net *itn;
1106         struct net *net;
1107         LIST_HEAD(list);
1108
1109         rtnl_lock();
1110         list_for_each_entry(net, net_list, exit_list) {
1111                 itn = net_generic(net, id);
1112                 ip_tunnel_destroy(net, itn, &list, ops);
1113         }
1114         unregister_netdevice_many(&list);
1115         rtnl_unlock();
1116 }
1117 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1118
1119 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1120                       struct ip_tunnel_parm *p, __u32 fwmark)
1121 {
1122         struct ip_tunnel *nt;
1123         struct net *net = dev_net(dev);
1124         struct ip_tunnel_net *itn;
1125         int mtu;
1126         int err;
1127
1128         nt = netdev_priv(dev);
1129         itn = net_generic(net, nt->ip_tnl_net_id);
1130
1131         if (nt->collect_md) {
1132                 if (rtnl_dereference(itn->collect_md_tun))
1133                         return -EEXIST;
1134         } else {
1135                 if (ip_tunnel_find(itn, p, dev->type))
1136                         return -EEXIST;
1137         }
1138
1139         nt->net = net;
1140         nt->parms = *p;
1141         nt->fwmark = fwmark;
1142         err = register_netdevice(dev);
1143         if (err)
1144                 goto err_register_netdevice;
1145
1146         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1147                 eth_hw_addr_random(dev);
1148
1149         mtu = ip_tunnel_bind_dev(dev);
1150         if (tb[IFLA_MTU]) {
1151                 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1152
1153                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1154         }
1155
1156         err = dev_set_mtu(dev, mtu);
1157         if (err)
1158                 goto err_dev_set_mtu;
1159
1160         ip_tunnel_add(itn, nt);
1161         return 0;
1162
1163 err_dev_set_mtu:
1164         unregister_netdevice(dev);
1165 err_register_netdevice:
1166         return err;
1167 }
1168 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1169
1170 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1171                          struct ip_tunnel_parm *p, __u32 fwmark)
1172 {
1173         struct ip_tunnel *t;
1174         struct ip_tunnel *tunnel = netdev_priv(dev);
1175         struct net *net = tunnel->net;
1176         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1177
1178         if (dev == itn->fb_tunnel_dev)
1179                 return -EINVAL;
1180
1181         t = ip_tunnel_find(itn, p, dev->type);
1182
1183         if (t) {
1184                 if (t->dev != dev)
1185                         return -EEXIST;
1186         } else {
1187                 t = tunnel;
1188
1189                 if (dev->type != ARPHRD_ETHER) {
1190                         unsigned int nflags = 0;
1191
1192                         if (ipv4_is_multicast(p->iph.daddr))
1193                                 nflags = IFF_BROADCAST;
1194                         else if (p->iph.daddr)
1195                                 nflags = IFF_POINTOPOINT;
1196
1197                         if ((dev->flags ^ nflags) &
1198                             (IFF_POINTOPOINT | IFF_BROADCAST))
1199                                 return -EINVAL;
1200                 }
1201         }
1202
1203         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1204         return 0;
1205 }
1206 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1207
1208 int ip_tunnel_init(struct net_device *dev)
1209 {
1210         struct ip_tunnel *tunnel = netdev_priv(dev);
1211         struct iphdr *iph = &tunnel->parms.iph;
1212         int err;
1213
1214         dev->needs_free_netdev = true;
1215         dev->priv_destructor = ip_tunnel_dev_free;
1216         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1217         if (!dev->tstats)
1218                 return -ENOMEM;
1219
1220         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1221         if (err) {
1222                 free_percpu(dev->tstats);
1223                 return err;
1224         }
1225
1226         err = gro_cells_init(&tunnel->gro_cells, dev);
1227         if (err) {
1228                 dst_cache_destroy(&tunnel->dst_cache);
1229                 free_percpu(dev->tstats);
1230                 return err;
1231         }
1232
1233         tunnel->dev = dev;
1234         tunnel->net = dev_net(dev);
1235         strcpy(tunnel->parms.name, dev->name);
1236         iph->version            = 4;
1237         iph->ihl                = 5;
1238
1239         if (tunnel->collect_md)
1240                 netif_keep_dst(dev);
1241         return 0;
1242 }
1243 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1244
1245 void ip_tunnel_uninit(struct net_device *dev)
1246 {
1247         struct ip_tunnel *tunnel = netdev_priv(dev);
1248         struct net *net = tunnel->net;
1249         struct ip_tunnel_net *itn;
1250
1251         itn = net_generic(net, tunnel->ip_tnl_net_id);
1252         ip_tunnel_del(itn, netdev_priv(dev));
1253         if (itn->fb_tunnel_dev == dev)
1254                 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1255
1256         dst_cache_reset(&tunnel->dst_cache);
1257 }
1258 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1259
1260 /* Do least required initialization, rest of init is done in tunnel_init call */
1261 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1262 {
1263         struct ip_tunnel *tunnel = netdev_priv(dev);
1264         tunnel->ip_tnl_net_id = net_id;
1265 }
1266 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1267
1268 MODULE_LICENSE("GPL");