// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
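
/* Illustration (not from the original file): the rules above reduce to a
 * small truth table. Assuming a tunnel configured with TUNNEL_KEY and
 * i_key == htonl(42):
 *
 *	packet flags	packet key	result
 *	TUNNEL_KEY	htonl(42)	match
 *	TUNNEL_KEY	htonl(7)	no match (wrong key)
 *	none		-		no match (key expected, none present)
 *
 * A keyless tunnel (i_flags without TUNNEL_KEY) matches only packets that
 * carry no key at all.
 */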

/* Fallback tunnel: no source, no destination, no key, no options.

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
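
/* Usage sketch (illustrative, not code from this file): a receive path
 * such as a GRE demultiplexer resolves the tunnel under rcu_read_lock()
 * before handing the skb on; "itn" and "tpi" below stand for the caller's
 * per-netns tunnel table and parsed tunnel header:
 *
 *	rcu_read_lock();
 *	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *			     iph->saddr, iph->daddr, tpi->key);
 *	if (t)
 *		ip_tunnel_rcv(t, skb, tpi, NULL, log_ecn_error);
 *	rcu_read_unlock();
 */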

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= dev->hard_header_len + t_hlen;

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
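
/* Note on the TUNNEL_SEQ check above: the (s32) cast of the subtraction
 * implements serial-number arithmetic, so sequence numbers remain valid
 * across a 2^32 wrap. For example, with i_seqno == 0xffffffff an incoming
 * seq of 0 gives (s32)(0 - 0xffffffff) == 1, which is not < 0, so the
 * packet is accepted and i_seqno advances to 1.
 */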

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
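
/* Worked example (illustrative): with FOU encapsulation, ip_encap_hlen()
 * evaluates to sizeof(struct udphdr), i.e. 8 bytes. For a GRE tunnel with
 * a 4-byte tun_hlen this sets t->hlen = 8 + 4 = 12, so the transmit path
 * reserves 12 bytes of headroom beyond the outer IPv4 header.
 */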

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;

	if (df)
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
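
/* Worked example (illustrative): with df set, a route MTU of 1500 and a
 * plain IPIP tunnel (tunnel_hlen == 0), the inner MTU computed above is
 * 1500 - sizeof(struct iphdr) = 1480. A non-GSO inner IPv4 packet larger
 * than that carrying IP_DF is answered with ICMP_FRAG_NEEDED and -E2BIG
 * is returned, so the caller drops the packet instead of transmitting.
 */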

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}
			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
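
/* Userspace sketch (illustrative, not part of this file): the legacy
 * ioctl interface serviced here passes a struct ip_tunnel_parm through
 * ifr_ifru.ifru_data, e.g. to read back the configuration of an existing
 * device named "tunl1":
 *
 *	struct ip_tunnel_parm p = { 0 };
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "tunl1", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	if (ioctl(fd, SIOCGETTUNNEL, &ifr) == 0)
 *		printf("remote 0x%x\n", ntohl(p.iph.daddr));
 */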

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
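
/* Worked example (illustrative): a GRE tunnel with both checksum and key
 * enabled carries a 4 + 4 + 4 = 12 byte GRE header, so t_hlen above is
 * 12 + 20 = 32 and the largest MTU accepted is 0xFFFF - 32 = 65503. With
 * strict == false an oversized request is clamped to that bound instead
 * of being rejected with -EINVAL.
 */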

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
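
/* Usage sketch (illustrative; "my_net_id" and "my_link_ops" are
 * placeholder names, not symbols from this file): tunnel drivers call
 * this helper from their pernet init operation, naming their fallback
 * device:
 *
 *	static int __net_init my_tunnel_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, my_net_id,
 *					  &my_link_ops, "tunl0");
 *	}
 */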

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");