// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
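
/* The matching rules above form a small truth table. With TUNNEL_KEY in
 * p->i_flags, a packet matches only when it also carries TUNNEL_KEY *and*
 * the key values are equal; without TUNNEL_KEY in the tunnel config, only
 * keyless packets match:
 *
 *	tunnel has key | packet has key | result
 *	---------------+----------------+----------------
 *	yes            | yes            | key == p->i_key
 *	yes            | no             | false
 *	no             | yes            | false
 *	no             | no             | true
 */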
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
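
/* ip_tunnel_lookup() degrades gracefully through four RCU passes:
 * (1) exact (local, remote) match, (2) remote-only match, (3) local-only
 * or multicast match, (4) key-only wildcard - keeping the best
 * link-mismatched candidate in 'cand' along the way. Only when all four
 * passes fail does it fall back to the collect_md tunnel and then to the
 * per-netns fallback device.
 */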
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
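
/* Both helpers only publish or unpublish the hash node; the tunnel
 * itself is freed later through the netdev teardown path, after readers
 * that may still be walking the chain under rcu_read_lock() (see
 * ip_tunnel_lookup() above) have drained.
 */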
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}

	return t;
}
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
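
/* Worked example (a sketch, assuming a plain GRE tunnel over a standard
 * Ethernet underlay): with tdev->mtu = 1500 and t_hlen = 4 bytes of GRE
 * header + 20 bytes of outer IPv4, the result is the familiar
 * 1500 - 24 = 1476 tunnel MTU (1472 for GRE with a key). The
 * IPV4_MIN_MTU clamp keeps a pathological underlay from driving the
 * result below the 68 bytes IPv4 requires.
 */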
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}
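
/* Note the unwind order: if dev_set_mtu() fails, the freshly registered
 * device is unregistered again before the error is propagated, so a
 * failed SIOCADDTUNNEL leaves no half-created device behind.
 */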
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
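
/* IP_ECN_decapsulate() above returns 0 when the outer ECN bits fold
 * cleanly into the inner header, a small positive value when the packet
 * is still acceptable (the ratelimited "non-ECT" message when
 * log_ecn_error is set), and a value above 1 when it must be dropped;
 * only that last case bumps rx_frame_errors.
 */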
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
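
/* A typical provider registers its slot once at module init. A minimal
 * sketch (my_encap_ops, my_encap_hlen and my_build_header are
 * hypothetical names, not part of this file):
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,
 *		.build_header	= my_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 *
 * and releases it with ip_tunnel_encap_del_ops() on module exit; the FOU
 * module uses this pair for the TUNNEL_ENCAP_FOU slot.
 */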
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel_hlen;
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
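
/* In the DF case the usable tunnel MTU is derived from the route, e.g.
 * dst_mtu(&rt->dst) minus hard_header_len, the 20-byte outer IPv4
 * header and tunnel_hlen; an oversized inner packet with IP_DF set is
 * bounced with ICMP_FRAG_NEEDED (or ICMPV6_PKT_TOOBIG for IPv6
 * payloads) carrying that mtu, so the sender's path MTU discovery can
 * converge on the tunnel.
 */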
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	if (!df && skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
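
/* ip_md_tunnel_xmit() is the collect_md counterpart of ip_tunnel_xmit()
 * below: destination, key, tos and ttl come from the per-skb metadata
 * dst (e.g. as supplied by an external control plane) rather than from
 * the device's ip_tunnel_parm, so a single device can address many
 * remote endpoints.
 */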
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		}
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
			    0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
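
/* Route caching policy across the two xmit paths: a "connected" tunnel
 * (fixed remote address, non-inherited tos) keeps its route in
 * tunnel->dst_cache, metadata transmissions use the tun_info's own
 * dst_cache, and everything else (NBMA destinations, inherited tos)
 * falls back to a per-packet ip_route_output_key() lookup.
 */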
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
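
/* From userspace this path is reached through the classic tunnel ioctls
 * on an AF_INET socket; a minimal sketch (error handling omitted, "gre0"
 * is just an example device name):
 *
 *	struct ip_tunnel_parm p = { 0 };
 *	struct ifreq ifr;
 *
 *	strncpy(ifr.ifr_name, "gre0", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(fd, SIOCGETTUNNEL, &ifr);
 */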
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
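
/* Worked example: for a GRE device with an 8-byte tunnel header,
 * max_mtu = 65535 (IP_MAX_MTU) - dev->hard_header_len - (8 + 20).
 * With strict disabled an oversized request is clamped to max_mtu;
 * ip_tunnel_change_mtu(), the ndo_change_mtu helper, passes
 * strict == true and rejects it instead.
 */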
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
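
/* When a netns comes up without fallback tunnels
 * (net_has_fallback_tunnels() is false, e.g. under the
 * fb_tunnels_only_for_init_net sysctl), the hash table still operates
 * normally; only the catch-all fb device is absent, so packets that
 * match no configured tunnel are simply not delivered to one.
 */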
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");