tunnel: Clear IPCB(skb)->opt before dst_link_failure called
[linux-2.6-microblaze.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67         return hash_32((__force u32)key ^ (__force u32)remote,
68                          IP_TNL_HASH_BITS);
69 }
70
71 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
72                              struct dst_entry *dst, __be32 saddr)
73 {
74         struct dst_entry *old_dst;
75
76         dst_clone(dst);
77         old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
78         dst_release(old_dst);
79         idst->saddr = saddr;
80 }
81
82 static noinline void tunnel_dst_set(struct ip_tunnel *t,
83                            struct dst_entry *dst, __be32 saddr)
84 {
85         __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
86 }
87
88 static void tunnel_dst_reset(struct ip_tunnel *t)
89 {
90         tunnel_dst_set(t, NULL, 0);
91 }
92
93 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
94 {
95         int i;
96
97         for_each_possible_cpu(i)
98                 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
99 }
100 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
101
102 static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
103                                         u32 cookie, __be32 *saddr)
104 {
105         struct ip_tunnel_dst *idst;
106         struct dst_entry *dst;
107
108         rcu_read_lock();
109         idst = raw_cpu_ptr(t->dst_cache);
110         dst = rcu_dereference(idst->dst);
111         if (dst && !atomic_inc_not_zero(&dst->__refcnt))
112                 dst = NULL;
113         if (dst) {
114                 if (!dst->obsolete || dst->ops->check(dst, cookie)) {
115                         *saddr = idst->saddr;
116                 } else {
117                         tunnel_dst_reset(t);
118                         dst_release(dst);
119                         dst = NULL;
120                 }
121         }
122         rcu_read_unlock();
123         return (struct rtable *)dst;
124 }
125
126 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
127                                 __be16 flags, __be32 key)
128 {
129         if (p->i_flags & TUNNEL_KEY) {
130                 if (flags & TUNNEL_KEY)
131                         return key == p->i_key;
132                 else
133                         /* key expected, none present */
134                         return false;
135         } else
136                 return !(flags & TUNNEL_KEY);
137 }
138
139 /* Fallback tunnel: no source, no destination, no key, no options
140
141    Tunnel hash table:
142    We require exact key match i.e. if a key is present in packet
143    it will match only tunnel with the same key; if it is not present,
144    it will match only keyless tunnel.
145
146    All keysless packets, if not matched configured keyless tunnels
147    will match fallback tunnel.
148    Given src, dst and key, find appropriate for input tunnel.
149 */
150 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
151                                    int link, __be16 flags,
152                                    __be32 remote, __be32 local,
153                                    __be32 key)
154 {
155         unsigned int hash;
156         struct ip_tunnel *t, *cand = NULL;
157         struct hlist_head *head;
158
159         hash = ip_tunnel_hash(key, remote);
160         head = &itn->tunnels[hash];
161
162         hlist_for_each_entry_rcu(t, head, hash_node) {
163                 if (local != t->parms.iph.saddr ||
164                     remote != t->parms.iph.daddr ||
165                     !(t->dev->flags & IFF_UP))
166                         continue;
167
168                 if (!ip_tunnel_key_match(&t->parms, flags, key))
169                         continue;
170
171                 if (t->parms.link == link)
172                         return t;
173                 else
174                         cand = t;
175         }
176
177         hlist_for_each_entry_rcu(t, head, hash_node) {
178                 if (remote != t->parms.iph.daddr ||
179                     t->parms.iph.saddr != 0 ||
180                     !(t->dev->flags & IFF_UP))
181                         continue;
182
183                 if (!ip_tunnel_key_match(&t->parms, flags, key))
184                         continue;
185
186                 if (t->parms.link == link)
187                         return t;
188                 else if (!cand)
189                         cand = t;
190         }
191
192         hash = ip_tunnel_hash(key, 0);
193         head = &itn->tunnels[hash];
194
195         hlist_for_each_entry_rcu(t, head, hash_node) {
196                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
197                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
198                         continue;
199
200                 if (!(t->dev->flags & IFF_UP))
201                         continue;
202
203                 if (!ip_tunnel_key_match(&t->parms, flags, key))
204                         continue;
205
206                 if (t->parms.link == link)
207                         return t;
208                 else if (!cand)
209                         cand = t;
210         }
211
212         if (flags & TUNNEL_NO_KEY)
213                 goto skip_key_lookup;
214
215         hlist_for_each_entry_rcu(t, head, hash_node) {
216                 if (t->parms.i_key != key ||
217                     t->parms.iph.saddr != 0 ||
218                     t->parms.iph.daddr != 0 ||
219                     !(t->dev->flags & IFF_UP))
220                         continue;
221
222                 if (t->parms.link == link)
223                         return t;
224                 else if (!cand)
225                         cand = t;
226         }
227
228 skip_key_lookup:
229         if (cand)
230                 return cand;
231
232         t = rcu_dereference(itn->collect_md_tun);
233         if (t)
234                 return t;
235
236         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
237                 return netdev_priv(itn->fb_tunnel_dev);
238
239         return NULL;
240 }
241 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
242
243 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
244                                     struct ip_tunnel_parm *parms)
245 {
246         unsigned int h;
247         __be32 remote;
248         __be32 i_key = parms->i_key;
249
250         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
251                 remote = parms->iph.daddr;
252         else
253                 remote = 0;
254
255         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
256                 i_key = 0;
257
258         h = ip_tunnel_hash(i_key, remote);
259         return &itn->tunnels[h];
260 }
261
262 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
263 {
264         struct hlist_head *head = ip_bucket(itn, &t->parms);
265
266         if (t->collect_md)
267                 rcu_assign_pointer(itn->collect_md_tun, t);
268         hlist_add_head_rcu(&t->hash_node, head);
269 }
270
271 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
272 {
273         if (t->collect_md)
274                 rcu_assign_pointer(itn->collect_md_tun, NULL);
275         hlist_del_init_rcu(&t->hash_node);
276 }
277
278 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
279                                         struct ip_tunnel_parm *parms,
280                                         int type)
281 {
282         __be32 remote = parms->iph.daddr;
283         __be32 local = parms->iph.saddr;
284         __be32 key = parms->i_key;
285         __be16 flags = parms->i_flags;
286         int link = parms->link;
287         struct ip_tunnel *t = NULL;
288         struct hlist_head *head = ip_bucket(itn, parms);
289
290         hlist_for_each_entry_rcu(t, head, hash_node) {
291                 if (local == t->parms.iph.saddr &&
292                     remote == t->parms.iph.daddr &&
293                     link == t->parms.link &&
294                     type == t->dev->type &&
295                     ip_tunnel_key_match(&t->parms, flags, key))
296                         break;
297         }
298         return t;
299 }
300
301 static struct net_device *__ip_tunnel_create(struct net *net,
302                                              const struct rtnl_link_ops *ops,
303                                              struct ip_tunnel_parm *parms)
304 {
305         int err;
306         struct ip_tunnel *tunnel;
307         struct net_device *dev;
308         char name[IFNAMSIZ];
309
310         if (parms->name[0])
311                 strlcpy(name, parms->name, IFNAMSIZ);
312         else {
313                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
314                         err = -E2BIG;
315                         goto failed;
316                 }
317                 strlcpy(name, ops->kind, IFNAMSIZ);
318                 strncat(name, "%d", 2);
319         }
320
321         ASSERT_RTNL();
322         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
323         if (!dev) {
324                 err = -ENOMEM;
325                 goto failed;
326         }
327         dev_net_set(dev, net);
328
329         dev->rtnl_link_ops = ops;
330
331         tunnel = netdev_priv(dev);
332         tunnel->parms = *parms;
333         tunnel->net = net;
334
335         err = register_netdevice(dev);
336         if (err)
337                 goto failed_free;
338
339         return dev;
340
341 failed_free:
342         free_netdev(dev);
343 failed:
344         return ERR_PTR(err);
345 }
346
347 static inline void init_tunnel_flow(struct flowi4 *fl4,
348                                     int proto,
349                                     __be32 daddr, __be32 saddr,
350                                     __be32 key, __u8 tos, int oif)
351 {
352         memset(fl4, 0, sizeof(*fl4));
353         fl4->flowi4_oif = oif;
354         fl4->daddr = daddr;
355         fl4->saddr = saddr;
356         fl4->flowi4_tos = tos;
357         fl4->flowi4_proto = proto;
358         fl4->fl4_gre_key = key;
359 }
360
361 static int ip_tunnel_bind_dev(struct net_device *dev)
362 {
363         struct net_device *tdev = NULL;
364         struct ip_tunnel *tunnel = netdev_priv(dev);
365         const struct iphdr *iph;
366         int hlen = LL_MAX_HEADER;
367         int mtu = ETH_DATA_LEN;
368         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
369
370         iph = &tunnel->parms.iph;
371
372         /* Guess output device to choose reasonable mtu and needed_headroom */
373         if (iph->daddr) {
374                 struct flowi4 fl4;
375                 struct rtable *rt;
376
377                 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
378                                  iph->saddr, tunnel->parms.o_key,
379                                  RT_TOS(iph->tos), tunnel->parms.link);
380                 rt = ip_route_output_key(tunnel->net, &fl4);
381
382                 if (!IS_ERR(rt)) {
383                         tdev = rt->dst.dev;
384                         tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
385                         ip_rt_put(rt);
386                 }
387                 if (dev->type != ARPHRD_ETHER)
388                         dev->flags |= IFF_POINTOPOINT;
389         }
390
391         if (!tdev && tunnel->parms.link)
392                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
393
394         if (tdev) {
395                 hlen = tdev->hard_header_len + tdev->needed_headroom;
396                 mtu = tdev->mtu;
397         }
398
399         dev->needed_headroom = t_hlen + hlen;
400         mtu -= (dev->hard_header_len + t_hlen);
401
402         if (mtu < 68)
403                 mtu = 68;
404
405         return mtu;
406 }
407
408 static struct ip_tunnel *ip_tunnel_create(struct net *net,
409                                           struct ip_tunnel_net *itn,
410                                           struct ip_tunnel_parm *parms)
411 {
412         struct ip_tunnel *nt;
413         struct net_device *dev;
414
415         BUG_ON(!itn->fb_tunnel_dev);
416         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
417         if (IS_ERR(dev))
418                 return ERR_CAST(dev);
419
420         dev->mtu = ip_tunnel_bind_dev(dev);
421
422         nt = netdev_priv(dev);
423         ip_tunnel_add(itn, nt);
424         return nt;
425 }
426
427 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
428                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
429                   bool log_ecn_error)
430 {
431         struct pcpu_sw_netstats *tstats;
432         const struct iphdr *iph = ip_hdr(skb);
433         int err;
434
435 #ifdef CONFIG_NET_IPGRE_BROADCAST
436         if (ipv4_is_multicast(iph->daddr)) {
437                 tunnel->dev->stats.multicast++;
438                 skb->pkt_type = PACKET_BROADCAST;
439         }
440 #endif
441
442         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
443              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
444                 tunnel->dev->stats.rx_crc_errors++;
445                 tunnel->dev->stats.rx_errors++;
446                 goto drop;
447         }
448
449         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
450                 if (!(tpi->flags&TUNNEL_SEQ) ||
451                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
452                         tunnel->dev->stats.rx_fifo_errors++;
453                         tunnel->dev->stats.rx_errors++;
454                         goto drop;
455                 }
456                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
457         }
458
459         skb_reset_network_header(skb);
460
461         err = IP_ECN_decapsulate(iph, skb);
462         if (unlikely(err)) {
463                 if (log_ecn_error)
464                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
465                                         &iph->saddr, iph->tos);
466                 if (err > 1) {
467                         ++tunnel->dev->stats.rx_frame_errors;
468                         ++tunnel->dev->stats.rx_errors;
469                         goto drop;
470                 }
471         }
472
473         tstats = this_cpu_ptr(tunnel->dev->tstats);
474         u64_stats_update_begin(&tstats->syncp);
475         tstats->rx_packets++;
476         tstats->rx_bytes += skb->len;
477         u64_stats_update_end(&tstats->syncp);
478
479         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
480
481         if (tunnel->dev->type == ARPHRD_ETHER) {
482                 skb->protocol = eth_type_trans(skb, tunnel->dev);
483                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
484         } else {
485                 skb->dev = tunnel->dev;
486         }
487
488         if (tun_dst)
489                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
490
491         gro_cells_receive(&tunnel->gro_cells, skb);
492         return 0;
493
494 drop:
495         kfree_skb(skb);
496         return 0;
497 }
498 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
499
500 static int ip_encap_hlen(struct ip_tunnel_encap *e)
501 {
502         const struct ip_tunnel_encap_ops *ops;
503         int hlen = -EINVAL;
504
505         if (e->type == TUNNEL_ENCAP_NONE)
506                 return 0;
507
508         if (e->type >= MAX_IPTUN_ENCAP_OPS)
509                 return -EINVAL;
510
511         rcu_read_lock();
512         ops = rcu_dereference(iptun_encaps[e->type]);
513         if (likely(ops && ops->encap_hlen))
514                 hlen = ops->encap_hlen(e);
515         rcu_read_unlock();
516
517         return hlen;
518 }
519
520 const struct ip_tunnel_encap_ops __rcu *
521                 iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
522
523 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
524                             unsigned int num)
525 {
526         if (num >= MAX_IPTUN_ENCAP_OPS)
527                 return -ERANGE;
528
529         return !cmpxchg((const struct ip_tunnel_encap_ops **)
530                         &iptun_encaps[num],
531                         NULL, ops) ? 0 : -1;
532 }
533 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
534
535 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
536                             unsigned int num)
537 {
538         int ret;
539
540         if (num >= MAX_IPTUN_ENCAP_OPS)
541                 return -ERANGE;
542
543         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
544                        &iptun_encaps[num],
545                        ops, NULL) == ops) ? 0 : -1;
546
547         synchronize_net();
548
549         return ret;
550 }
551 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
552
553 int ip_tunnel_encap_setup(struct ip_tunnel *t,
554                           struct ip_tunnel_encap *ipencap)
555 {
556         int hlen;
557
558         memset(&t->encap, 0, sizeof(t->encap));
559
560         hlen = ip_encap_hlen(ipencap);
561         if (hlen < 0)
562                 return hlen;
563
564         t->encap.type = ipencap->type;
565         t->encap.sport = ipencap->sport;
566         t->encap.dport = ipencap->dport;
567         t->encap.flags = ipencap->flags;
568
569         t->encap_hlen = hlen;
570         t->hlen = t->encap_hlen + t->tun_hlen;
571
572         return 0;
573 }
574 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
575
576 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
577                     u8 *protocol, struct flowi4 *fl4)
578 {
579         const struct ip_tunnel_encap_ops *ops;
580         int ret = -EINVAL;
581
582         if (t->encap.type == TUNNEL_ENCAP_NONE)
583                 return 0;
584
585         if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
586                 return -EINVAL;
587
588         rcu_read_lock();
589         ops = rcu_dereference(iptun_encaps[t->encap.type]);
590         if (likely(ops && ops->build_header))
591                 ret = ops->build_header(skb, &t->encap, protocol, fl4);
592         rcu_read_unlock();
593
594         return ret;
595 }
596 EXPORT_SYMBOL(ip_tunnel_encap);
597
598 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
599                             struct rtable *rt, __be16 df,
600                             const struct iphdr *inner_iph)
601 {
602         struct ip_tunnel *tunnel = netdev_priv(dev);
603         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
604         int mtu;
605
606         if (df)
607                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
608                                         - sizeof(struct iphdr) - tunnel->hlen;
609         else
610                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
611
612         if (skb_dst(skb))
613                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
614
615         if (skb->protocol == htons(ETH_P_IP)) {
616                 if (!skb_is_gso(skb) &&
617                     (inner_iph->frag_off & htons(IP_DF)) &&
618                     mtu < pkt_size) {
619                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
620                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
621                         return -E2BIG;
622                 }
623         }
624 #if IS_ENABLED(CONFIG_IPV6)
625         else if (skb->protocol == htons(ETH_P_IPV6)) {
626                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
627
628                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
629                            mtu >= IPV6_MIN_MTU) {
630                         if ((tunnel->parms.iph.daddr &&
631                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
632                             rt6->rt6i_dst.plen == 128) {
633                                 rt6->rt6i_flags |= RTF_MODIFIED;
634                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
635                         }
636                 }
637
638                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
639                                         mtu < pkt_size) {
640                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
641                         return -E2BIG;
642                 }
643         }
644 #endif
645         return 0;
646 }
647
648 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
649                     const struct iphdr *tnl_params, u8 protocol)
650 {
651         struct ip_tunnel *tunnel = netdev_priv(dev);
652         const struct iphdr *inner_iph;
653         struct flowi4 fl4;
654         u8     tos, ttl;
655         __be16 df;
656         struct rtable *rt;              /* Route to the other host */
657         unsigned int max_headroom;      /* The extra header space needed */
658         __be32 dst;
659         bool connected;
660
661         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
662         connected = (tunnel->parms.iph.daddr != 0);
663
664         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
665
666         dst = tnl_params->daddr;
667         if (dst == 0) {
668                 /* NBMA tunnel */
669
670                 if (!skb_dst(skb)) {
671                         dev->stats.tx_fifo_errors++;
672                         goto tx_error;
673                 }
674
675                 if (skb->protocol == htons(ETH_P_IP)) {
676                         rt = skb_rtable(skb);
677                         dst = rt_nexthop(rt, inner_iph->daddr);
678                 }
679 #if IS_ENABLED(CONFIG_IPV6)
680                 else if (skb->protocol == htons(ETH_P_IPV6)) {
681                         const struct in6_addr *addr6;
682                         struct neighbour *neigh;
683                         bool do_tx_error_icmp;
684                         int addr_type;
685
686                         neigh = dst_neigh_lookup(skb_dst(skb),
687                                                  &ipv6_hdr(skb)->daddr);
688                         if (!neigh)
689                                 goto tx_error;
690
691                         addr6 = (const struct in6_addr *)&neigh->primary_key;
692                         addr_type = ipv6_addr_type(addr6);
693
694                         if (addr_type == IPV6_ADDR_ANY) {
695                                 addr6 = &ipv6_hdr(skb)->daddr;
696                                 addr_type = ipv6_addr_type(addr6);
697                         }
698
699                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
700                                 do_tx_error_icmp = true;
701                         else {
702                                 do_tx_error_icmp = false;
703                                 dst = addr6->s6_addr32[3];
704                         }
705                         neigh_release(neigh);
706                         if (do_tx_error_icmp)
707                                 goto tx_error_icmp;
708                 }
709 #endif
710                 else
711                         goto tx_error;
712
713                 connected = false;
714         }
715
716         tos = tnl_params->tos;
717         if (tos & 0x1) {
718                 tos &= ~0x1;
719                 if (skb->protocol == htons(ETH_P_IP)) {
720                         tos = inner_iph->tos;
721                         connected = false;
722                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
723                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
724                         connected = false;
725                 }
726         }
727
728         init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
729                          tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
730
731         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
732                 goto tx_error;
733
734         rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
735
736         if (!rt) {
737                 rt = ip_route_output_key(tunnel->net, &fl4);
738
739                 if (IS_ERR(rt)) {
740                         dev->stats.tx_carrier_errors++;
741                         goto tx_error;
742                 }
743                 if (connected)
744                         tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
745         }
746
747         if (rt->dst.dev == dev) {
748                 ip_rt_put(rt);
749                 dev->stats.collisions++;
750                 goto tx_error;
751         }
752
753         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
754                 ip_rt_put(rt);
755                 goto tx_error;
756         }
757
758         if (tunnel->err_count > 0) {
759                 if (time_before(jiffies,
760                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
761                         tunnel->err_count--;
762
763                         dst_link_failure(skb);
764                 } else
765                         tunnel->err_count = 0;
766         }
767
768         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
769         ttl = tnl_params->ttl;
770         if (ttl == 0) {
771                 if (skb->protocol == htons(ETH_P_IP))
772                         ttl = inner_iph->ttl;
773 #if IS_ENABLED(CONFIG_IPV6)
774                 else if (skb->protocol == htons(ETH_P_IPV6))
775                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
776 #endif
777                 else
778                         ttl = ip4_dst_hoplimit(&rt->dst);
779         }
780
781         df = tnl_params->frag_off;
782         if (skb->protocol == htons(ETH_P_IP))
783                 df |= (inner_iph->frag_off&htons(IP_DF));
784
785         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
786                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
787         if (max_headroom > dev->needed_headroom)
788                 dev->needed_headroom = max_headroom;
789
790         if (skb_cow_head(skb, dev->needed_headroom)) {
791                 ip_rt_put(rt);
792                 dev->stats.tx_dropped++;
793                 kfree_skb(skb);
794                 return;
795         }
796
797         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
798                       df, !net_eq(tunnel->net, dev_net(dev)));
799         return;
800
801 #if IS_ENABLED(CONFIG_IPV6)
802 tx_error_icmp:
803         dst_link_failure(skb);
804 #endif
805 tx_error:
806         dev->stats.tx_errors++;
807         kfree_skb(skb);
808 }
809 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
810
811 static void ip_tunnel_update(struct ip_tunnel_net *itn,
812                              struct ip_tunnel *t,
813                              struct net_device *dev,
814                              struct ip_tunnel_parm *p,
815                              bool set_mtu)
816 {
817         ip_tunnel_del(itn, t);
818         t->parms.iph.saddr = p->iph.saddr;
819         t->parms.iph.daddr = p->iph.daddr;
820         t->parms.i_key = p->i_key;
821         t->parms.o_key = p->o_key;
822         if (dev->type != ARPHRD_ETHER) {
823                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
824                 memcpy(dev->broadcast, &p->iph.daddr, 4);
825         }
826         ip_tunnel_add(itn, t);
827
828         t->parms.iph.ttl = p->iph.ttl;
829         t->parms.iph.tos = p->iph.tos;
830         t->parms.iph.frag_off = p->iph.frag_off;
831
832         if (t->parms.link != p->link) {
833                 int mtu;
834
835                 t->parms.link = p->link;
836                 mtu = ip_tunnel_bind_dev(dev);
837                 if (set_mtu)
838                         dev->mtu = mtu;
839         }
840         ip_tunnel_dst_reset_all(t);
841         netdev_state_change(dev);
842 }
843
844 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
845 {
846         int err = 0;
847         struct ip_tunnel *t = netdev_priv(dev);
848         struct net *net = t->net;
849         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
850
851         BUG_ON(!itn->fb_tunnel_dev);
852         switch (cmd) {
853         case SIOCGETTUNNEL:
854                 if (dev == itn->fb_tunnel_dev) {
855                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
856                         if (!t)
857                                 t = netdev_priv(dev);
858                 }
859                 memcpy(p, &t->parms, sizeof(*p));
860                 break;
861
862         case SIOCADDTUNNEL:
863         case SIOCCHGTUNNEL:
864                 err = -EPERM;
865                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
866                         goto done;
867                 if (p->iph.ttl)
868                         p->iph.frag_off |= htons(IP_DF);
869                 if (!(p->i_flags & VTI_ISVTI)) {
870                         if (!(p->i_flags & TUNNEL_KEY))
871                                 p->i_key = 0;
872                         if (!(p->o_flags & TUNNEL_KEY))
873                                 p->o_key = 0;
874                 }
875
876                 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
877
878                 if (cmd == SIOCADDTUNNEL) {
879                         if (!t) {
880                                 t = ip_tunnel_create(net, itn, p);
881                                 err = PTR_ERR_OR_ZERO(t);
882                                 break;
883                         }
884
885                         err = -EEXIST;
886                         break;
887                 }
888                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
889                         if (t) {
890                                 if (t->dev != dev) {
891                                         err = -EEXIST;
892                                         break;
893                                 }
894                         } else {
895                                 unsigned int nflags = 0;
896
897                                 if (ipv4_is_multicast(p->iph.daddr))
898                                         nflags = IFF_BROADCAST;
899                                 else if (p->iph.daddr)
900                                         nflags = IFF_POINTOPOINT;
901
902                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
903                                         err = -EINVAL;
904                                         break;
905                                 }
906
907                                 t = netdev_priv(dev);
908                         }
909                 }
910
911                 if (t) {
912                         err = 0;
913                         ip_tunnel_update(itn, t, dev, p, true);
914                 } else {
915                         err = -ENOENT;
916                 }
917                 break;
918
919         case SIOCDELTUNNEL:
920                 err = -EPERM;
921                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
922                         goto done;
923
924                 if (dev == itn->fb_tunnel_dev) {
925                         err = -ENOENT;
926                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
927                         if (!t)
928                                 goto done;
929                         err = -EPERM;
930                         if (t == netdev_priv(itn->fb_tunnel_dev))
931                                 goto done;
932                         dev = t->dev;
933                 }
934                 unregister_netdevice(dev);
935                 err = 0;
936                 break;
937
938         default:
939                 err = -EINVAL;
940         }
941
942 done:
943         return err;
944 }
945 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
946
947 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
948 {
949         struct ip_tunnel *tunnel = netdev_priv(dev);
950         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
951         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
952
953         if (new_mtu < 68)
954                 return -EINVAL;
955
956         if (new_mtu > max_mtu) {
957                 if (strict)
958                         return -EINVAL;
959
960                 new_mtu = max_mtu;
961         }
962
963         dev->mtu = new_mtu;
964         return 0;
965 }
966 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
967
/*
 * ip_tunnel_change_mtu - strict MTU change for a tunnel device
 * @dev:     tunnel device
 * @new_mtu: requested MTU
 *
 * Same as __ip_tunnel_change_mtu() with strict checking: an out-of-range
 * value is rejected with -EINVAL rather than clamped. Returns 0 on success.
 */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	const bool strict = true;

	return __ip_tunnel_change_mtu(dev, new_mtu, strict);
}
972 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
973
974 static void ip_tunnel_dev_free(struct net_device *dev)
975 {
976         struct ip_tunnel *tunnel = netdev_priv(dev);
977
978         gro_cells_destroy(&tunnel->gro_cells);
979         free_percpu(tunnel->dst_cache);
980         free_percpu(dev->tstats);
981         free_netdev(dev);
982 }
983
984 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
985 {
986         struct ip_tunnel *tunnel = netdev_priv(dev);
987         struct ip_tunnel_net *itn;
988
989         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
990
991         if (itn->fb_tunnel_dev != dev) {
992                 ip_tunnel_del(itn, netdev_priv(dev));
993                 unregister_netdevice_queue(dev, head);
994         }
995 }
996 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
997
998 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
999 {
1000         struct ip_tunnel *tunnel = netdev_priv(dev);
1001
1002         return tunnel->net;
1003 }
1004 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1005
1006 int ip_tunnel_get_iflink(const struct net_device *dev)
1007 {
1008         struct ip_tunnel *tunnel = netdev_priv(dev);
1009
1010         return tunnel->parms.link;
1011 }
1012 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1013
1014 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
1015                                   struct rtnl_link_ops *ops, char *devname)
1016 {
1017         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1018         struct ip_tunnel_parm parms;
1019         unsigned int i;
1020
1021         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1022                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1023
1024         if (!ops) {
1025                 itn->fb_tunnel_dev = NULL;
1026                 return 0;
1027         }
1028
1029         memset(&parms, 0, sizeof(parms));
1030         if (devname)
1031                 strlcpy(parms.name, devname, IFNAMSIZ);
1032
1033         rtnl_lock();
1034         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1035         /* FB netdevice is special: we have one, and only one per netns.
1036          * Allowing to move it to another netns is clearly unsafe.
1037          */
1038         if (!IS_ERR(itn->fb_tunnel_dev)) {
1039                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1040                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1041                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1042         }
1043         rtnl_unlock();
1044
1045         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1046 }
1047 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1048
1049 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1050                               struct rtnl_link_ops *ops)
1051 {
1052         struct net *net = dev_net(itn->fb_tunnel_dev);
1053         struct net_device *dev, *aux;
1054         int h;
1055
1056         for_each_netdev_safe(net, dev, aux)
1057                 if (dev->rtnl_link_ops == ops)
1058                         unregister_netdevice_queue(dev, head);
1059
1060         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1061                 struct ip_tunnel *t;
1062                 struct hlist_node *n;
1063                 struct hlist_head *thead = &itn->tunnels[h];
1064
1065                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1066                         /* If dev is in the same netns, it has already
1067                          * been added to the list by the previous loop.
1068                          */
1069                         if (!net_eq(dev_net(t->dev), net))
1070                                 unregister_netdevice_queue(t->dev, head);
1071         }
1072 }
1073
1074 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1075 {
1076         LIST_HEAD(list);
1077
1078         rtnl_lock();
1079         ip_tunnel_destroy(itn, &list, ops);
1080         unregister_netdevice_many(&list);
1081         rtnl_unlock();
1082 }
1083 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1084
1085 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1086                       struct ip_tunnel_parm *p)
1087 {
1088         struct ip_tunnel *nt;
1089         struct net *net = dev_net(dev);
1090         struct ip_tunnel_net *itn;
1091         int mtu;
1092         int err;
1093
1094         nt = netdev_priv(dev);
1095         itn = net_generic(net, nt->ip_tnl_net_id);
1096
1097         if (nt->collect_md) {
1098                 if (rtnl_dereference(itn->collect_md_tun))
1099                         return -EEXIST;
1100         } else {
1101                 if (ip_tunnel_find(itn, p, dev->type))
1102                         return -EEXIST;
1103         }
1104
1105         nt->net = net;
1106         nt->parms = *p;
1107         err = register_netdevice(dev);
1108         if (err)
1109                 goto out;
1110
1111         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1112                 eth_hw_addr_random(dev);
1113
1114         mtu = ip_tunnel_bind_dev(dev);
1115         if (!tb[IFLA_MTU])
1116                 dev->mtu = mtu;
1117
1118         ip_tunnel_add(itn, nt);
1119 out:
1120         return err;
1121 }
1122 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1123
1124 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1125                          struct ip_tunnel_parm *p)
1126 {
1127         struct ip_tunnel *t;
1128         struct ip_tunnel *tunnel = netdev_priv(dev);
1129         struct net *net = tunnel->net;
1130         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1131
1132         if (dev == itn->fb_tunnel_dev)
1133                 return -EINVAL;
1134
1135         t = ip_tunnel_find(itn, p, dev->type);
1136
1137         if (t) {
1138                 if (t->dev != dev)
1139                         return -EEXIST;
1140         } else {
1141                 t = tunnel;
1142
1143                 if (dev->type != ARPHRD_ETHER) {
1144                         unsigned int nflags = 0;
1145
1146                         if (ipv4_is_multicast(p->iph.daddr))
1147                                 nflags = IFF_BROADCAST;
1148                         else if (p->iph.daddr)
1149                                 nflags = IFF_POINTOPOINT;
1150
1151                         if ((dev->flags ^ nflags) &
1152                             (IFF_POINTOPOINT | IFF_BROADCAST))
1153                                 return -EINVAL;
1154                 }
1155         }
1156
1157         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1158         return 0;
1159 }
1160 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1161
1162 int ip_tunnel_init(struct net_device *dev)
1163 {
1164         struct ip_tunnel *tunnel = netdev_priv(dev);
1165         struct iphdr *iph = &tunnel->parms.iph;
1166         int err;
1167
1168         dev->destructor = ip_tunnel_dev_free;
1169         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1170         if (!dev->tstats)
1171                 return -ENOMEM;
1172
1173         tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1174         if (!tunnel->dst_cache) {
1175                 free_percpu(dev->tstats);
1176                 return -ENOMEM;
1177         }
1178
1179         err = gro_cells_init(&tunnel->gro_cells, dev);
1180         if (err) {
1181                 free_percpu(tunnel->dst_cache);
1182                 free_percpu(dev->tstats);
1183                 return err;
1184         }
1185
1186         tunnel->dev = dev;
1187         tunnel->net = dev_net(dev);
1188         strcpy(tunnel->parms.name, dev->name);
1189         iph->version            = 4;
1190         iph->ihl                = 5;
1191
1192         if (tunnel->collect_md) {
1193                 dev->features |= NETIF_F_NETNS_LOCAL;
1194                 netif_keep_dst(dev);
1195         }
1196         return 0;
1197 }
1198 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1199
1200 void ip_tunnel_uninit(struct net_device *dev)
1201 {
1202         struct ip_tunnel *tunnel = netdev_priv(dev);
1203         struct net *net = tunnel->net;
1204         struct ip_tunnel_net *itn;
1205
1206         itn = net_generic(net, tunnel->ip_tnl_net_id);
1207         /* fb_tunnel_dev will be unregisted in net-exit call. */
1208         if (itn->fb_tunnel_dev != dev)
1209                 ip_tunnel_del(itn, netdev_priv(dev));
1210
1211         ip_tunnel_dst_reset_all(tunnel);
1212 }
1213 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1214
1215 /* Do least required initialization, rest of init is done in tunnel_init call */
1216 void ip_tunnel_setup(struct net_device *dev, int net_id)
1217 {
1218         struct ip_tunnel *tunnel = netdev_priv(dev);
1219         tunnel->ip_tnl_net_id = net_id;
1220 }
1221 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1222
1223 MODULE_LICENSE("GPL");