mm: create the new vm_fault_t type
[linux-2.6-microblaze.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.

   The lookup runs several passes, from most to least specific. In every
   pass only tunnels whose device is IFF_UP are considered; a tunnel whose
   parms.link equals @link is returned immediately, any other match is
   only remembered as a candidate:
     1. bucket hash(key, remote): tunnel saddr == local and daddr == remote
     2. same bucket: tunnel daddr == remote and tunnel saddr == 0
     3. bucket hash(key, 0): tunnel saddr == local with daddr == 0, or
        local is multicast and equals the tunnel daddr
     4. same bucket, skipped when TUNNEL_NO_KEY is set in @flags:
        tunnel i_key == key with both addresses zero
   If no link-exact match was found the remembered candidate is returned,
   then the collect_md tunnel (if up), then the fallback device (if up),
   otherwise NULL.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        /* Pass 1: both endpoints match exactly. */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        /* unlike later passes, this overwrites any
                         * earlier candidate from this pass
                         */
                        cand = t;
        }

        /* Pass 2: remote matches, tunnel has no local address set. */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        /* The remaining passes use the bucket for "no remote address". */
        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        /* Pass 3: local-only tunnels, or multicast tunnels whose daddr is
         * the (multicast) address the packet was sent to.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        /* Pass 4: key-only tunnels with both addresses unset. */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        /* No link-exact match: fall back to the best candidate seen. */
        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t && t->dev->flags & IFF_UP)
                return t;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190                                     struct ip_tunnel_parm *parms)
191 {
192         unsigned int h;
193         __be32 remote;
194         __be32 i_key = parms->i_key;
195
196         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197                 remote = parms->iph.daddr;
198         else
199                 remote = 0;
200
201         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202                 i_key = 0;
203
204         h = ip_tunnel_hash(i_key, remote);
205         return &itn->tunnels[h];
206 }
207
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210         struct hlist_head *head = ip_bucket(itn, &t->parms);
211
212         if (t->collect_md)
213                 rcu_assign_pointer(itn->collect_md_tun, t);
214         hlist_add_head_rcu(&t->hash_node, head);
215 }
216
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219         if (t->collect_md)
220                 rcu_assign_pointer(itn->collect_md_tun, NULL);
221         hlist_del_init_rcu(&t->hash_node);
222 }
223
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225                                         struct ip_tunnel_parm *parms,
226                                         int type)
227 {
228         __be32 remote = parms->iph.daddr;
229         __be32 local = parms->iph.saddr;
230         __be32 key = parms->i_key;
231         __be16 flags = parms->i_flags;
232         int link = parms->link;
233         struct ip_tunnel *t = NULL;
234         struct hlist_head *head = ip_bucket(itn, parms);
235
236         hlist_for_each_entry_rcu(t, head, hash_node) {
237                 if (local == t->parms.iph.saddr &&
238                     remote == t->parms.iph.daddr &&
239                     link == t->parms.link &&
240                     type == t->dev->type &&
241                     ip_tunnel_key_match(&t->parms, flags, key))
242                         break;
243         }
244         return t;
245 }
246
247 static struct net_device *__ip_tunnel_create(struct net *net,
248                                              const struct rtnl_link_ops *ops,
249                                              struct ip_tunnel_parm *parms)
250 {
251         int err;
252         struct ip_tunnel *tunnel;
253         struct net_device *dev;
254         char name[IFNAMSIZ];
255
256         err = -E2BIG;
257         if (parms->name[0]) {
258                 if (!dev_valid_name(parms->name))
259                         goto failed;
260                 strlcpy(name, parms->name, IFNAMSIZ);
261         } else {
262                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
263                         goto failed;
264                 strcpy(name, ops->kind);
265                 strcat(name, "%d");
266         }
267
268         ASSERT_RTNL();
269         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
270         if (!dev) {
271                 err = -ENOMEM;
272                 goto failed;
273         }
274         dev_net_set(dev, net);
275
276         dev->rtnl_link_ops = ops;
277
278         tunnel = netdev_priv(dev);
279         tunnel->parms = *parms;
280         tunnel->net = net;
281
282         err = register_netdevice(dev);
283         if (err)
284                 goto failed_free;
285
286         return dev;
287
288 failed_free:
289         free_netdev(dev);
290 failed:
291         return ERR_PTR(err);
292 }
293
294 static int ip_tunnel_bind_dev(struct net_device *dev)
295 {
296         struct net_device *tdev = NULL;
297         struct ip_tunnel *tunnel = netdev_priv(dev);
298         const struct iphdr *iph;
299         int hlen = LL_MAX_HEADER;
300         int mtu = ETH_DATA_LEN;
301         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
302
303         iph = &tunnel->parms.iph;
304
305         /* Guess output device to choose reasonable mtu and needed_headroom */
306         if (iph->daddr) {
307                 struct flowi4 fl4;
308                 struct rtable *rt;
309
310                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
311                                     iph->saddr, tunnel->parms.o_key,
312                                     RT_TOS(iph->tos), tunnel->parms.link,
313                                     tunnel->fwmark, 0);
314                 rt = ip_route_output_key(tunnel->net, &fl4);
315
316                 if (!IS_ERR(rt)) {
317                         tdev = rt->dst.dev;
318                         ip_rt_put(rt);
319                 }
320                 if (dev->type != ARPHRD_ETHER)
321                         dev->flags |= IFF_POINTOPOINT;
322
323                 dst_cache_reset(&tunnel->dst_cache);
324         }
325
326         if (!tdev && tunnel->parms.link)
327                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
328
329         if (tdev) {
330                 hlen = tdev->hard_header_len + tdev->needed_headroom;
331                 mtu = min(tdev->mtu, IP_MAX_MTU);
332         }
333
334         dev->needed_headroom = t_hlen + hlen;
335         mtu -= (dev->hard_header_len + t_hlen);
336
337         if (mtu < IPV4_MIN_MTU)
338                 mtu = IPV4_MIN_MTU;
339
340         return mtu;
341 }
342
343 static struct ip_tunnel *ip_tunnel_create(struct net *net,
344                                           struct ip_tunnel_net *itn,
345                                           struct ip_tunnel_parm *parms)
346 {
347         struct ip_tunnel *nt;
348         struct net_device *dev;
349         int t_hlen;
350         int mtu;
351         int err;
352
353         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
354         if (IS_ERR(dev))
355                 return ERR_CAST(dev);
356
357         mtu = ip_tunnel_bind_dev(dev);
358         err = dev_set_mtu(dev, mtu);
359         if (err)
360                 goto err_dev_set_mtu;
361
362         nt = netdev_priv(dev);
363         t_hlen = nt->hlen + sizeof(struct iphdr);
364         dev->min_mtu = ETH_MIN_MTU;
365         dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
366         ip_tunnel_add(itn, nt);
367         return nt;
368
369 err_dev_set_mtu:
370         unregister_netdevice(dev);
371         return ERR_PTR(err);
372 }
373
/* Receive path shared by IP tunnel drivers: validate a decapsulated
 * packet against the tunnel configuration and hand it to the tunnel's
 * GRO cells.
 *
 * @tunnel:        tunnel the packet was matched to
 * @skb:           the packet; always consumed (delivered or freed)
 * @tpi:           parsed tunnel header info (flags, sequence number)
 * @tun_dst:       optional metadata dst; attached to @skb on success,
 *                 released on drop
 * @log_ecn_error: log a rate-limited message on ECN decapsulation errors
 *
 * Always returns 0, for delivered and dropped packets alike.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        /* Checksum presence in the packet must agree with the tunnel
         * configuration, in both directions.
         */
        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        /* Sequencing enabled: drop packets without a sequence number and,
         * once an expected number is known (i_seqno != 0), packets that
         * are behind it (signed difference, so wrap-around is tolerated).
         */
        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        /* Any non-zero result may be logged; only results > 1 drop. */
        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        /* Per-CPU receive counters. */
        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        /* Second argument is true when the tunnel's netns differs from
         * that of its device.
         */
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        /* tun_dst was never attached to the skb, so release it here. */
        if (tun_dst)
                dst_release((struct dst_entry *)tun_dst);
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
448
449 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
450                             unsigned int num)
451 {
452         if (num >= MAX_IPTUN_ENCAP_OPS)
453                 return -ERANGE;
454
455         return !cmpxchg((const struct ip_tunnel_encap_ops **)
456                         &iptun_encaps[num],
457                         NULL, ops) ? 0 : -1;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
460
461 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
462                             unsigned int num)
463 {
464         int ret;
465
466         if (num >= MAX_IPTUN_ENCAP_OPS)
467                 return -ERANGE;
468
469         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
470                        &iptun_encaps[num],
471                        ops, NULL) == ops) ? 0 : -1;
472
473         synchronize_net();
474
475         return ret;
476 }
477 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
478
479 int ip_tunnel_encap_setup(struct ip_tunnel *t,
480                           struct ip_tunnel_encap *ipencap)
481 {
482         int hlen;
483
484         memset(&t->encap, 0, sizeof(t->encap));
485
486         hlen = ip_encap_hlen(ipencap);
487         if (hlen < 0)
488                 return hlen;
489
490         t->encap.type = ipencap->type;
491         t->encap.sport = ipencap->sport;
492         t->encap.dport = ipencap->dport;
493         t->encap.flags = ipencap->flags;
494
495         t->encap_hlen = hlen;
496         t->hlen = t->encap_hlen + t->tun_hlen;
497
498         return 0;
499 }
500 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
501
/* Update path-MTU state for a packet about to be tunnelled and reject
 * packets that would not fit.
 *
 * @dev:         tunnel device
 * @skb:         packet being transmitted
 * @rt:          route to the tunnel endpoint
 * @df:          non-zero when the outer header will carry DF
 * @inner_iph:   inner network header (read as IPv4 for IPv4 packets)
 * @tunnel_hlen: tunnel header length; only used when @md is true,
 *               otherwise tunnel->hlen is used instead
 * @dst:         outer destination; only used when @md is true,
 *               otherwise tunnel->parms.iph.daddr is used
 * @md:          true for metadata-based transmission
 *
 * Returns 0 when the packet may be sent, or -E2BIG after an ICMP
 * "fragmentation needed" / ICMPv6 "packet too big" error has been sent.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph,
                            int tunnel_hlen, __be32 dst, bool md)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size;
        int mtu;

        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

        /* With DF set the usable MTU is the route MTU minus all outer
         * headers; otherwise take the MTU of the skb's own dst, or the
         * device MTU when the skb has no dst.
         */
        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel_hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        skb_dst_update_pmtu(skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                /* Non-GSO inner IPv4 packet with DF that exceeds the MTU */
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
                __be32 daddr;

                daddr = md ? dst : tunnel->parms.iph.daddr;

                /* Record a smaller (but still valid IPv6) MTU on the inner
                 * route when the tunnel has a unicast endpoint or the route
                 * is a host route.
                 */
                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((daddr && !ipv4_is_multicast(daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}
556
/* Transmit @skb through tunnel device @dev using the per-packet tunnel
 * metadata attached to the skb (skb_tunnel_info()).
 *
 * @proto:       IP protocol number for the outer header
 * @tunnel_hlen: tunnel header length, passed on to tnl_update_pmtu()
 *
 * The skb is always consumed: it is either handed to iptunnel_xmit() or
 * freed, with tx_errors or tx_dropped incremented on @dev.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       u8 proto, int tunnel_hlen)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;
        bool use_cache;

        /* Require IPv4 transmit metadata on the skb. */
        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        tos = key->tos;
        /* A key TOS of exactly 1 means: take the TOS from the inner header */
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
                            0, skb->mark, skb_get_hash(skb));
        /* This path rejects tunnels with an encapsulation type configured */
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;

        /* Try the dst cache stored in the metadata before a route lookup. */
        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);
                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
        }
        /* The route points back at the tunnel device itself: refuse. */
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
                            key->u.ipv4.dst, true)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = key->ttl;
        /* TTL 0 in the key: inherit from the inner header, or use the
         * route's hop limit for non-IP payloads.
         */
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        /* DF not requested by the metadata: copy it from an inner IPv4
         * header.
         */
        if (!df && skb->protocol == htons(ETH_P_IP))
                df = inner_iph->frag_off & htons(IP_DF);

        /* needed_headroom only ever grows to the largest value seen. */
        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > dev->needed_headroom)
                dev->needed_headroom = headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
652
/* Transmit @skb through tunnel device @dev.
 *
 * @tnl_params: outer IPv4 header template (daddr, saddr, tos, ttl,
 *              frag_off are read from it)
 * @protocol:   IP protocol number for the outer header; may be changed
 *              by ip_tunnel_encap()
 *
 * When the template has no destination the tunnel is treated as NBMA and
 * the destination is derived per packet: from tunnel metadata on the skb,
 * from the IPv4 next hop, or from an IPv4-compatible IPv6 address.
 *
 * The skb is always consumed: it is either handed to iptunnel_xmit() or
 * freed, with the matching error counter incremented on @dev.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;               /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        /* "connected" gates use of the cached route further down. */
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        /* Destination supplied by tunnel metadata. */
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                }
                else if (skb->protocol == htons(ETH_P_IP)) {
                        /* Use the next hop of the packet's own route. */
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        /* Unspecified neighbour address: fall back to the
                         * packet's destination address.
                         */
                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        /* Only an IPv4-compatible address yields an IPv4
                         * destination (its last 32 bits).
                         */
                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                if (!md)
                        connected = false;
        }

        tos = tnl_params->tos;
        /* Low bit set in the configured TOS: inherit the TOS from the inner
         * header. The route then depends on the packet, so the cached
         * route is not used.
         */
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        /* Route cache selection: metadata transmissions use the cache in
         * tun_info, connected tunnels use the tunnel's own cache, anything
         * else starts with no route (this also discards the borrowed
         * skb_rtable() pointer taken in the NBMA IPv4 case above).
         */
        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        /* The route points back at the tunnel device itself: refuse. */
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
                            0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        /* While err_count is pending and err_time is recent enough,
         * consume one count per packet and signal link failure for it;
         * once the window has expired the counter is reset.
         */
        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        /* TTL 0 in the template: inherit from the inner header, or use the
         * route's hop limit for other payloads.
         */
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        /* Outer DF: template value, plus the inner IPv4 DF bit unless the
         * tunnel is configured to ignore it.
         */
        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off&htons(IP_DF));

        /* needed_headroom only ever grows to the largest value seen. */
        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
841
/* Apply new parameters @p to the existing tunnel @t on device @dev.
 *
 * @itn:     per-netns tunnel table holding @t
 * @set_mtu: when the link or fwmark changes, also store the recomputed
 *           MTU in dev->mtu
 * @fwmark:  new firewall mark for the tunnel
 *
 * Always resets the tunnel's dst cache and announces a device state
 * change at the end.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu,
                             __u32 fwmark)
{
        /* The hash bucket depends on daddr and i_key (see ip_bucket()), so
         * unhash the tunnel before changing them and re-add it afterwards.
         */
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        /* Non-Ethernet devices expose the 4-byte tunnel endpoints as their
         * device and broadcast addresses.
         */
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        /* A different link or fwmark can change the output device, so
         * rebind and recompute headroom/MTU.
         */
        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;

                t->parms.link = p->link;
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}
876
877 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
878 {
879         int err = 0;
880         struct ip_tunnel *t = netdev_priv(dev);
881         struct net *net = t->net;
882         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
883
884         switch (cmd) {
885         case SIOCGETTUNNEL:
886                 if (dev == itn->fb_tunnel_dev) {
887                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
888                         if (!t)
889                                 t = netdev_priv(dev);
890                 }
891                 memcpy(p, &t->parms, sizeof(*p));
892                 break;
893
894         case SIOCADDTUNNEL:
895         case SIOCCHGTUNNEL:
896                 err = -EPERM;
897                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
898                         goto done;
899                 if (p->iph.ttl)
900                         p->iph.frag_off |= htons(IP_DF);
901                 if (!(p->i_flags & VTI_ISVTI)) {
902                         if (!(p->i_flags & TUNNEL_KEY))
903                                 p->i_key = 0;
904                         if (!(p->o_flags & TUNNEL_KEY))
905                                 p->o_key = 0;
906                 }
907
908                 t = ip_tunnel_find(itn, p, itn->type);
909
910                 if (cmd == SIOCADDTUNNEL) {
911                         if (!t) {
912                                 t = ip_tunnel_create(net, itn, p);
913                                 err = PTR_ERR_OR_ZERO(t);
914                                 break;
915                         }
916
917                         err = -EEXIST;
918                         break;
919                 }
920                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
921                         if (t) {
922                                 if (t->dev != dev) {
923                                         err = -EEXIST;
924                                         break;
925                                 }
926                         } else {
927                                 unsigned int nflags = 0;
928
929                                 if (ipv4_is_multicast(p->iph.daddr))
930                                         nflags = IFF_BROADCAST;
931                                 else if (p->iph.daddr)
932                                         nflags = IFF_POINTOPOINT;
933
934                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
935                                         err = -EINVAL;
936                                         break;
937                                 }
938
939                                 t = netdev_priv(dev);
940                         }
941                 }
942
943                 if (t) {
944                         err = 0;
945                         ip_tunnel_update(itn, t, dev, p, true, 0);
946                 } else {
947                         err = -ENOENT;
948                 }
949                 break;
950
951         case SIOCDELTUNNEL:
952                 err = -EPERM;
953                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
954                         goto done;
955
956                 if (dev == itn->fb_tunnel_dev) {
957                         err = -ENOENT;
958                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
959                         if (!t)
960                                 goto done;
961                         err = -EPERM;
962                         if (t == netdev_priv(itn->fb_tunnel_dev))
963                                 goto done;
964                         dev = t->dev;
965                 }
966                 unregister_netdevice(dev);
967                 err = 0;
968                 break;
969
970         default:
971                 err = -EINVAL;
972         }
973
974 done:
975         return err;
976 }
977 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
978
979 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
980 {
981         struct ip_tunnel *tunnel = netdev_priv(dev);
982         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
983         int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
984
985         if (new_mtu < ETH_MIN_MTU)
986                 return -EINVAL;
987
988         if (new_mtu > max_mtu) {
989                 if (strict)
990                         return -EINVAL;
991
992                 new_mtu = max_mtu;
993         }
994
995         dev->mtu = new_mtu;
996         return 0;
997 }
998 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
999
1000 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1001 {
1002         return __ip_tunnel_change_mtu(dev, new_mtu, true);
1003 }
1004 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1005
1006 static void ip_tunnel_dev_free(struct net_device *dev)
1007 {
1008         struct ip_tunnel *tunnel = netdev_priv(dev);
1009
1010         gro_cells_destroy(&tunnel->gro_cells);
1011         dst_cache_destroy(&tunnel->dst_cache);
1012         free_percpu(dev->tstats);
1013 }
1014
1015 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1016 {
1017         struct ip_tunnel *tunnel = netdev_priv(dev);
1018         struct ip_tunnel_net *itn;
1019
1020         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1021
1022         if (itn->fb_tunnel_dev != dev) {
1023                 ip_tunnel_del(itn, netdev_priv(dev));
1024                 unregister_netdevice_queue(dev, head);
1025         }
1026 }
1027 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1028
1029 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1030 {
1031         struct ip_tunnel *tunnel = netdev_priv(dev);
1032
1033         return tunnel->net;
1034 }
1035 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1036
1037 int ip_tunnel_get_iflink(const struct net_device *dev)
1038 {
1039         struct ip_tunnel *tunnel = netdev_priv(dev);
1040
1041         return tunnel->parms.link;
1042 }
1043 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1044
1045 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1046                                   struct rtnl_link_ops *ops, char *devname)
1047 {
1048         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1049         struct ip_tunnel_parm parms;
1050         unsigned int i;
1051
1052         itn->rtnl_link_ops = ops;
1053         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1054                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1055
1056         if (!ops || !net_has_fallback_tunnels(net)) {
1057                 struct ip_tunnel_net *it_init_net;
1058
1059                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1060                 itn->type = it_init_net->type;
1061                 itn->fb_tunnel_dev = NULL;
1062                 return 0;
1063         }
1064
1065         memset(&parms, 0, sizeof(parms));
1066         if (devname)
1067                 strlcpy(parms.name, devname, IFNAMSIZ);
1068
1069         rtnl_lock();
1070         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1071         /* FB netdevice is special: we have one, and only one per netns.
1072          * Allowing to move it to another netns is clearly unsafe.
1073          */
1074         if (!IS_ERR(itn->fb_tunnel_dev)) {
1075                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1076                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1077                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1078                 itn->type = itn->fb_tunnel_dev->type;
1079         }
1080         rtnl_unlock();
1081
1082         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1083 }
1084 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1085
1086 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1087                               struct list_head *head,
1088                               struct rtnl_link_ops *ops)
1089 {
1090         struct net_device *dev, *aux;
1091         int h;
1092
1093         for_each_netdev_safe(net, dev, aux)
1094                 if (dev->rtnl_link_ops == ops)
1095                         unregister_netdevice_queue(dev, head);
1096
1097         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1098                 struct ip_tunnel *t;
1099                 struct hlist_node *n;
1100                 struct hlist_head *thead = &itn->tunnels[h];
1101
1102                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1103                         /* If dev is in the same netns, it has already
1104                          * been added to the list by the previous loop.
1105                          */
1106                         if (!net_eq(dev_net(t->dev), net))
1107                                 unregister_netdevice_queue(t->dev, head);
1108         }
1109 }
1110
1111 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1112                            struct rtnl_link_ops *ops)
1113 {
1114         struct ip_tunnel_net *itn;
1115         struct net *net;
1116         LIST_HEAD(list);
1117
1118         rtnl_lock();
1119         list_for_each_entry(net, net_list, exit_list) {
1120                 itn = net_generic(net, id);
1121                 ip_tunnel_destroy(net, itn, &list, ops);
1122         }
1123         unregister_netdevice_many(&list);
1124         rtnl_unlock();
1125 }
1126 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1127
1128 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1129                       struct ip_tunnel_parm *p, __u32 fwmark)
1130 {
1131         struct ip_tunnel *nt;
1132         struct net *net = dev_net(dev);
1133         struct ip_tunnel_net *itn;
1134         int mtu;
1135         int err;
1136
1137         nt = netdev_priv(dev);
1138         itn = net_generic(net, nt->ip_tnl_net_id);
1139
1140         if (nt->collect_md) {
1141                 if (rtnl_dereference(itn->collect_md_tun))
1142                         return -EEXIST;
1143         } else {
1144                 if (ip_tunnel_find(itn, p, dev->type))
1145                         return -EEXIST;
1146         }
1147
1148         nt->net = net;
1149         nt->parms = *p;
1150         nt->fwmark = fwmark;
1151         err = register_netdevice(dev);
1152         if (err)
1153                 goto err_register_netdevice;
1154
1155         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1156                 eth_hw_addr_random(dev);
1157
1158         mtu = ip_tunnel_bind_dev(dev);
1159         if (tb[IFLA_MTU]) {
1160                 unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1161
1162                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1163                             (unsigned int)(max - sizeof(struct iphdr)));
1164         }
1165
1166         err = dev_set_mtu(dev, mtu);
1167         if (err)
1168                 goto err_dev_set_mtu;
1169
1170         ip_tunnel_add(itn, nt);
1171         return 0;
1172
1173 err_dev_set_mtu:
1174         unregister_netdevice(dev);
1175 err_register_netdevice:
1176         return err;
1177 }
1178 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1179
1180 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1181                          struct ip_tunnel_parm *p, __u32 fwmark)
1182 {
1183         struct ip_tunnel *t;
1184         struct ip_tunnel *tunnel = netdev_priv(dev);
1185         struct net *net = tunnel->net;
1186         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1187
1188         if (dev == itn->fb_tunnel_dev)
1189                 return -EINVAL;
1190
1191         t = ip_tunnel_find(itn, p, dev->type);
1192
1193         if (t) {
1194                 if (t->dev != dev)
1195                         return -EEXIST;
1196         } else {
1197                 t = tunnel;
1198
1199                 if (dev->type != ARPHRD_ETHER) {
1200                         unsigned int nflags = 0;
1201
1202                         if (ipv4_is_multicast(p->iph.daddr))
1203                                 nflags = IFF_BROADCAST;
1204                         else if (p->iph.daddr)
1205                                 nflags = IFF_POINTOPOINT;
1206
1207                         if ((dev->flags ^ nflags) &
1208                             (IFF_POINTOPOINT | IFF_BROADCAST))
1209                                 return -EINVAL;
1210                 }
1211         }
1212
1213         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1214         return 0;
1215 }
1216 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1217
1218 int ip_tunnel_init(struct net_device *dev)
1219 {
1220         struct ip_tunnel *tunnel = netdev_priv(dev);
1221         struct iphdr *iph = &tunnel->parms.iph;
1222         int err;
1223
1224         dev->needs_free_netdev = true;
1225         dev->priv_destructor = ip_tunnel_dev_free;
1226         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1227         if (!dev->tstats)
1228                 return -ENOMEM;
1229
1230         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1231         if (err) {
1232                 free_percpu(dev->tstats);
1233                 return err;
1234         }
1235
1236         err = gro_cells_init(&tunnel->gro_cells, dev);
1237         if (err) {
1238                 dst_cache_destroy(&tunnel->dst_cache);
1239                 free_percpu(dev->tstats);
1240                 return err;
1241         }
1242
1243         tunnel->dev = dev;
1244         tunnel->net = dev_net(dev);
1245         strcpy(tunnel->parms.name, dev->name);
1246         iph->version            = 4;
1247         iph->ihl                = 5;
1248
1249         if (tunnel->collect_md) {
1250                 dev->features |= NETIF_F_NETNS_LOCAL;
1251                 netif_keep_dst(dev);
1252         }
1253         return 0;
1254 }
1255 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1256
1257 void ip_tunnel_uninit(struct net_device *dev)
1258 {
1259         struct ip_tunnel *tunnel = netdev_priv(dev);
1260         struct net *net = tunnel->net;
1261         struct ip_tunnel_net *itn;
1262
1263         itn = net_generic(net, tunnel->ip_tnl_net_id);
1264         /* fb_tunnel_dev will be unregisted in net-exit call. */
1265         if (itn->fb_tunnel_dev != dev)
1266                 ip_tunnel_del(itn, netdev_priv(dev));
1267
1268         dst_cache_reset(&tunnel->dst_cache);
1269 }
1270 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1271
1272 /* Do least required initialization, rest of init is done in tunnel_init call */
1273 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1274 {
1275         struct ip_tunnel *tunnel = netdev_priv(dev);
1276         tunnel->ip_tnl_net_id = net_id;
1277 }
1278 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1279
1280 MODULE_LICENSE("GPL");