/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
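/* Note (commentary added for clarity, not from the original file): these
 * values are ordered so that "more broken" is more negative.  FAIL_HARD
 * disqualifies a route outright, FAIL_PROBE reports a neighbour in FAILED
 * state (a reachability probe may be scheduled), and FAIL_DO_RR asks the
 * caller to round-robin to the next candidate router; see find_match().
 */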
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
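/* Commentary (not from the original file): the uncached list tracks dst
 * entries that are not attached to the fib6 tree (e.g. RTF_CACHE clones
 * created for FLOWI_FLAG_KNOWN_NH lookups and icmp6 dsts), so that when
 * their device disappears they can still be found and re-pointed at the
 * loopback device; see rt6_uncached_list_flush_dev() below.
 */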
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					      struct flowi6 *fl6, int oif,
					      const struct sk_buff *skb,
					      int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
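/* Commentary (illustrative sketch, not from the original file): the loop
 * above implements hash-threshold multipath selection (RFC 2992).  Each
 * sibling nexthop owns an upper bound in hash space proportional to its
 * weight, and the flow hash picks the first sibling whose nh_upper_bound
 * it does not exceed.  With two equal-weight nexthops over a 31-bit hash
 * space the bounds would look roughly like:
 *
 *	nexthop A: nh_upper_bound = 0x3fffffff
 *	nexthop B: nh_upper_bound = 0x7fffffff
 *
 * so mp_hash values in [0, 0x3fffffff] stick to A and the rest go to B,
 * keeping all packets of one flow on one path.  (The bound values here
 * are hypothetical, for illustration only.)
 */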
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
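/* Commentary (not from the original file): the score is a small bitmap.
 * rt6_check_dev() contributes 2 when the route's device matches the
 * requested oif (or no oif was given), and with CONFIG_IPV6_ROUTER_PREF
 * the RFC 4191 router preference decoded from fib6_flags is shifted into
 * the bits above it, so preferred routers on a matching interface score
 * highest.  Negative return values are the RT6_NUD_FAIL_* codes from
 * rt6_check_neigh().
 */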
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
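/* Commentary (not from the original file): "round-robin" here means
 * fn->rr_ptr rotates among equal-metric siblings.  When find_rr_leaf()
 * reports do_rr (a router answered RT6_NUD_FAIL_DO_RR, i.e. its
 * reachability is unknown), the starting point for the next lookup is
 * advanced so unverified routers are tried in turn rather than one
 * being used exclusively.
 */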
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
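/* Commentary (illustrative sketch, not from the original file or a real
 * capture): a Route Information Option (RFC 4191) advertising
 * 2001:db8::/64 with medium preference and a 3600 s lifetime would carry
 * length = 2 (in 8-byte units), prefix_len = 64, route_pref = 0 and
 * lifetime = htonl(3600); the handler above would then install (or
 * refresh) an RTF_ROUTEINFO route expiring at jiffies + 3600 * HZ.
 */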
/*
 *	Misc support functions
 */

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
	case RTN_ANYCAST:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
/* called with rcu_read_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
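/* Usage sketch (commentary added for clarity; the calling code is
 * hypothetical, but rt6_lookup() and ip6_rt_put() are the real APIs):
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *	if (rt) {
 *		... use rt->dst ...
 *		ip6_rt_put(rt);	// drop the reference taken by the lookup
 *	}
 *
 * Unlike ip6_route_lookup(), rt6_lookup() returns NULL on error instead
 * of a dst carrying an error code, and the caller owns one reference on
 * success.
 */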
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
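/* Commentary (not from the original file), a sketch of the per-cpu cache
 * lifecycle: the first lookup on a CPU finds *this_cpu_ptr(rt->rt6i_pcpu)
 * == NULL, allocates a clone and publishes it with cmpxchg(p, NULL,
 * pcpu_rt).  Writers for a given slot only run on that CPU, so the
 * cmpxchg is expected to succeed (hence the BUG_ON above); later lookups
 * on the same CPU just take a reference on the cached clone.
 */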
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif

	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
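/* Commentary (not from the original file): exceptions are the RTF_CACHE
 * clones hanging off a fib6_info; learned PMTU and redirect state lives
 * here rather than in the FIB tree proper.  Bumping fn->fn_sernum above
 * invalidates the cookies of all cached dsts derived from this route,
 * forcing ip6_dst_check() to re-validate them and pick up the new
 * exception.
 */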
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
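/* Worked example (commentary added for clarity, values hypothetical):
 * with local mtu6 = 1500 and a cached route PMTU of 1500 (the path
 * minimum was the local link), lowering the device to 1400 is allowed
 * (1500 >= 1400), and raising it to 9000 is also allowed because the
 * cached PMTU equals the old local MTU, so discovery can rediscover any
 * lower remote limit.  If the cached PMTU were 1280 (learned from a
 * remote hop), raising the device MTU would not touch it.
 */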
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
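/* Commentary (not from the original file): the policy switch above is
 * driven by the net.ipv6.fib_multipath_hash_policy sysctl.  Policy 0
 * hashes on L3 fields (addresses, flow label, protocol, and the inner
 * header of ICMPv6 errors so errors follow the flow they report on);
 * policy 1 hashes on the L4 five-tuple.  The result is shifted right by
 * one so that a return value of zero can mean "hash not yet computed"
 * in fl6->mp_hash, as tested in rt6_multipath_select().
 */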
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	bool from_set;

	rcu_read_lock();
	from_set = !!rcu_dereference(rt->from);
	rcu_read_unlock();

	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || from_set);
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2551 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2553 struct net_device *dev = dst->dev;
2554 unsigned int mtu = dst_mtu(dst);
2555 struct net *net = dev_net(dev);
2557 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2559 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2560 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2562 /*
2563 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2564 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2565 * IPV6_MAXPLEN is also valid and means: "any MSS,
2566 * rely only on pmtu discovery"
2567 */
2568 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
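/* Editor's note: a minimal sketch (not part of this file) of the clamping
 * ip6_default_advmss() performs above; the helper name and parameters are
 * hypothetical. The advertised MSS is the path MTU minus the fixed IPv6 and
 * TCP header sizes, floored at the ip6_rt_min_advmss sysctl and capped so
 * that IPV6_MAXPLEN keeps its special "any MSS" meaning:
 *
 *	static unsigned int advmss_sketch(unsigned int mtu,
 *					  unsigned int min_advmss)
 *	{
 *		mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
 *		if (mtu < min_advmss)
 *			mtu = min_advmss;
 *		if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
 *			mtu = IPV6_MAXPLEN;
 *		return mtu;
 *	}
 *
 * For a standard 1500-byte Ethernet MTU this yields 1500 - 40 - 20 = 1440.
 */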
2573 static unsigned int ip6_mtu(const struct dst_entry *dst)
2575 struct inet6_dev *idev;
2578 mtu = dst_metric_raw(dst, RTAX_MTU);
2585 idev = __in6_dev_get(dst->dev);
2587 mtu = idev->cnf.mtu6;
2591 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2593 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2596 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2599 struct dst_entry *dst;
2600 struct rt6_info *rt;
2601 struct inet6_dev *idev = in6_dev_get(dev);
2602 struct net *net = dev_net(dev);
2604 if (unlikely(!idev))
2605 return ERR_PTR(-ENODEV);
2607 rt = ip6_dst_alloc(net, dev, 0);
2608 if (unlikely(!rt)) {
2610 dst = ERR_PTR(-ENOMEM);
2614 rt->dst.flags |= DST_HOST;
2615 rt->dst.input = ip6_input;
2616 rt->dst.output = ip6_output;
2617 rt->rt6i_gateway = fl6->daddr;
2618 rt->rt6i_dst.addr = fl6->daddr;
2619 rt->rt6i_dst.plen = 128;
2620 rt->rt6i_idev = idev;
2621 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2623 /* Add this dst into uncached_list so that rt6_disable_ip() can
2624 * do proper release of the net_device
2625 */
2626 rt6_uncached_list_add(rt);
2627 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2629 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2635 static int ip6_dst_gc(struct dst_ops *ops)
2637 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2638 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2639 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2640 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2641 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2642 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2645 entries = dst_entries_get_fast(ops);
2646 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2647 entries <= rt_max_size)
2650 net->ipv6.ip6_rt_gc_expire++;
2651 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2652 entries = dst_entries_get_slow(ops);
2653 if (entries < ops->gc_thresh)
2654 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2656 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2657 return entries > rt_max_size;
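/* Editor's note: an isolated sketch (not part of this file) of the adaptive
 * GC aggressiveness maintained by ip6_dst_gc() above; all names here are
 * hypothetical. ip6_rt_gc_expire grows by one on every GC pass, snaps back
 * to half the GC timeout once the table shrinks below gc_thresh, and
 * otherwise decays geometrically:
 *
 *	static void gc_expire_sketch(unsigned long *expire, int entries,
 *				     int gc_thresh, int gc_timeout,
 *				     int elasticity)
 *	{
 *		(*expire)++;				/* pressure builds */
 *		if (entries < gc_thresh)
 *			*expire = gc_timeout >> 1;	/* relax quickly */
 *		*expire -= *expire >> elasticity;	/* slow decay */
 *	}
 *
 * With the default elasticity of 9 each pass bleeds off only expire/512, so
 * sustained pressure keeps collection aggressive.
 */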
2660 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2661 struct fib6_config *cfg)
2663 struct dst_metrics *p;
2668 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2672 refcount_set(&p->refcnt, 1);
2673 rt->fib6_metrics = p;
2675 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2678 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2679 struct fib6_config *cfg,
2680 const struct in6_addr *gw_addr,
2681 u32 tbid, int flags)
2683 struct flowi6 fl6 = {
2684 .flowi6_oif = cfg->fc_ifindex,
2686 .saddr = cfg->fc_prefsrc,
2688 struct fib6_table *table;
2689 struct rt6_info *rt;
2691 table = fib6_get_table(net, tbid);
2695 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2696 flags |= RT6_LOOKUP_F_HAS_SADDR;
2698 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2699 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2701 /* if table lookup failed, fall back to full lookup */
2702 if (rt == net->ipv6.ip6_null_entry) {
2710 static int ip6_route_check_nh_onlink(struct net *net,
2711 struct fib6_config *cfg,
2712 const struct net_device *dev,
2713 struct netlink_ext_ack *extack)
2715 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2716 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2717 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2718 struct rt6_info *grt;
2722 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2724 if (!grt->dst.error &&
2725 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2726 NL_SET_ERR_MSG(extack,
2727 "Nexthop has invalid gateway or device mismatch");
2737 static int ip6_route_check_nh(struct net *net,
2738 struct fib6_config *cfg,
2739 struct net_device **_dev,
2740 struct inet6_dev **idev)
2742 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2743 struct net_device *dev = _dev ? *_dev : NULL;
2744 struct rt6_info *grt = NULL;
2745 int err = -EHOSTUNREACH;
2747 if (cfg->fc_table) {
2748 int flags = RT6_LOOKUP_F_IFACE;
2750 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2751 cfg->fc_table, flags);
2753 if (grt->rt6i_flags & RTF_GATEWAY ||
2754 (dev && dev != grt->dst.dev)) {
2762 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2768 if (dev != grt->dst.dev) {
2773 *_dev = dev = grt->dst.dev;
2774 *idev = grt->rt6i_idev;
2776 in6_dev_hold(grt->rt6i_idev);
2779 if (!(grt->rt6i_flags & RTF_GATEWAY))
2788 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2789 struct net_device **_dev, struct inet6_dev **idev,
2790 struct netlink_ext_ack *extack)
2792 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2793 int gwa_type = ipv6_addr_type(gw_addr);
2794 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2795 const struct net_device *dev = *_dev;
2796 bool need_addr_check = !dev;
2799 /* if gw_addr is local we will fail to detect this in case
2800 * the address is still TENTATIVE (DAD in progress). rt6_lookup()
2801 * will return the already-added prefix route via the interface that
2802 * the prefix route was assigned to, which might be non-loopback.
2803 */
2804 if (dev &&
2805 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2806 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2810 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2811 /* IPv6 strictly inhibits using non-link-local
2812 * addresses as the nexthop address.
2813 * Otherwise, the router will not be able to send redirects.
2814 * It is very good, but in some (rare!) circumstances
2815 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2816 * some exceptions. --ANK
2817 * We allow IPv4-mapped nexthops to support RFC4798-type
2818 * addressing.
2819 */
2820 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2821 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2825 if (cfg->fc_flags & RTNH_F_ONLINK)
2826 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2828 err = ip6_route_check_nh(net, cfg, _dev, idev);
2834 /* reload in case device was changed */
2839 NL_SET_ERR_MSG(extack, "Egress device not specified");
2841 } else if (dev->flags & IFF_LOOPBACK) {
2842 NL_SET_ERR_MSG(extack,
2843 "Egress device can not be loopback device for this route");
2847 /* if we did not check gw_addr above, do so now that the
2848 * egress device has been resolved.
2849 */
2850 if (need_addr_check &&
2851 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2852 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
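/* Editor's note: two illustrative requests (not part of this file) that
 * exercise the validation above, written as hypothetical iproute2 commands.
 * A link-local gateway is accepted as-is; a global gateway that is not
 * reachable through an existing route needs the onlink flag:
 *
 *	ip -6 route add 2001:db8:1::/64 via fe80::1 dev eth0
 *	ip -6 route add 2001:db8:1::/64 via 2001:db8::1 dev eth0 onlink
 *
 * The second form sets RTNH_F_ONLINK, so the gateway is checked by
 * ip6_route_check_nh_onlink() rather than ip6_route_check_nh().
 */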
2861 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2863 struct netlink_ext_ack *extack)
2865 struct net *net = cfg->fc_nlinfo.nl_net;
2866 struct fib6_info *rt = NULL;
2867 struct net_device *dev = NULL;
2868 struct inet6_dev *idev = NULL;
2869 struct fib6_table *table;
2873 /* RTF_PCPU is an internal flag; cannot be set by userspace */
2874 if (cfg->fc_flags & RTF_PCPU) {
2875 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2879 /* RTF_CACHE is an internal flag; cannot be set by userspace */
2880 if (cfg->fc_flags & RTF_CACHE) {
2881 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2885 if (cfg->fc_type > RTN_MAX) {
2886 NL_SET_ERR_MSG(extack, "Invalid route type");
2890 if (cfg->fc_dst_len > 128) {
2891 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2894 if (cfg->fc_src_len > 128) {
2895 NL_SET_ERR_MSG(extack, "Invalid source address length");
2898 #ifndef CONFIG_IPV6_SUBTREES
2899 if (cfg->fc_src_len) {
2900 NL_SET_ERR_MSG(extack,
2901 "Specifying source address requires IPV6_SUBTREES to be enabled");
2905 if (cfg->fc_ifindex) {
2907 dev = dev_get_by_index(net, cfg->fc_ifindex);
2910 idev = in6_dev_get(dev);
2915 if (cfg->fc_metric == 0)
2916 cfg->fc_metric = IP6_RT_PRIO_USER;
2918 if (cfg->fc_flags & RTNH_F_ONLINK) {
2920 NL_SET_ERR_MSG(extack,
2921 "Nexthop device required for onlink");
2926 if (!(dev->flags & IFF_UP)) {
2927 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2934 if (cfg->fc_nlinfo.nlh &&
2935 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2936 table = fib6_get_table(net, cfg->fc_table);
2938 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2939 table = fib6_new_table(net, cfg->fc_table);
2942 table = fib6_new_table(net, cfg->fc_table);
2949 rt = fib6_info_alloc(gfp_flags);
2953 if (cfg->fc_flags & RTF_ADDRCONF)
2954 rt->dst_nocount = true;
2956 err = ip6_convert_metrics(net, rt, cfg);
2960 if (cfg->fc_flags & RTF_EXPIRES)
2961 fib6_set_expires(rt, jiffies +
2962 clock_t_to_jiffies(cfg->fc_expires));
2964 fib6_clean_expires(rt);
2966 if (cfg->fc_protocol == RTPROT_UNSPEC)
2967 cfg->fc_protocol = RTPROT_BOOT;
2968 rt->fib6_protocol = cfg->fc_protocol;
2970 addr_type = ipv6_addr_type(&cfg->fc_dst);
2972 if (cfg->fc_encap) {
2973 struct lwtunnel_state *lwtstate;
2975 err = lwtunnel_build_state(cfg->fc_encap_type,
2976 cfg->fc_encap, AF_INET6, cfg,
2980 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2983 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2984 rt->fib6_dst.plen = cfg->fc_dst_len;
2985 if (rt->fib6_dst.plen == 128)
2986 rt->dst_host = true;
2988 #ifdef CONFIG_IPV6_SUBTREES
2989 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
2990 rt->fib6_src.plen = cfg->fc_src_len;
2991 #endif
2993 rt->fib6_metric = cfg->fc_metric;
2994 rt->fib6_nh.nh_weight = 1;
2996 rt->fib6_type = cfg->fc_type;
2998 /* We cannot add true routes via loopback here, as
2999 they would result in kernel looping; promote them to reject routes
3000 */
3001 if ((cfg->fc_flags & RTF_REJECT) ||
3002 (dev && (dev->flags & IFF_LOOPBACK) &&
3003 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3004 !(cfg->fc_flags & RTF_LOCAL))) {
3005 /* hold loopback dev/idev if we haven't done so. */
3006 if (dev != net->loopback_dev) {
3011 dev = net->loopback_dev;
3013 idev = in6_dev_get(dev);
3019 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3023 if (cfg->fc_flags & RTF_GATEWAY) {
3024 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3028 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3035 if (idev->cnf.disable_ipv6) {
3036 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3041 if (!(dev->flags & IFF_UP)) {
3042 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3047 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3048 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3049 NL_SET_ERR_MSG(extack, "Invalid source address");
3053 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3054 rt->fib6_prefsrc.plen = 128;
3056 rt->fib6_prefsrc.plen = 0;
3058 rt->fib6_flags = cfg->fc_flags;
3061 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3062 !netif_carrier_ok(dev))
3063 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3064 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3065 rt->fib6_nh.nh_dev = dev;
3066 rt->fib6_table = table;
3068 cfg->fc_nlinfo.nl_net = dev_net(dev);
3080 fib6_info_release(rt);
3081 return ERR_PTR(err);
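/* Editor's note: a minimal sketch (not part of this file) of the kind of
 * fib6_config an in-kernel caller could hand to ip6_route_add(); the
 * prefix, device and table values are hypothetical. It mirrors the pattern
 * used by rt6_add_route_info() and rt6_add_dflt_router() below:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst_len	= 64,
 *		.fc_flags	= RTF_UP,
 *		.fc_protocol	= RTPROT_BOOT,
 *		.fc_type	= RTN_UNICAST,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *
 *	cfg.fc_dst = prefix;
 *	err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 */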
3084 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3085 struct netlink_ext_ack *extack)
3087 struct fib6_info *rt;
3090 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3094 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3095 fib6_info_release(rt);
3100 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3102 struct net *net = info->nl_net;
3103 struct fib6_table *table;
3106 if (rt == net->ipv6.fib6_null_entry) {
3111 table = rt->fib6_table;
3112 spin_lock_bh(&table->tb6_lock);
3113 err = fib6_del(rt, info);
3114 spin_unlock_bh(&table->tb6_lock);
3117 fib6_info_release(rt);
3121 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3123 struct nl_info info = { .nl_net = net };
3125 return __ip6_del_rt(rt, &info);
3128 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3130 struct nl_info *info = &cfg->fc_nlinfo;
3131 struct net *net = info->nl_net;
3132 struct sk_buff *skb = NULL;
3133 struct fib6_table *table;
3136 if (rt == net->ipv6.fib6_null_entry)
3138 table = rt->fib6_table;
3139 spin_lock_bh(&table->tb6_lock);
3141 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3142 struct fib6_info *sibling, *next_sibling;
3144 /* prefer to send a single notification with all hops */
3145 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3147 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3149 if (rt6_fill_node(net, skb, rt, NULL,
3150 NULL, NULL, 0, RTM_DELROUTE,
3151 info->portid, seq, 0) < 0) {
3155 info->skip_notify = 1;
3158 list_for_each_entry_safe(sibling, next_sibling,
3161 err = fib6_del(sibling, info);
3167 err = fib6_del(rt, info);
3169 spin_unlock_bh(&table->tb6_lock);
3171 fib6_info_release(rt);
3174 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3175 info->nlh, gfp_any());
3180 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3184 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3187 if (cfg->fc_flags & RTF_GATEWAY &&
3188 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3190 if (dst_hold_safe(&rt->dst))
3191 rc = rt6_remove_exception_rt(rt);
3196 static int ip6_route_del(struct fib6_config *cfg,
3197 struct netlink_ext_ack *extack)
3199 struct rt6_info *rt_cache;
3200 struct fib6_table *table;
3201 struct fib6_info *rt;
3202 struct fib6_node *fn;
3205 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3207 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3213 fn = fib6_locate(&table->tb6_root,
3214 &cfg->fc_dst, cfg->fc_dst_len,
3215 &cfg->fc_src, cfg->fc_src_len,
3216 !(cfg->fc_flags & RTF_CACHE));
3219 for_each_fib6_node_rt_rcu(fn) {
3220 if (cfg->fc_flags & RTF_CACHE) {
3223 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3226 rc = ip6_del_cached_rt(rt_cache, cfg);
3234 if (cfg->fc_ifindex &&
3235 (!rt->fib6_nh.nh_dev ||
3236 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3238 if (cfg->fc_flags & RTF_GATEWAY &&
3239 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3241 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3243 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3248 /* if a gateway was specified, only delete that one hop */
3249 if (cfg->fc_flags & RTF_GATEWAY)
3250 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3252 return __ip6_del_rt_siblings(rt, cfg);
3260 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3262 struct netevent_redirect netevent;
3263 struct rt6_info *rt, *nrt = NULL;
3264 struct ndisc_options ndopts;
3265 struct inet6_dev *in6_dev;
3266 struct neighbour *neigh;
3267 struct fib6_info *from;
3269 int optlen, on_link;
3272 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3273 optlen -= sizeof(*msg);
3276 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3280 msg = (struct rd_msg *)icmp6_hdr(skb);
3282 if (ipv6_addr_is_multicast(&msg->dest)) {
3283 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3288 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3290 } else if (ipv6_addr_type(&msg->target) !=
3291 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3292 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3296 in6_dev = __in6_dev_get(skb->dev);
3299 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3302 /* RFC2461 8.1:
3303 * The IP source address of the Redirect MUST be the same as the current
3304 * first-hop router for the specified ICMP Destination Address.
3305 */
3307 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3308 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3313 if (ndopts.nd_opts_tgt_lladdr) {
3314 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3317 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3322 rt = (struct rt6_info *) dst;
3323 if (rt->rt6i_flags & RTF_REJECT) {
3324 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3328 /* Redirect received -> path was valid.
3329 * Look, redirects are sent only in response to data packets,
3330 * so that this nexthop apparently is reachable. --ANK
3331 */
3332 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3334 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3339 /* We have finally decided to accept it. */
3342 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3343 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3344 NEIGH_UPDATE_F_OVERRIDE|
3345 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3346 NEIGH_UPDATE_F_ISROUTER)),
3347 NDISC_REDIRECT, &ndopts);
3350 from = rcu_dereference(rt->from);
3351 fib6_info_hold(from);
3354 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3358 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3360 nrt->rt6i_flags &= ~RTF_GATEWAY;
3362 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3364 /* No need to remove rt from the exception table if rt is
3365 * a cached route because rt6_insert_exception() will
3366 * take care of it.
3367 */
3368 if (rt6_insert_exception(nrt, from)) {
3369 dst_release_immediate(&nrt->dst);
3373 netevent.old = &rt->dst;
3374 netevent.new = &nrt->dst;
3375 netevent.daddr = &msg->dest;
3376 netevent.neigh = neigh;
3377 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3380 fib6_info_release(from);
3381 neigh_release(neigh);
3384 #ifdef CONFIG_IPV6_ROUTE_INFO
3385 static struct fib6_info *rt6_get_route_info(struct net *net,
3386 const struct in6_addr *prefix, int prefixlen,
3387 const struct in6_addr *gwaddr,
3388 struct net_device *dev)
3390 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3391 int ifindex = dev->ifindex;
3392 struct fib6_node *fn;
3393 struct fib6_info *rt = NULL;
3394 struct fib6_table *table;
3396 table = fib6_get_table(net, tb_id);
3401 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3405 for_each_fib6_node_rt_rcu(fn) {
3406 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3408 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3410 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3420 static struct fib6_info *rt6_add_route_info(struct net *net,
3421 const struct in6_addr *prefix, int prefixlen,
3422 const struct in6_addr *gwaddr,
3423 struct net_device *dev,
3426 struct fib6_config cfg = {
3427 .fc_metric = IP6_RT_PRIO_USER,
3428 .fc_ifindex = dev->ifindex,
3429 .fc_dst_len = prefixlen,
3430 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3431 RTF_UP | RTF_PREF(pref),
3432 .fc_protocol = RTPROT_RA,
3433 .fc_type = RTN_UNICAST,
3434 .fc_nlinfo.portid = 0,
3435 .fc_nlinfo.nlh = NULL,
3436 .fc_nlinfo.nl_net = net,
3439 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3440 cfg.fc_dst = *prefix;
3441 cfg.fc_gateway = *gwaddr;
3443 /* We should treat it as a default route if prefix length is 0. */
3445 cfg.fc_flags |= RTF_DEFAULT;
3447 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3449 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3450 }
3451 #endif
3453 struct fib6_info *rt6_get_dflt_router(struct net *net,
3454 const struct in6_addr *addr,
3455 struct net_device *dev)
3457 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3458 struct fib6_info *rt;
3459 struct fib6_table *table;
3461 table = fib6_get_table(net, tb_id);
3466 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3467 if (dev == rt->fib6_nh.nh_dev &&
3468 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3469 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3478 struct fib6_info *rt6_add_dflt_router(struct net *net,
3479 const struct in6_addr *gwaddr,
3480 struct net_device *dev,
3483 struct fib6_config cfg = {
3484 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3485 .fc_metric = IP6_RT_PRIO_USER,
3486 .fc_ifindex = dev->ifindex,
3487 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3488 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3489 .fc_protocol = RTPROT_RA,
3490 .fc_type = RTN_UNICAST,
3491 .fc_nlinfo.portid = 0,
3492 .fc_nlinfo.nlh = NULL,
3493 .fc_nlinfo.nl_net = net,
3496 cfg.fc_gateway = *gwaddr;
3498 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3499 struct fib6_table *table;
3501 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3503 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3506 return rt6_get_dflt_router(net, gwaddr, dev);
3509 static void __rt6_purge_dflt_routers(struct net *net,
3510 struct fib6_table *table)
3512 struct fib6_info *rt;
3516 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3517 struct net_device *dev = fib6_info_nh_dev(rt);
3518 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3520 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3521 (!idev || idev->cnf.accept_ra != 2)) {
3524 ip6_del_rt(net, rt);
3530 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3533 void rt6_purge_dflt_routers(struct net *net)
3535 struct fib6_table *table;
3536 struct hlist_head *head;
3541 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3542 head = &net->ipv6.fib_table_hash[h];
3543 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3544 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3545 __rt6_purge_dflt_routers(net, table);
3552 static void rtmsg_to_fib6_config(struct net *net,
3553 struct in6_rtmsg *rtmsg,
3554 struct fib6_config *cfg)
3556 memset(cfg, 0, sizeof(*cfg));
3558 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3560 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3561 cfg->fc_metric = rtmsg->rtmsg_metric;
3562 cfg->fc_expires = rtmsg->rtmsg_info;
3563 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3564 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3565 cfg->fc_flags = rtmsg->rtmsg_flags;
3566 cfg->fc_type = rtmsg->rtmsg_type;
3568 cfg->fc_nlinfo.nl_net = net;
3570 cfg->fc_dst = rtmsg->rtmsg_dst;
3571 cfg->fc_src = rtmsg->rtmsg_src;
3572 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3575 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3577 struct fib6_config cfg;
3578 struct in6_rtmsg rtmsg;
3582 case SIOCADDRT: /* Add a route */
3583 case SIOCDELRT: /* Delete a route */
3584 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3586 err = copy_from_user(&rtmsg, arg,
3587 sizeof(struct in6_rtmsg));
3591 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3596 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3599 err = ip6_route_del(&cfg, NULL);
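/* Editor's note: a hypothetical userspace sketch (not part of this file) of
 * the legacy ioctl path handled above; error handling is omitted and the
 * interface name is illustrative. The same call with SIOCDELRT deletes the
 * route; both require CAP_NET_ADMIN:
 *
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *	#include <net/route.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *
 *	int add_route_sketch(void)
 *	{
 *		int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *		struct in6_rtmsg rt;
 *
 *		memset(&rt, 0, sizeof(rt));
 *		inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *		rt.rtmsg_dst_len = 32;
 *		rt.rtmsg_flags = RTF_UP;
 *		rt.rtmsg_metric = 1;
 *		rt.rtmsg_ifindex = if_nametoindex("eth0");
 *		return ioctl(fd, SIOCADDRT, &rt);
 *	}
 */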
3613 /* Drop the packet on the floor */
3616 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3619 struct dst_entry *dst = skb_dst(skb);
3620 switch (ipstats_mib_noroutes) {
3621 case IPSTATS_MIB_INNOROUTES:
3622 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3623 if (type == IPV6_ADDR_ANY) {
3624 IP6_INC_STATS(dev_net(dst->dev),
3625 __in6_dev_get_safely(skb->dev),
3626 IPSTATS_MIB_INADDRERRORS);
3630 case IPSTATS_MIB_OUTNOROUTES:
3631 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3632 ipstats_mib_noroutes);
3635 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3640 static int ip6_pkt_discard(struct sk_buff *skb)
3642 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3645 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3647 skb->dev = skb_dst(skb)->dev;
3648 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3651 static int ip6_pkt_prohibit(struct sk_buff *skb)
3653 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3656 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3658 skb->dev = skb_dst(skb)->dev;
3659 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3663 /* Allocate a dst for local (unicast / anycast) address. */
3666 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3667 struct inet6_dev *idev,
3668 const struct in6_addr *addr,
3669 bool anycast, gfp_t gfp_flags)
3672 struct net_device *dev = idev->dev;
3673 struct fib6_info *f6i;
3675 f6i = fib6_info_alloc(gfp_flags);
3677 return ERR_PTR(-ENOMEM);
3679 f6i->dst_nocount = true;
3680 f6i->dst_host = true;
3681 f6i->fib6_protocol = RTPROT_KERNEL;
3682 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3684 f6i->fib6_type = RTN_ANYCAST;
3685 f6i->fib6_flags |= RTF_ANYCAST;
3687 f6i->fib6_type = RTN_LOCAL;
3688 f6i->fib6_flags |= RTF_LOCAL;
3691 f6i->fib6_nh.nh_gw = *addr;
3693 f6i->fib6_nh.nh_dev = dev;
3694 f6i->fib6_dst.addr = *addr;
3695 f6i->fib6_dst.plen = 128;
3696 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3697 f6i->fib6_table = fib6_get_table(net, tb_id);
3702 /* remove deleted ip from prefsrc entries */
3703 struct arg_dev_net_ip {
3704 struct net_device *dev;
3706 struct in6_addr *addr;
3709 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3711 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3712 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3713 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3715 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3716 rt != net->ipv6.fib6_null_entry &&
3717 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3718 spin_lock_bh(&rt6_exception_lock);
3719 /* remove prefsrc entry */
3720 rt->fib6_prefsrc.plen = 0;
3721 /* need to update cache as well */
3722 rt6_exceptions_remove_prefsrc(rt);
3723 spin_unlock_bh(&rt6_exception_lock);
3728 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3730 struct net *net = dev_net(ifp->idev->dev);
3731 struct arg_dev_net_ip adni = {
3732 .dev = ifp->idev->dev,
3736 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3739 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3741 /* Remove routers and update dst entries when a gateway turns into a host. */
3742 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3744 struct in6_addr *gateway = (struct in6_addr *)arg;
3746 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3747 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3751 /* Further clean up cached routes in exception table.
3752 * This is needed because a cached route may have a different
3753 * gateway than its 'parent' in the case of an IP redirect.
3754 */
3755 rt6_exceptions_clean_tohost(rt, gateway);
3760 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3762 fib6_clean_all(net, fib6_clean_tohost, gateway);
3765 struct arg_netdev_event {
3766 const struct net_device *dev;
3768 unsigned int nh_flags;
3769 unsigned long event;
3773 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3775 struct fib6_info *iter;
3776 struct fib6_node *fn;
3778 fn = rcu_dereference_protected(rt->fib6_node,
3779 lockdep_is_held(&rt->fib6_table->tb6_lock));
3780 iter = rcu_dereference_protected(fn->leaf,
3781 lockdep_is_held(&rt->fib6_table->tb6_lock));
3783 if (iter->fib6_metric == rt->fib6_metric &&
3784 rt6_qualify_for_ecmp(iter))
3786 iter = rcu_dereference_protected(iter->fib6_next,
3787 lockdep_is_held(&rt->fib6_table->tb6_lock));
3793 static bool rt6_is_dead(const struct fib6_info *rt)
3795 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3796 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3797 fib6_ignore_linkdown(rt)))
3803 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3805 struct fib6_info *iter;
3808 if (!rt6_is_dead(rt))
3809 total += rt->fib6_nh.nh_weight;
3811 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3812 if (!rt6_is_dead(iter))
3813 total += iter->fib6_nh.nh_weight;
3819 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3821 int upper_bound = -1;
3823 if (!rt6_is_dead(rt)) {
3824 *weight += rt->fib6_nh.nh_weight;
3825 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3828 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3831 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3833 struct fib6_info *iter;
3836 rt6_upper_bound_set(rt, &weight, total);
3838 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3839 rt6_upper_bound_set(iter, &weight, total);
3842 void rt6_multipath_rebalance(struct fib6_info *rt)
3844 struct fib6_info *first;
3847 /* In case the entire multipath route was marked for flushing,
3848 * then there is no need to rebalance upon the removal of every
3849 * sibling route.
3850 */
3851 if (!rt->fib6_nsiblings || rt->should_flush)
3854 /* During lookup routes are evaluated in order, so we need to
3855 * make sure upper bounds are assigned from the first sibling
3856 * onwards.
3857 */
3858 first = rt6_multipath_first_sibling(rt);
3859 if (WARN_ON_ONCE(!first))
3862 total = rt6_multipath_total_weight(first);
3863 rt6_multipath_upper_bound_set(first, total);
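/* Editor's note: an illustrative sketch (not part of this file) of how the
 * bounds written by rt6_multipath_upper_bound_set() partition the 31-bit
 * flow-hash space; the names are hypothetical. For sibling weights
 * {1, 2, 1} (total 4) the cumulative bounds land near 1/4, 3/4 and 4/4 of
 * 2^31, and a nexthop is selected as the first bound >= the flow hash:
 *
 *	static int select_nh_sketch(u32 hash, const int *upper_bound, int n)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++)
 *			if ((int)hash <= upper_bound[i])
 *				return i;
 *		return n - 1;
 *	}
 *
 * Dead nexthops keep the -1 bound assigned above, so they never match.
 */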
3866 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3868 const struct arg_netdev_event *arg = p_arg;
3869 struct net *net = dev_net(arg->dev);
3871 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3872 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3873 fib6_update_sernum_upto_root(net, rt);
3874 rt6_multipath_rebalance(rt);
3880 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3882 struct arg_netdev_event arg = {
3885 .nh_flags = nh_flags,
3889 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3890 arg.nh_flags |= RTNH_F_LINKDOWN;
3892 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3895 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3896 const struct net_device *dev)
3898 struct fib6_info *iter;
3900 if (rt->fib6_nh.nh_dev == dev)
3902 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3903 if (iter->fib6_nh.nh_dev == dev)
3909 static void rt6_multipath_flush(struct fib6_info *rt)
3911 struct fib6_info *iter;
3913 rt->should_flush = 1;
3914 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3915 iter->should_flush = 1;
3918 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3919 const struct net_device *down_dev)
3921 struct fib6_info *iter;
3922 unsigned int dead = 0;
3924 if (rt->fib6_nh.nh_dev == down_dev ||
3925 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3927 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3928 if (iter->fib6_nh.nh_dev == down_dev ||
3929 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3935 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3936 const struct net_device *dev,
3937 unsigned int nh_flags)
3939 struct fib6_info *iter;
3941 if (rt->fib6_nh.nh_dev == dev)
3942 rt->fib6_nh.nh_flags |= nh_flags;
3943 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3944 if (iter->fib6_nh.nh_dev == dev)
3945 iter->fib6_nh.nh_flags |= nh_flags;
3948 /* called with write lock held for table with rt */
3949 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3951 const struct arg_netdev_event *arg = p_arg;
3952 const struct net_device *dev = arg->dev;
3953 struct net *net = dev_net(dev);
3955 if (rt == net->ipv6.fib6_null_entry)
3958 switch (arg->event) {
3959 case NETDEV_UNREGISTER:
3960 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3962 if (rt->should_flush)
3964 if (!rt->fib6_nsiblings)
3965 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3966 if (rt6_multipath_uses_dev(rt, dev)) {
3969 count = rt6_multipath_dead_count(rt, dev);
3970 if (rt->fib6_nsiblings + 1 == count) {
3971 rt6_multipath_flush(rt);
3974 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3976 fib6_update_sernum(net, rt);
3977 rt6_multipath_rebalance(rt);
3981 if (rt->fib6_nh.nh_dev != dev ||
3982 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
3984 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3985 rt6_multipath_rebalance(rt);
3992 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3994 struct arg_netdev_event arg = {
4001 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4004 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4006 rt6_sync_down_dev(dev, event);
4007 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4008 neigh_ifdown(&nd_tbl, dev);
4011 struct rt6_mtu_change_arg {
4012 struct net_device *dev;
4016 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4018 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4019 struct inet6_dev *idev;
4021 /* In IPv6, PMTU discovery is not optional,
4022 so the RTAX_MTU lock cannot disable it.
4023 We still use this lock to block changes
4024 caused by addrconf/ndisc.
4025 */
4027 idev = __in6_dev_get(arg->dev);
4031 /* For an administrative MTU increase, there is no way to discover
4032 an IPv6 PMTU increase, so the PMTU must be updated here.
4033 Since RFC 1981 doesn't cover administrative MTU increases,
4034 updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
4035 */
4036 if (rt->fib6_nh.nh_dev == arg->dev &&
4037 !fib6_metric_locked(rt, RTAX_MTU)) {
4038 u32 mtu = rt->fib6_pmtu;
4040 if (mtu >= arg->mtu ||
4041 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4042 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4044 spin_lock_bh(&rt6_exception_lock);
4045 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4046 spin_unlock_bh(&rt6_exception_lock);
4051 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4053 struct rt6_mtu_change_arg arg = {
4058 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
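/* Editor's note: rt6_mtu_change() is driven from the addrconf netdev
 * notifier when a device MTU changes, e.g. (illustrative command):
 *
 *	ip link set dev eth0 mtu 1400
 *
 * which re-clamps RTAX_MTU on every affected, unlocked route via
 * rt6_mtu_change_route() above.
 */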
4061 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4062 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4063 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4064 [RTA_OIF] = { .type = NLA_U32 },
4065 [RTA_IIF] = { .type = NLA_U32 },
4066 [RTA_PRIORITY] = { .type = NLA_U32 },
4067 [RTA_METRICS] = { .type = NLA_NESTED },
4068 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4069 [RTA_PREF] = { .type = NLA_U8 },
4070 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4071 [RTA_ENCAP] = { .type = NLA_NESTED },
4072 [RTA_EXPIRES] = { .type = NLA_U32 },
4073 [RTA_UID] = { .type = NLA_U32 },
4074 [RTA_MARK] = { .type = NLA_U32 },
4075 [RTA_TABLE] = { .type = NLA_U32 },
4078 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4079 struct fib6_config *cfg,
4080 struct netlink_ext_ack *extack)
4083 struct nlattr *tb[RTA_MAX+1];
4087 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4093 rtm = nlmsg_data(nlh);
4094 memset(cfg, 0, sizeof(*cfg));
4096 cfg->fc_table = rtm->rtm_table;
4097 cfg->fc_dst_len = rtm->rtm_dst_len;
4098 cfg->fc_src_len = rtm->rtm_src_len;
4099 cfg->fc_flags = RTF_UP;
4100 cfg->fc_protocol = rtm->rtm_protocol;
4101 cfg->fc_type = rtm->rtm_type;
4103 if (rtm->rtm_type == RTN_UNREACHABLE ||
4104 rtm->rtm_type == RTN_BLACKHOLE ||
4105 rtm->rtm_type == RTN_PROHIBIT ||
4106 rtm->rtm_type == RTN_THROW)
4107 cfg->fc_flags |= RTF_REJECT;
4109 if (rtm->rtm_type == RTN_LOCAL)
4110 cfg->fc_flags |= RTF_LOCAL;
4112 if (rtm->rtm_flags & RTM_F_CLONED)
4113 cfg->fc_flags |= RTF_CACHE;
4115 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4117 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4118 cfg->fc_nlinfo.nlh = nlh;
4119 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4121 if (tb[RTA_GATEWAY]) {
4122 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4123 cfg->fc_flags |= RTF_GATEWAY;
4127 int plen = (rtm->rtm_dst_len + 7) >> 3;
4129 if (nla_len(tb[RTA_DST]) < plen)
4132 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4136 int plen = (rtm->rtm_src_len + 7) >> 3;
4138 if (nla_len(tb[RTA_SRC]) < plen)
4141 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4144 if (tb[RTA_PREFSRC])
4145 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4148 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4150 if (tb[RTA_PRIORITY])
4151 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4153 if (tb[RTA_METRICS]) {
4154 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4155 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4159 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4161 if (tb[RTA_MULTIPATH]) {
4162 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4163 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4165 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4166 cfg->fc_mp_len, extack);
4172 pref = nla_get_u8(tb[RTA_PREF]);
4173 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4174 pref != ICMPV6_ROUTER_PREF_HIGH)
4175 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4176 cfg->fc_flags |= RTF_PREF(pref);
4180 cfg->fc_encap = tb[RTA_ENCAP];
4182 if (tb[RTA_ENCAP_TYPE]) {
4183 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4185 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4190 if (tb[RTA_EXPIRES]) {
4191 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4193 if (addrconf_finite_timeout(timeout)) {
4194 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4195 cfg->fc_flags |= RTF_EXPIRES;
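/* Editor's note: an illustrative mapping (not part of this file) from
 * iproute2 syntax to the attributes parsed above; the values are
 * hypothetical:
 *
 *	ip -6 route add 2001:db8::/64 dev eth0 metric 1024 expires 60 pref high
 *
 * carries RTA_DST (with rtm_dst_len), RTA_OIF, RTA_PRIORITY, RTA_EXPIRES
 * and RTA_PREF respectively, which rtm_to_fib6_config() folds into the
 * fib6_config consumed by inet6_rtm_newroute() below.
 */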
4205 struct fib6_info *fib6_info;
4206 struct fib6_config r_cfg;
4207 struct list_head next;
4210 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4214 list_for_each_entry(nh, rt6_nh_list, next) {
4215 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4216 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4217 nh->r_cfg.fc_ifindex);
4221 static int ip6_route_info_append(struct net *net,
4222 struct list_head *rt6_nh_list,
4223 struct fib6_info *rt,
4224 struct fib6_config *r_cfg)
4229 list_for_each_entry(nh, rt6_nh_list, next) {
4230 /* check if fib6_info already exists */
4231 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4235 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4239 err = ip6_convert_metrics(net, rt, r_cfg);
4244 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4245 list_add_tail(&nh->next, rt6_nh_list);
4250 static void ip6_route_mpath_notify(struct fib6_info *rt,
4251 struct fib6_info *rt_last,
4252 struct nl_info *info,
4255 /* if this is an APPEND route, then rt points to the first route
4256 * inserted and rt_last points to the last route inserted. Userspace
4257 * wants a consistent dump of the route which starts at the first
4258 * nexthop. Since sibling routes are always added at the end of
4259 * the list, find the first sibling of the last route appended.
4260 */
4261 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4262 rt = list_first_entry(&rt_last->fib6_siblings,
4268 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4271 static int ip6_route_multipath_add(struct fib6_config *cfg,
4272 struct netlink_ext_ack *extack)
4274 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4275 struct nl_info *info = &cfg->fc_nlinfo;
4276 struct fib6_config r_cfg;
4277 struct rtnexthop *rtnh;
4278 struct fib6_info *rt;
4279 struct rt6_nh *err_nh;
4280 struct rt6_nh *nh, *nh_safe;
4286 int replace = (cfg->fc_nlinfo.nlh &&
4287 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4288 LIST_HEAD(rt6_nh_list);
4290 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4291 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4292 nlflags |= NLM_F_APPEND;
4294 remaining = cfg->fc_mp_len;
4295 rtnh = (struct rtnexthop *)cfg->fc_mp;
4297 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4298 * fib6_info structs per nexthop
4299 */
4300 while (rtnh_ok(rtnh, remaining)) {
4301 memcpy(&r_cfg, cfg, sizeof(*cfg));
4302 if (rtnh->rtnh_ifindex)
4303 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4305 attrlen = rtnh_attrlen(rtnh);
4307 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4309 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4311 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4312 r_cfg.fc_flags |= RTF_GATEWAY;
4314 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4315 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4317 r_cfg.fc_encap_type = nla_get_u16(nla);
4320 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4321 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4328 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4330 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4333 fib6_info_release(rt);
4337 rtnh = rtnh_next(rtnh, &remaining);
4340 /* for add and replace send one notification with all nexthops.
4341 * Skip the notification in fib6_add_rt2node and send one with
4342 * the full route when done
4343 */
4344 info->skip_notify = 1;
4347 list_for_each_entry(nh, &rt6_nh_list, next) {
4348 rt_last = nh->fib6_info;
4349 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4350 fib6_info_release(nh->fib6_info);
4352 /* save reference to first route for notification */
4353 if (!rt_notif && !err)
4354 rt_notif = nh->fib6_info;
4356 /* nh->fib6_info is used or freed at this point, reset to NULL */
4357 nh->fib6_info = NULL;
4360 ip6_print_replace_route_err(&rt6_nh_list);
4365 /* Because each route is added like a single route we remove
4366 * these flags after the first nexthop: if there is a collision,
4367 * we have already failed to add the first nexthop:
4368 * fib6_add_rt2node() has rejected it; when replacing, old
4369 * nexthops have been replaced by the first new one, the rest should
4370 * be added.
4371 */
4372 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4377 /* success ... tell user about new route */
4378 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4382 /* send notification for routes that were added so that
4383 * the delete notifications sent by ip6_route_del are
4384 * coherent.
4385 */
4386 if (rt_notif)
4387 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4389 /* Delete routes that were already added */
4390 list_for_each_entry(nh, &rt6_nh_list, next) {
4393 ip6_route_del(&nh->r_cfg, extack);
4397 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4399 fib6_info_release(nh->fib6_info);
4400 list_del(&nh->next);
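/* Editor's note: an illustrative multipath request (not part of this file)
 * of the kind parsed by ip6_route_multipath_add() above; the addresses are
 * hypothetical:
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * Each rtnexthop carries rtnh_hops = weight - 1 on the wire, which the
 * parsing above undoes with nh_weight = rtnh_hops + 1.
 */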
4407 static int ip6_route_multipath_del(struct fib6_config *cfg,
4408 struct netlink_ext_ack *extack)
4410 struct fib6_config r_cfg;
4411 struct rtnexthop *rtnh;
4414 int err = 1, last_err = 0;
4416 remaining = cfg->fc_mp_len;
4417 rtnh = (struct rtnexthop *)cfg->fc_mp;
4419 /* Parse a Multipath Entry */
4420 while (rtnh_ok(rtnh, remaining)) {
4421 memcpy(&r_cfg, cfg, sizeof(*cfg));
4422 if (rtnh->rtnh_ifindex)
4423 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4425 attrlen = rtnh_attrlen(rtnh);
4427 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4429 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4431 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4432 r_cfg.fc_flags |= RTF_GATEWAY;
4435 err = ip6_route_del(&r_cfg, extack);
4439 rtnh = rtnh_next(rtnh, &remaining);
4445 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4446 struct netlink_ext_ack *extack)
4448 struct fib6_config cfg;
4451 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4456 return ip6_route_multipath_del(&cfg, extack);
4458 cfg.fc_delete_all_nh = 1;
4459 return ip6_route_del(&cfg, extack);
4463 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4464 struct netlink_ext_ack *extack)
4466 struct fib6_config cfg;
4469 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4474 return ip6_route_multipath_add(&cfg, extack);
4476 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4479 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4481 int nexthop_len = 0;
4483 if (rt->fib6_nsiblings) {
4484 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4485 + NLA_ALIGN(sizeof(struct rtnexthop))
4486 + nla_total_size(16) /* RTA_GATEWAY */
4487 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4489 nexthop_len *= rt->fib6_nsiblings;
4492 return NLMSG_ALIGN(sizeof(struct rtmsg))
4493 + nla_total_size(16) /* RTA_SRC */
4494 + nla_total_size(16) /* RTA_DST */
4495 + nla_total_size(16) /* RTA_GATEWAY */
4496 + nla_total_size(16) /* RTA_PREFSRC */
4497 + nla_total_size(4) /* RTA_TABLE */
4498 + nla_total_size(4) /* RTA_IIF */
4499 + nla_total_size(4) /* RTA_OIF */
4500 + nla_total_size(4) /* RTA_PRIORITY */
4501 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4502 + nla_total_size(sizeof(struct rta_cacheinfo))
4503 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4504 + nla_total_size(1) /* RTA_PREF */
4505 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) + nexthop_len;
4509 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4510 unsigned int *flags, bool skip_oif)
4512 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4513 *flags |= RTNH_F_DEAD;
4515 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4516 *flags |= RTNH_F_LINKDOWN;
4519 if (fib6_ignore_linkdown(rt))
4520 *flags |= RTNH_F_DEAD;
4524 if (rt->fib6_flags & RTF_GATEWAY) {
4525 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4526 goto nla_put_failure;
4529 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4530 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4531 *flags |= RTNH_F_OFFLOAD;
4533 /* not needed for multipath encoding because it has a rtnexthop struct */
4534 if (!skip_oif && rt->fib6_nh.nh_dev &&
4535 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4536 goto nla_put_failure;
4538 if (rt->fib6_nh.nh_lwtstate &&
4539 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4540 goto nla_put_failure;
4548 /* add multipath next hop */
4549 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4551 const struct net_device *dev = rt->fib6_nh.nh_dev;
4552 struct rtnexthop *rtnh;
4553 unsigned int flags = 0;
4555 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4557 goto nla_put_failure;
4559 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4560 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4562 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4563 goto nla_put_failure;
4565 rtnh->rtnh_flags = flags;
4567 /* length of rtnetlink header + attributes */
4568 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4576 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4577 struct fib6_info *rt, struct dst_entry *dst,
4578 struct in6_addr *dest, struct in6_addr *src,
4579 int iif, int type, u32 portid, u32 seq,
4583 struct nlmsghdr *nlh;
4588 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4592 rtm = nlmsg_data(nlh);
4593 rtm->rtm_family = AF_INET6;
4594 rtm->rtm_dst_len = rt->fib6_dst.plen;
4595 rtm->rtm_src_len = rt->fib6_src.plen;
4598 table = rt->fib6_table->tb6_id;
4600 table = RT6_TABLE_UNSPEC;
4601 rtm->rtm_table = table;
4602 if (nla_put_u32(skb, RTA_TABLE, table))
4603 goto nla_put_failure;
4605 rtm->rtm_type = rt->fib6_type;
4607 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4608 rtm->rtm_protocol = rt->fib6_protocol;
4610 if (rt->fib6_flags & RTF_CACHE)
4611 rtm->rtm_flags |= RTM_F_CLONED;
4614 if (nla_put_in6_addr(skb, RTA_DST, dest))
4615 goto nla_put_failure;
4616 rtm->rtm_dst_len = 128;
4617 } else if (rtm->rtm_dst_len)
4618 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4619 goto nla_put_failure;
4620 #ifdef CONFIG_IPV6_SUBTREES
4622 if (nla_put_in6_addr(skb, RTA_SRC, src))
4623 goto nla_put_failure;
4624 rtm->rtm_src_len = 128;
4625 } else if (rtm->rtm_src_len &&
4626 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4627 goto nla_put_failure;
4628 #endif
4629 if (iif) {
4630 #ifdef CONFIG_IPV6_MROUTE
4631 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4632 int err = ip6mr_get_route(net, skb, rtm, portid);
4634 if (err == 0)
4635 return 0;
4636 if (err < 0)
4637 goto nla_put_failure;
4638 } else
4639 #endif
4640 if (nla_put_u32(skb, RTA_IIF, iif))
4641 goto nla_put_failure;
4642 } else if (dest) {
4643 struct in6_addr saddr_buf;
4644 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4645 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4646 goto nla_put_failure;
4649 if (rt->fib6_prefsrc.plen) {
4650 struct in6_addr saddr_buf;
4651 saddr_buf = rt->fib6_prefsrc.addr;
4652 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4653 goto nla_put_failure;
4656 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4657 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4658 goto nla_put_failure;
4660 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4661 goto nla_put_failure;
4663 /* For multipath routes, walk the siblings list and add
4664 * each as a nexthop within RTA_MULTIPATH.
4665 */
4666 if (rt->fib6_nsiblings) {
4667 struct fib6_info *sibling, *next_sibling;
4670 mp = nla_nest_start(skb, RTA_MULTIPATH);
4672 goto nla_put_failure;
4674 if (rt6_add_nexthop(skb, rt) < 0)
4675 goto nla_put_failure;
4677 list_for_each_entry_safe(sibling, next_sibling,
4678 &rt->fib6_siblings, fib6_siblings) {
4679 if (rt6_add_nexthop(skb, sibling) < 0)
4680 goto nla_put_failure;
4683 nla_nest_end(skb, mp);
4685 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4686 goto nla_put_failure;
4689 if (rt->fib6_flags & RTF_EXPIRES) {
4690 expires = dst ? dst->expires : rt->expires;
4694 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4695 goto nla_put_failure;
4697 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4698 goto nla_put_failure;
4701 nlmsg_end(skb, nlh);
4705 nlmsg_cancel(skb, nlh);
4709 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4711 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4712 struct net *net = arg->net;
4714 if (rt == net->ipv6.fib6_null_entry)
4717 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4718 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4720 /* user wants prefix routes only */
4721 if (rtm->rtm_flags & RTM_F_PREFIX &&
4722 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4723 /* success since this is not a prefix route */
4728 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4729 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4730 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4733 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4734 struct netlink_ext_ack *extack)
4736 struct net *net = sock_net(in_skb->sk);
4737 struct nlattr *tb[RTA_MAX+1];
4738 int err, iif = 0, oif = 0;
4739 struct fib6_info *from;
4740 struct dst_entry *dst;
4741 struct rt6_info *rt;
4742 struct sk_buff *skb;
4747 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4753 memset(&fl6, 0, sizeof(fl6));
4754 rtm = nlmsg_data(nlh);
4755 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4756 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4759 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4762 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4766 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4769 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4773 iif = nla_get_u32(tb[RTA_IIF]);
4776 oif = nla_get_u32(tb[RTA_OIF]);
4779 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4782 fl6.flowi6_uid = make_kuid(current_user_ns(),
4783 nla_get_u32(tb[RTA_UID]));
4785 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4788 struct net_device *dev;
4793 dev = dev_get_by_index_rcu(net, iif);
4800 fl6.flowi6_iif = iif;
4802 if (!ipv6_addr_any(&fl6.saddr))
4803 flags |= RT6_LOOKUP_F_HAS_SADDR;
4805 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4809 fl6.flowi6_oif = oif;
4811 dst = ip6_route_output(net, NULL, &fl6);
4815 rt = container_of(dst, struct rt6_info, dst);
4816 if (rt->dst.error) {
4817 err = rt->dst.error;
4822 if (rt == net->ipv6.ip6_null_entry) {
4823 err = rt->dst.error;
4828 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4835 skb_dst_set(skb, &rt->dst);
4838 from = rcu_dereference(rt->from);
4841 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4842 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4845 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4846 &fl6.saddr, iif, RTM_NEWROUTE,
4847 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4856 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4861 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4862 unsigned int nlm_flags)
4864 struct sk_buff *skb;
4865 struct net *net = info->nl_net;
4870 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4872 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4876 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4877 event, info->portid, seq, nlm_flags);
4879 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4880 WARN_ON(err == -EMSGSIZE);
4884 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4885 info->nlh, gfp_any());
4889 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4892 static int ip6_route_dev_notify(struct notifier_block *this,
4893 unsigned long event, void *ptr)
4895 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4896 struct net *net = dev_net(dev);
4898 if (!(dev->flags & IFF_LOOPBACK))
4901 if (event == NETDEV_REGISTER) {
4902 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4903 net->ipv6.ip6_null_entry->dst.dev = dev;
4904 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4905 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4906 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4907 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4908 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4909 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4910 #endif
4911 } else if (event == NETDEV_UNREGISTER &&
4912 dev->reg_state != NETREG_UNREGISTERED) {
4913 /* NETDEV_UNREGISTER could be fired multiple times by
4914 * netdev_wait_allrefs(). Make sure we only call this once.
4915 */
4916 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4917 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4918 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4919 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4920 #endif
4930 #ifdef CONFIG_PROC_FS
4932 static const struct file_operations ipv6_route_proc_fops = {
4933 .open = ipv6_route_open,
4935 .llseek = seq_lseek,
4936 .release = seq_release_net,
4939 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4941 struct net *net = (struct net *)seq->private;
4942 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4943 net->ipv6.rt6_stats->fib_nodes,
4944 net->ipv6.rt6_stats->fib_route_nodes,
4945 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4946 net->ipv6.rt6_stats->fib_rt_entries,
4947 net->ipv6.rt6_stats->fib_rt_cache,
4948 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4949 net->ipv6.rt6_stats->fib_discarded_routes);
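/* Editor's note: the seven hex fields above appear in /proc/net/rt6_stats
 * in the order printed: fib_nodes, fib_route_nodes, fib_rt_alloc,
 * fib_rt_entries, fib_rt_cache, dst entries (slow count) and
 * fib_discarded_routes. A hypothetical reading:
 *
 *	$ cat /proc/net/rt6_stats
 *	0016 0049 002b 004e 0000 0002 002a
 */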
4954 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4956 return single_open_net(inode, file, rt6_stats_seq_show);
4959 static const struct file_operations rt6_stats_seq_fops = {
4960 .open = rt6_stats_seq_open,
4962 .llseek = seq_lseek,
4963 .release = single_release_net,
4965 #endif /* CONFIG_PROC_FS */
4967 #ifdef CONFIG_SYSCTL
4970 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4971 void __user *buffer, size_t *lenp, loff_t *ppos)
4978 net = (struct net *)ctl->extra1;
4979 delay = net->ipv6.sysctl.flush_delay;
4980 proc_dointvec(ctl, write, buffer, lenp, ppos);
4981 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4985 struct ctl_table ipv6_route_table_template[] = {
4987 .procname = "flush",
4988 .data = &init_net.ipv6.sysctl.flush_delay,
4989 .maxlen = sizeof(int),
4991 .proc_handler = ipv6_sysctl_rtcache_flush
4994 .procname = "gc_thresh",
4995 .data = &ip6_dst_ops_template.gc_thresh,
4996 .maxlen = sizeof(int),
4998 .proc_handler = proc_dointvec,
5001 .procname = "max_size",
5002 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
5003 .maxlen = sizeof(int),
5005 .proc_handler = proc_dointvec,
5008 .procname = "gc_min_interval",
5009 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5010 .maxlen = sizeof(int),
5012 .proc_handler = proc_dointvec_jiffies,
5015 .procname = "gc_timeout",
5016 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5017 .maxlen = sizeof(int),
5019 .proc_handler = proc_dointvec_jiffies,
5022 .procname = "gc_interval",
5023 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5024 .maxlen = sizeof(int),
5026 .proc_handler = proc_dointvec_jiffies,
5029 .procname = "gc_elasticity",
5030 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5031 .maxlen = sizeof(int),
5033 .proc_handler = proc_dointvec,
5036 .procname = "mtu_expires",
5037 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5038 .maxlen = sizeof(int),
5040 .proc_handler = proc_dointvec_jiffies,
5043 .procname = "min_adv_mss",
5044 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5045 .maxlen = sizeof(int),
5047 .proc_handler = proc_dointvec,
5050 .procname = "gc_min_interval_ms",
5051 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5052 .maxlen = sizeof(int),
5054 .proc_handler = proc_dointvec_ms_jiffies,
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif /* CONFIG_SYSCTL */
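/*
 * Pattern note: the template above is duplicated per network namespace
 * and each entry's .data pointer is repointed at the namespace-private
 * copy of the value, so the indices must stay in sync with the
 * template's entry order. table[9] deliberately aliases the same
 * gc_min_interval storage as table[3], only with a different handler.
 */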
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
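/*
 * The unwind labels above run in reverse allocation order: jumping to a
 * label frees everything allocated before the failure point and falls
 * through the remaining cleanups, finally returning -ENOMEM via the
 * shared "out" exit.
 */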
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
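/*
 * Plain kfree() is sufficient here because the special entries were
 * created with kmemdup() from static templates in ip6_route_net_init().
 */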
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
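/*
 * Netdevice notifiers run in descending priority order, so a priority
 * of ADDRCONF_NOTIFY_PRIORITY - 10 guarantees this callback runs after
 * addrconf has processed the same event.
 */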
void __init ip6_route_init_special_entries(void)
{
	/* The loopback device is registered before this code runs, so the
	 * loopback reference in rt6_info is not taken automatically;
	 * take it manually for init_net. */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
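/*
 * Each in6_dev_get() above takes a reference on loopback's inet6_dev
 * that these special entries hold for the lifetime of init_net.
 */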
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Set up the per-cpu lists that track uncached rt6_info entries */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;
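/*
 * Note that all three rtnl_register_module() failures funnel into the
 * same label: rtnl_unregister_all(PF_INET6) below tears down whichever
 * handlers did get registered, so no per-handler unwind is needed.
 */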
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
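/*
 * Teardown largely mirrors ip6_route_init() in reverse, so nothing here
 * references state that a later line is about to free.
 */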