/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>

#include <linux/sysctl.h>
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);

struct uncached_list {
	struct list_head head;

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
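/* rt6_info dsts that live outside the FIB tree (for example the
 * FLOWI_FLAG_KNOWN_NH clones created in ip6_pol_route() and the dsts built
 * by icmp6_dst_alloc()) are tracked on this per-CPU list so that
 * rt6_uncached_list_flush_dev() can re-target their device and inet6_dev
 * references at the loopback device when the underlying device goes away.
 */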
void rt6_uncached_list_add(struct rt6_info *rt)
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);

void rt6_uncached_list_del(struct rt6_info *rt)
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
	struct net_device *loopback_dev = net->loopback_dev;

	if (dev == loopback_dev)

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);

				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);

		spin_unlock_bh(&ul->lock);
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
	if (!ipv6_addr_any(p))
		return (const void *) p;

	return &ipv6_hdr(skb)->daddr;

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);

	return neigh_create(&nd_tbl, daddr, dev);

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);

	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
	__ipv6_confirm_neigh(dev, daddr);

static struct dst_ops ip6_dst_ops_template = {
	.check = ip6_dst_check,
	.default_advmss = ip6_default_advmss,
	.cow_metrics = dst_cow_metrics_generic,
	.destroy = ip6_dst_destroy,
	.ifdown = ip6_dst_ifdown,
	.negative_advice = ip6_negative_advice,
	.link_failure = ip6_link_failure,
	.update_pmtu = ip6_rt_update_pmtu,
	.redirect = rt6_do_redirect,
	.local_out = __ip6_local_out,
	.neigh_lookup = ip6_dst_neigh_lookup,
	.confirm_neigh = ip6_confirm_neigh,
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,

static struct dst_ops ip6_dst_blackhole_ops = {
	.destroy = ip6_dst_destroy,
	.check = ip6_dst_check,
	.mtu = ip6_blackhole_mtu,
	.default_advmss = ip6_default_advmss,
	.update_pmtu = ip6_rt_blackhole_update_pmtu,
	.redirect = ip6_rt_blackhole_redirect,
	.cow_metrics = dst_cow_metrics_generic,
	.neigh_lookup = ip6_dst_neigh_lookup,

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol = RTPROT_KERNEL,
	.fib6_metric = ~(u32)0,
	.fib6_ref = ATOMIC_INIT(1),
	.fib6_type = RTN_UNREACHABLE,
	.fib6_metrics = (struct dst_metrics *)&dst_default_metrics,

static const struct rt6_info ip6_null_entry_template = {
		.__refcnt = ATOMIC_INIT(1),
		.obsolete = DST_OBSOLETE_FORCE_CHK,
		.error = -ENETUNREACH,
		.input = ip6_pkt_discard,
		.output = ip6_pkt_discard_out,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
		.__refcnt = ATOMIC_INIT(1),
		.obsolete = DST_OBSOLETE_FORCE_CHK,
		.input = ip6_pkt_prohibit,
		.output = ip6_pkt_prohibit_out,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),

static const struct rt6_info ip6_blk_hole_entry_template = {
		.__refcnt = ATOMIC_INIT(1),
		.obsolete = DST_OBSOLETE_FORCE_CHK,
		.input = dst_discard,
		.output = dst_discard_out,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
static void rt6_info_init(struct rt6_info *rt)
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
		rt->rt6i_idev = NULL;

	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

			rt->rt6i_idev = loopback_idev;

static bool __rt6_check_expired(const struct rt6_info *rt)
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);

static bool rt6_check_expired(const struct rt6_info *rt)
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
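/* fib6_multipath_select() implements hash-threshold nexthop selection for
 * ECMP routes: the flow hash in fl6->mp_hash is compared against each
 * sibling's fib6_nh.nh_upper_bound, and the first sibling whose bound
 * covers the hash (and that still scores as usable via rt6_score_route())
 * is used, so a given flow keeps mapping to the same nexthop while the set
 * of siblings is unchanged.
 */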
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
		if (rt6_score_route(sibling, oif, strict) < 0)

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
		if (dev->ifindex == oif)
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;

static void rt6_probe_deferred(struct work_struct *w)
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);

static void rt6_probe(struct fib6_info *rt)
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)

		idev = __in6_dev_get(dev);
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
			      neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
				__neigh_set_probe_once(neigh);
		write_unlock(&neigh->lock);
		work = kmalloc(sizeof(*work), GFP_ATOMIC);

		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		schedule_work(&work->work);

	rcu_read_unlock_bh();
static inline void rt6_probe(struct fib6_info *rt)

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
			ret = RT6_NUD_FAIL_PROBE;
		read_unlock(&neigh->lock);
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;

	rcu_read_unlock_bh();
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
	const struct net_device *dev = fib6_info_nh_dev(f6i);

		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))

	if (fib6_check_expired(rt))

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {

	if (strict & RT6_LOOKUP_F_REACHABLE)

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
		*do_rr = match_do_rr;
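/* find_rr_leaf() and rt6_select() below implement the round-robin part of
 * default router selection: every route with the metric of fn->rr_ptr is
 * scored via find_match(), and when a rotation is requested (do_rr),
 * fn->rr_ptr is advanced under tb6_lock to the next route of equal metric
 * so later lookups start from a different nexthop.
 */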
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
	struct fib6_info *rt, *match, *cont;

	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)

		spin_lock_bh(&leaf->fib6_table->tb6_lock);
		/* make sure next is not being deleted from the tree */
			rcu_assign_pointer(fn->rr_ptr, next);
		spin_unlock_bh(&leaf->fib6_table->tb6_lock);

	return match ? match : net->ipv6.fib6_null_entry;

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
	} else if (rinfo->prefix_len > 128) {
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
		prefix = &prefix_buf;

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,

	if (rt && !lifetime) {

		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,

		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_BLACKHOLE] = -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT] = -EACCES,
	[RTN_THROW] = -EAGAIN,
	[RTN_XRESOLVE] = -EINVAL,

static int ip6_rt_type_to_error(u8 fib6_type)
	return fib6_prop[fib6_type];

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
	unsigned short flags = 0;

		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
	case RTN_UNREACHABLE:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);

	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
		rt->dst.input = ip6_forward;

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);

	rt->dst.lastuse = jiffies;

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
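/* fib6_backtrack() is used when a lookup lands on a node whose routes all
 * turned out to be unusable (the match resolved to fib6_null_entry): it
 * walks back up toward the tree root via fn->parent, re-doing the
 * source-based subtree lookup where one exists (FIB6_SUBTREE), until it
 * reaches a node that carries route info (RTN_RTINFO) or the top of the
 * tree (RTN_TL_ROOT).
 */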
static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
	struct fib6_node *pn, *sn;

		if (fn->fn_flags & RTN_TL_ROOT)

		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
			fn = fib6_node_lookup(sn, NULL, saddr);
		if (fn->fn_flags & RTN_RTINFO)

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
		ip6_rt_copy_init(nrt, rt);
		fib6_info_release(rt);
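/* ip6_pol_route_lookup() is the flow-based lookup used via
 * fib6_rule_lookup(): it walks the table to a fib6_node, applies
 * rt6_device_match() (and multipath selection when the route has siblings),
 * backtracks while the result is fib6_null_entry, and finally prefers a
 * cached exception entry over building a fresh rt6_info with
 * ip6_create_rt_rcu().
 */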
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     const struct sk_buff *skb,
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	f6i = rcu_dereference(fn->leaf);
		f6i = net->ipv6.fib6_null_entry;
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,

	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rt = ip6_create_rt_rcu(f6i);
			rt = net->ipv6.ip6_null_entry;

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
EXPORT_SYMBOL_GPL(ip6_route_lookup);
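/* rt6_lookup() is a convenience wrapper around the same policy lookup: it
 * builds a flowi6 from daddr/saddr/oif (setting RT6_LOOKUP_F_HAS_SADDR when
 * a source address is supplied, and RT6_LOOKUP_F_IFACE in strict mode) and
 * converts an error dst into a NULL return for callers that just want a
 * struct rt6_info or nothing.
 */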
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
	struct flowi6 fl6 = {
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
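/* ip6_rt_cache_alloc() builds an RTF_CACHE clone of @ort: a host route
 * (rt6i_dst.plen == 128) to @daddr that copies its init from the parent
 * fib6_info.  Such clones back the exception entries inserted by
 * rt6_insert_exception() (e.g. on PMTU updates) and the FLOWI_FLAG_KNOWN_NH
 * case in ip6_pol_route().
 */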
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
	struct net_device *dev;
	struct rt6_info *rt;

	if (!fib6_info_hold_safe(ort))

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
		fib6_info_release(ort);

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))

	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
		fib6_info_release(rt);
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
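/* Per-CPU route caching: rt6_get_pcpu_route() returns this CPU's cached
 * clone for a fib6_info if one exists, and rt6_make_pcpu_route() installs a
 * freshly allocated clone into the per-CPU slot with cmpxchg().
 */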
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);

		ip6_hold_safe(NULL, &pcpu_rt, false);

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);

/* exception hash table implementation */

static DEFINE_SPINLOCK(rt6_exception_lock);
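/* The exception table stores RTF_CACHE clones (for example those created on
 * PMTU updates or redirects) in a per-fib6_info hash keyed by destination
 * address, plus the source address when the parent route lives in a source
 * subtree.  Readers walk the buckets under rcu_read_lock(); insertions and
 * removals are serialized by rt6_exception_lock.
 */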
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
	if (!bucket || !rt6_ex)

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	net->ipv6.rt6_stats->fib_rt_cache--;

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
	struct rt6_exception *rt6_ex, *oldest = NULL;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))

	rt6_remove_exception(bucket, oldest);
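/* rt6_exception_hash() seeds jhash once per boot, hashes the destination
 * address and (under CONFIG_IPV6_SUBTREES) folds in the source address,
 * then reduces the result to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits with
 * hash_32() to pick a bucket, i.e. roughly:
 *
 *	val = jhash(dst, sizeof(*dst), seed);
 *	val = jhash(src, sizeof(*src), val);	(subtrees only)
 *	bucket = hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
 */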
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
	static u32 seed __read_mostly;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	val = jhash(src, sizeof(*src), val);

	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
	struct rt6_exception *rt6_ex;

	if (!(*bucket) || !daddr)

	hval = rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
	struct rt6_exception *rt6_ex;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)

	hval = rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);

static unsigned int fib6_mtu(const struct fib6_info *rt)
	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
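/* rt6_insert_exception() hooks an RTF_CACHE clone (@nrt) into @ort's
 * exception table: it allocates the bucket array on first use, refuses to
 * insert once the table has been flushed or when the clone's MTU is not
 * below fib6_mtu(ort), replaces any existing entry for the same
 * (daddr[, saddr]) key, evicts the oldest entry when a bucket exceeds
 * FIB6_MAX_DEPTH, and finally bumps the table sernum so stale cached dsts
 * fail their next ip6_dst_check().
 */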
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);

	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
void rt6_flush_exceptions(struct fib6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);

	spin_unlock_bh(&rt6_exception_lock);
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)

	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	    !(rt->rt6i_flags & RTF_CACHE))

	if (!rcu_access_pointer(from->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;

	rt6_ex = __rt6_find_exception_spinlock(&bucket,
		rt6_remove_exception(bucket, rt6_ex);

	spin_unlock_bh(&rt6_exception_lock);
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	    !(rt->rt6i_flags & RTF_CACHE))

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;

	rt6_ex = __rt6_find_exception_rcu(&bucket,
		rt6_ex->stamp = jiffies;
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * bring the route PMTU back down as needed.
	 */
	if (dst_mtu(&rt->dst) >= mtu)

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
			    RTF_CACHE_GATEWAY &&
			    ipv6_addr_equal(gateway,
					    &entry->rt6i_gateway)) {
				rt6_remove_exception(bucket, rt6_ex);

	spin_unlock_bh(&rt6_exception_lock);
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
			rt6_remove_exception(bucket, rt6_ex);
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			rt6_age_examine_exception(bucket, rt6_ex,

	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)

	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			goto redo_rt6_select;

	trace_fib6_table_lookup(net, f6i, table, fl6);
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
	struct fib6_info *f6i;
	struct rt6_info *rt;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);

		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		pcpu_rt = rt6_get_pcpu_route(f6i);
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    const struct sk_buff *skb,
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 const struct sk_buff *skb,
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);

	key_iph = inner_iph;

		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
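/* rt6_multipath_hash() computes the ECMP flow hash consumed by
 * fib6_multipath_select().  Depending on ip6_multipath_hash_policy() it
 * hashes either L3 keys (addresses, flow label, next header - using the
 * inner header of ICMPv6 errors via ip6_multipath_l3_keys()) or L4 keys
 * (addresses, ports, protocol), taken from @skb when one is supplied and
 * from @fl6 otherwise.
 */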
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
	struct flow_keys hash_keys;

	switch (ip6_multipath_hash_policy(net)) {
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;

			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

				skb_flow_dissect_flow_keys(skb, &keys, flag);
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;

	mhash = flow_hash_from_keys(&hash_keys);
void ip6_route_input(struct sk_buff *skb)
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

		      ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     const struct sk_buff *skb,
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

		flags |= RT6_LOOKUP_F_HAS_SADDR;
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);

/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)

	if (fib6_check_expired(f6i))

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)

	if (rt6_check_expired(rt))

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
		dst_ret = rt6_check(rt, from, cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *) dst;

		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);

static void ip6_link_failure(struct sk_buff *skb)
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
				fn = rcu_dereference(from->fib6_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		from = rcu_dereference(rt0->from);
			rt0->dst.expires = from->expires;

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
	from_set = !!rcu_dereference(rt->from);

	return !(rt->rt6i_flags & RTF_CACHE) &&
	       (rt->rt6i_flags & RTF_PCPU || from_set);
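/* __ip6_rt_update_pmtu() applies a PMTU report: the value is clamped to at
 * least IPV6_MIN_MTU and ignored unless it is below the current dst MTU.
 * When no cache entry should be created (rt6_cache_allowed_for_pmtu() is
 * false, e.g. the dst is already an RTF_CACHE exception), the dst is
 * updated in place via rt6_do_update_pmtu(); otherwise an RTF_CACHE clone
 * is allocated from rt->from and inserted into the exception table so only
 * this destination sees the reduced MTU.
 */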
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))

		daddr = &iph->daddr;
		saddr = &iph->saddr;
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;

	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
		struct fib6_info *from;
		struct rt6_info *nrt6;

		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))

	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
/* Handle redirects */
struct ip6rd_flowi {
	struct in6_addr gateway;

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     const struct sk_buff *skb,
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routers.
	 */
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		if (fib6_check_expired(rt))
		if (rt->fib6_flags & RTF_REJECT)
		if (!(rt->fib6_flags & RTF_GATEWAY))
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {

		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);

		ip6_hold_safe(net, &ret, true);
		ret = ip6_create_rt_rcu(rt);

	trace_fib6_table_lookup(net, rt, table, fl6);
2508 static struct dst_entry *ip6_route_redirect(struct net *net,
2509 const struct flowi6 *fl6,
2510 const struct sk_buff *skb,
2511 const struct in6_addr *gateway)
2513 int flags = RT6_LOOKUP_F_HAS_SADDR;
2514 struct ip6rd_flowi rdfl;
2517 rdfl.gateway = *gateway;
2519 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2520 flags, __ip6_route_redirect);
2523 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2526 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2527 struct dst_entry *dst;
2530 memset(&fl6, 0, sizeof(fl6));
2531 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2532 fl6.flowi6_oif = oif;
2533 fl6.flowi6_mark = mark;
2534 fl6.daddr = iph->daddr;
2535 fl6.saddr = iph->saddr;
2536 fl6.flowlabel = ip6_flowinfo(iph);
2537 fl6.flowi6_uid = uid;
2539 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2540 rt6_do_redirect(dst, NULL, skb);
2543 EXPORT_SYMBOL_GPL(ip6_redirect);
2545 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2548 const struct ipv6hdr *iph = ipv6_hdr(skb);
2549 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2550 struct dst_entry *dst;
2553 memset(&fl6, 0, sizeof(fl6));
2554 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2555 fl6.flowi6_oif = oif;
2556 fl6.flowi6_mark = mark;
2557 fl6.daddr = msg->dest;
2558 fl6.saddr = iph->daddr;
2559 fl6.flowi6_uid = sock_net_uid(net, NULL);
2561 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2562 rt6_do_redirect(dst, NULL, skb);
2566 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2568 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2571 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
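/* Illustrative numbers for ip6_default_advmss() below: with a 1500 byte
 * path MTU the advertised MSS is 1500 - 40 (IPv6 header) - 20 (TCP header)
 * = 1440, clamped upward to ip6_rt_min_advmss and capped near IPV6_MAXPLEN
 * for jumbo-sized MTUs.
 */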
2573 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2575 struct net_device *dev = dst->dev;
2576 unsigned int mtu = dst_mtu(dst);
2577 struct net *net = dev_net(dev);
2579 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2581 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2582 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2585 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2586 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2587 * IPV6_MAXPLEN is also valid and means: "any MSS,
2588 * rely only on pmtu discovery"
2590 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2591 mtu = IPV6_MAXPLEN;
2592 return mtu;
2595 static unsigned int ip6_mtu(const struct dst_entry *dst)
2597 struct inet6_dev *idev;
2600 mtu = dst_metric_raw(dst, RTAX_MTU);
2607 idev = __in6_dev_get(dst->dev);
2609 mtu = idev->cnf.mtu6;
2613 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2615 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2618 /* MTU selection:
2619 * 1. mtu on route is locked - use it
2620 * 2. mtu from nexthop exception
2621 * 3. mtu from egress device
2623 * based on ip6_dst_mtu_forward and exception logic of
2624 * rt6_find_cached_rt; called with rcu_read_lock
2626 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2627 struct in6_addr *saddr)
2629 struct rt6_exception_bucket *bucket;
2630 struct rt6_exception *rt6_ex;
2631 struct in6_addr *src_key;
2632 struct inet6_dev *idev;
2635 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2636 mtu = f6i->fib6_pmtu;
2642 #ifdef CONFIG_IPV6_SUBTREES
2643 if (f6i->fib6_src.plen)
2644 src_key = saddr;
2645 #endif
2647 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2648 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2649 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2650 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2653 struct net_device *dev = fib6_info_nh_dev(f6i);
2656 idev = __in6_dev_get(dev);
2657 if (idev && idev->cnf.mtu6 > mtu)
2658 mtu = idev->cnf.mtu6;
2661 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2663 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
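/* icmp6_dst_alloc() below builds a standalone host dst for ICMPv6 output:
 * it is never inserted into the FIB, lives on the uncached list so device
 * teardown can release it, and is passed through xfrm_lookup() before use.
 */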
2666 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2669 struct dst_entry *dst;
2670 struct rt6_info *rt;
2671 struct inet6_dev *idev = in6_dev_get(dev);
2672 struct net *net = dev_net(dev);
2674 if (unlikely(!idev))
2675 return ERR_PTR(-ENODEV);
2677 rt = ip6_dst_alloc(net, dev, 0);
2678 if (unlikely(!rt)) {
2680 dst = ERR_PTR(-ENOMEM);
2684 rt->dst.flags |= DST_HOST;
2685 rt->dst.input = ip6_input;
2686 rt->dst.output = ip6_output;
2687 rt->rt6i_gateway = fl6->daddr;
2688 rt->rt6i_dst.addr = fl6->daddr;
2689 rt->rt6i_dst.plen = 128;
2690 rt->rt6i_idev = idev;
2691 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2693 /* Add this dst into uncached_list so that rt6_disable_ip() can
2694 * do proper release of the net_device
2696 rt6_uncached_list_add(rt);
2697 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2699 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2705 static int ip6_dst_gc(struct dst_ops *ops)
2707 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2708 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2709 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2710 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2711 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2712 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2715 entries = dst_entries_get_fast(ops);
2716 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2717 entries <= rt_max_size)
2720 net->ipv6.ip6_rt_gc_expire++;
2721 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2722 entries = dst_entries_get_slow(ops);
2723 if (entries < ops->gc_thresh)
2724 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2726 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2727 return entries > rt_max_size;
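/* Note on the garbage-collection pacing above: ip6_rt_gc_expire creeps up
 * by one for every forced pass, is reset to gc_timeout/2 once the entry
 * count drops below gc_thresh, and decays by expire >> gc_elasticity on
 * every call (with the default elasticity of 9, roughly expire/512).
 */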
2730 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2731 struct fib6_config *cfg)
2733 struct dst_metrics *p;
2738 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2742 refcount_set(&p->refcnt, 1);
2743 rt->fib6_metrics = p;
2745 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
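/* ip6_nh_lookup_table() below resolves a configured gateway within one
 * specific table, ignoring link state; it returns NULL when only the null
 * entry matches so that callers can fall back to a full rt6_lookup().
 */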
2748 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2749 struct fib6_config *cfg,
2750 const struct in6_addr *gw_addr,
2751 u32 tbid, int flags)
2753 struct flowi6 fl6 = {
2754 .flowi6_oif = cfg->fc_ifindex,
2756 .saddr = cfg->fc_prefsrc,
2758 struct fib6_table *table;
2759 struct rt6_info *rt;
2761 table = fib6_get_table(net, tbid);
2765 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2766 flags |= RT6_LOOKUP_F_HAS_SADDR;
2768 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2769 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2771 /* if table lookup failed, fall back to full lookup */
2772 if (rt == net->ipv6.ip6_null_entry) {
2780 static int ip6_route_check_nh_onlink(struct net *net,
2781 struct fib6_config *cfg,
2782 const struct net_device *dev,
2783 struct netlink_ext_ack *extack)
2785 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2786 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2787 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2788 struct rt6_info *grt;
2792 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2794 if (!grt->dst.error &&
2795 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2796 NL_SET_ERR_MSG(extack,
2797 "Nexthop has invalid gateway or device mismatch");
2807 static int ip6_route_check_nh(struct net *net,
2808 struct fib6_config *cfg,
2809 struct net_device **_dev,
2810 struct inet6_dev **idev)
2812 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2813 struct net_device *dev = _dev ? *_dev : NULL;
2814 struct rt6_info *grt = NULL;
2815 int err = -EHOSTUNREACH;
2817 if (cfg->fc_table) {
2818 int flags = RT6_LOOKUP_F_IFACE;
2820 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2821 cfg->fc_table, flags);
2823 if (grt->rt6i_flags & RTF_GATEWAY ||
2824 (dev && dev != grt->dst.dev)) {
2832 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2838 if (dev != grt->dst.dev) {
2843 *_dev = dev = grt->dst.dev;
2844 *idev = grt->rt6i_idev;
2846 in6_dev_hold(grt->rt6i_idev);
2849 if (!(grt->rt6i_flags & RTF_GATEWAY))
2858 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2859 struct net_device **_dev, struct inet6_dev **idev,
2860 struct netlink_ext_ack *extack)
2862 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2863 int gwa_type = ipv6_addr_type(gw_addr);
2864 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2865 const struct net_device *dev = *_dev;
2866 bool need_addr_check = !dev;
2869 /* if gw_addr is local we will fail to detect this in case
2870 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2871 * will return already-added prefix route via interface that
2872 * prefix route was assigned to, which might be non-loopback.
2875 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2876 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2880 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2881 /* IPv6 strictly inhibits using non-link-local
2882 * addresses as the nexthop address.
2883 * Otherwise, the router will not be able to send redirects.
2884 * It is very good, but in some (rare!) circumstances
2885 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2886 * some exceptions. --ANK
2887 * We allow IPv4-mapped nexthops to support RFC4798-type
2888 * addressing.
2890 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2891 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2895 if (cfg->fc_flags & RTNH_F_ONLINK)
2896 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2898 err = ip6_route_check_nh(net, cfg, _dev, idev);
2904 /* reload in case device was changed */
2909 NL_SET_ERR_MSG(extack, "Egress device not specified");
2911 } else if (dev->flags & IFF_LOOPBACK) {
2912 NL_SET_ERR_MSG(extack,
2913 "Egress device can not be loopback device for this route");
2917 /* if we did not check gw_addr above, do so now that the
2918 * egress device has been resolved.
2920 if (need_addr_check &&
2921 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2922 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
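/* ip6_route_info_create() below turns a validated fib6_config into a
 * fib6_info: metrics are converted, the egress device and gateway are
 * resolved and checked, and true routes via loopback are promoted to
 * reject routes. Insertion into the table is left to the caller
 * (__ip6_ins_rt() via ip6_route_add()).
 */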
2931 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2933 struct netlink_ext_ack *extack)
2935 struct net *net = cfg->fc_nlinfo.nl_net;
2936 struct fib6_info *rt = NULL;
2937 struct net_device *dev = NULL;
2938 struct inet6_dev *idev = NULL;
2939 struct fib6_table *table;
2943 /* RTF_PCPU is an internal flag; can not be set by userspace */
2944 if (cfg->fc_flags & RTF_PCPU) {
2945 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2949 /* RTF_CACHE is an internal flag; can not be set by userspace */
2950 if (cfg->fc_flags & RTF_CACHE) {
2951 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2955 if (cfg->fc_type > RTN_MAX) {
2956 NL_SET_ERR_MSG(extack, "Invalid route type");
2960 if (cfg->fc_dst_len > 128) {
2961 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2964 if (cfg->fc_src_len > 128) {
2965 NL_SET_ERR_MSG(extack, "Invalid source address length");
2968 #ifndef CONFIG_IPV6_SUBTREES
2969 if (cfg->fc_src_len) {
2970 NL_SET_ERR_MSG(extack,
2971 "Specifying source address requires IPV6_SUBTREES to be enabled");
2975 if (cfg->fc_ifindex) {
2977 dev = dev_get_by_index(net, cfg->fc_ifindex);
2980 idev = in6_dev_get(dev);
2985 if (cfg->fc_metric == 0)
2986 cfg->fc_metric = IP6_RT_PRIO_USER;
2988 if (cfg->fc_flags & RTNH_F_ONLINK) {
2990 NL_SET_ERR_MSG(extack,
2991 "Nexthop device required for onlink");
2996 if (!(dev->flags & IFF_UP)) {
2997 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3004 if (cfg->fc_nlinfo.nlh &&
3005 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3006 table = fib6_get_table(net, cfg->fc_table);
3008 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3009 table = fib6_new_table(net, cfg->fc_table);
3012 table = fib6_new_table(net, cfg->fc_table);
3019 rt = fib6_info_alloc(gfp_flags);
3023 if (cfg->fc_flags & RTF_ADDRCONF)
3024 rt->dst_nocount = true;
3026 err = ip6_convert_metrics(net, rt, cfg);
3030 if (cfg->fc_flags & RTF_EXPIRES)
3031 fib6_set_expires(rt, jiffies +
3032 clock_t_to_jiffies(cfg->fc_expires));
3034 fib6_clean_expires(rt);
3036 if (cfg->fc_protocol == RTPROT_UNSPEC)
3037 cfg->fc_protocol = RTPROT_BOOT;
3038 rt->fib6_protocol = cfg->fc_protocol;
3040 addr_type = ipv6_addr_type(&cfg->fc_dst);
3042 if (cfg->fc_encap) {
3043 struct lwtunnel_state *lwtstate;
3045 err = lwtunnel_build_state(cfg->fc_encap_type,
3046 cfg->fc_encap, AF_INET6, cfg,
3050 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3053 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3054 rt->fib6_dst.plen = cfg->fc_dst_len;
3055 if (rt->fib6_dst.plen == 128)
3056 rt->dst_host = true;
3058 #ifdef CONFIG_IPV6_SUBTREES
3059 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3060 rt->fib6_src.plen = cfg->fc_src_len;
3063 rt->fib6_metric = cfg->fc_metric;
3064 rt->fib6_nh.nh_weight = 1;
3066 rt->fib6_type = cfg->fc_type;
3068 /* We cannot add true routes via loopback here,
3069 they would result in kernel looping; promote them to reject routes
3071 if ((cfg->fc_flags & RTF_REJECT) ||
3072 (dev && (dev->flags & IFF_LOOPBACK) &&
3073 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3074 !(cfg->fc_flags & RTF_LOCAL))) {
3075 /* hold loopback dev/idev if we haven't done so. */
3076 if (dev != net->loopback_dev) {
3081 dev = net->loopback_dev;
3083 idev = in6_dev_get(dev);
3089 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3093 if (cfg->fc_flags & RTF_GATEWAY) {
3094 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3098 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3105 if (idev->cnf.disable_ipv6) {
3106 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3111 if (!(dev->flags & IFF_UP)) {
3112 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3117 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3118 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3119 NL_SET_ERR_MSG(extack, "Invalid source address");
3123 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3124 rt->fib6_prefsrc.plen = 128;
3126 rt->fib6_prefsrc.plen = 0;
3128 rt->fib6_flags = cfg->fc_flags;
3131 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3132 !netif_carrier_ok(dev))
3133 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3134 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3135 rt->fib6_nh.nh_dev = dev;
3136 rt->fib6_table = table;
3138 cfg->fc_nlinfo.nl_net = dev_net(dev);
3150 fib6_info_release(rt);
3151 return ERR_PTR(err);
3154 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3155 struct netlink_ext_ack *extack)
3157 struct fib6_info *rt;
3160 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3164 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3165 fib6_info_release(rt);
3170 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3172 struct net *net = info->nl_net;
3173 struct fib6_table *table;
3176 if (rt == net->ipv6.fib6_null_entry) {
3181 table = rt->fib6_table;
3182 spin_lock_bh(&table->tb6_lock);
3183 err = fib6_del(rt, info);
3184 spin_unlock_bh(&table->tb6_lock);
3187 fib6_info_release(rt);
3191 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3193 struct nl_info info = { .nl_net = net };
3195 return __ip6_del_rt(rt, &info);
3198 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3200 struct nl_info *info = &cfg->fc_nlinfo;
3201 struct net *net = info->nl_net;
3202 struct sk_buff *skb = NULL;
3203 struct fib6_table *table;
3206 if (rt == net->ipv6.fib6_null_entry)
3208 table = rt->fib6_table;
3209 spin_lock_bh(&table->tb6_lock);
3211 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3212 struct fib6_info *sibling, *next_sibling;
3214 /* prefer to send a single notification with all hops */
3215 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3217 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3219 if (rt6_fill_node(net, skb, rt, NULL,
3220 NULL, NULL, 0, RTM_DELROUTE,
3221 info->portid, seq, 0) < 0) {
3225 info->skip_notify = 1;
3228 list_for_each_entry_safe(sibling, next_sibling,
3231 err = fib6_del(sibling, info);
3237 err = fib6_del(rt, info);
3239 spin_unlock_bh(&table->tb6_lock);
3241 fib6_info_release(rt);
3244 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3245 info->nlh, gfp_any());
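/* ip6_del_cached_rt() below removes a single RTF_CACHE exception entry
 * matching the request's device and, if given, gateway; ip6_route_del()
 * uses it for RTM_F_CLONED deletions.
 */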
3250 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3254 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3257 if (cfg->fc_flags & RTF_GATEWAY &&
3258 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3260 if (dst_hold_safe(&rt->dst))
3261 rc = rt6_remove_exception_rt(rt);
3266 static int ip6_route_del(struct fib6_config *cfg,
3267 struct netlink_ext_ack *extack)
3269 struct rt6_info *rt_cache;
3270 struct fib6_table *table;
3271 struct fib6_info *rt;
3272 struct fib6_node *fn;
3275 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3277 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3283 fn = fib6_locate(&table->tb6_root,
3284 &cfg->fc_dst, cfg->fc_dst_len,
3285 &cfg->fc_src, cfg->fc_src_len,
3286 !(cfg->fc_flags & RTF_CACHE));
3289 for_each_fib6_node_rt_rcu(fn) {
3290 if (cfg->fc_flags & RTF_CACHE) {
3293 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3296 rc = ip6_del_cached_rt(rt_cache, cfg);
3304 if (cfg->fc_ifindex &&
3305 (!rt->fib6_nh.nh_dev ||
3306 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3308 if (cfg->fc_flags & RTF_GATEWAY &&
3309 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3311 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3313 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3315 if (!fib6_info_hold_safe(rt))
3319 /* if a gateway was specified, only delete the one hop */
3320 if (cfg->fc_flags & RTF_GATEWAY)
3321 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3323 return __ip6_del_rt_siblings(rt, cfg);
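/* In short, rt6_do_redirect() below validates an ND Redirect (RFC 4861),
 * updates the neighbour entry for the new target, and clones the parent
 * fib6_info into an RTF_CACHE exception whose gateway is the redirect
 * target.
 */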
3331 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3333 struct netevent_redirect netevent;
3334 struct rt6_info *rt, *nrt = NULL;
3335 struct ndisc_options ndopts;
3336 struct inet6_dev *in6_dev;
3337 struct neighbour *neigh;
3338 struct fib6_info *from;
3340 int optlen, on_link;
3343 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3344 optlen -= sizeof(*msg);
3347 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3351 msg = (struct rd_msg *)icmp6_hdr(skb);
3353 if (ipv6_addr_is_multicast(&msg->dest)) {
3354 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3359 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3361 } else if (ipv6_addr_type(&msg->target) !=
3362 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3363 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3367 in6_dev = __in6_dev_get(skb->dev);
3370 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3373 /* RFC2461 8.1:
3374 * The IP source address of the Redirect MUST be the same as the current
3375 * first-hop router for the specified ICMP Destination Address.
3378 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3379 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3384 if (ndopts.nd_opts_tgt_lladdr) {
3385 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3388 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3393 rt = (struct rt6_info *) dst;
3394 if (rt->rt6i_flags & RTF_REJECT) {
3395 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3399 /* Redirect received -> path was valid.
3400 * Look, redirects are sent only in response to data packets,
3401 * so that this nexthop apparently is reachable. --ANK
3403 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3405 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3410 * We have finally decided to accept it.
3413 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3414 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3415 NEIGH_UPDATE_F_OVERRIDE|
3416 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3417 NEIGH_UPDATE_F_ISROUTER)),
3418 NDISC_REDIRECT, &ndopts);
3421 from = rcu_dereference(rt->from);
3422 /* This fib6_info_hold() is safe here because we hold reference to rt
3423 * and rt already holds reference to fib6_info.
3425 fib6_info_hold(from);
3428 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3432 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3434 nrt->rt6i_flags &= ~RTF_GATEWAY;
3436 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3438 /* No need to remove rt from the exception table if rt is
3439 * a cached route because rt6_insert_exception() will
3440 * take care of it.
3442 if (rt6_insert_exception(nrt, from)) {
3443 dst_release_immediate(&nrt->dst);
3447 netevent.old = &rt->dst;
3448 netevent.new = &nrt->dst;
3449 netevent.daddr = &msg->dest;
3450 netevent.neigh = neigh;
3451 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3454 fib6_info_release(from);
3455 neigh_release(neigh);
3458 #ifdef CONFIG_IPV6_ROUTE_INFO
3459 static struct fib6_info *rt6_get_route_info(struct net *net,
3460 const struct in6_addr *prefix, int prefixlen,
3461 const struct in6_addr *gwaddr,
3462 struct net_device *dev)
3464 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3465 int ifindex = dev->ifindex;
3466 struct fib6_node *fn;
3467 struct fib6_info *rt = NULL;
3468 struct fib6_table *table;
3470 table = fib6_get_table(net, tb_id);
3475 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3479 for_each_fib6_node_rt_rcu(fn) {
3480 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3482 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3484 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3486 if (!fib6_info_hold_safe(rt))
3495 static struct fib6_info *rt6_add_route_info(struct net *net,
3496 const struct in6_addr *prefix, int prefixlen,
3497 const struct in6_addr *gwaddr,
3498 struct net_device *dev,
3501 struct fib6_config cfg = {
3502 .fc_metric = IP6_RT_PRIO_USER,
3503 .fc_ifindex = dev->ifindex,
3504 .fc_dst_len = prefixlen,
3505 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3506 RTF_UP | RTF_PREF(pref),
3507 .fc_protocol = RTPROT_RA,
3508 .fc_type = RTN_UNICAST,
3509 .fc_nlinfo.portid = 0,
3510 .fc_nlinfo.nlh = NULL,
3511 .fc_nlinfo.nl_net = net,
3514 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3515 cfg.fc_dst = *prefix;
3516 cfg.fc_gateway = *gwaddr;
3518 /* We should treat it as a default route if prefix length is 0. */
3520 cfg.fc_flags |= RTF_DEFAULT;
3522 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3524 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3528 struct fib6_info *rt6_get_dflt_router(struct net *net,
3529 const struct in6_addr *addr,
3530 struct net_device *dev)
3532 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3533 struct fib6_info *rt;
3534 struct fib6_table *table;
3536 table = fib6_get_table(net, tb_id);
3541 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3542 if (dev == rt->fib6_nh.nh_dev &&
3543 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3544 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3547 if (rt && !fib6_info_hold_safe(rt))
3553 struct fib6_info *rt6_add_dflt_router(struct net *net,
3554 const struct in6_addr *gwaddr,
3555 struct net_device *dev,
3558 struct fib6_config cfg = {
3559 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3560 .fc_metric = IP6_RT_PRIO_USER,
3561 .fc_ifindex = dev->ifindex,
3562 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3563 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3564 .fc_protocol = RTPROT_RA,
3565 .fc_type = RTN_UNICAST,
3566 .fc_nlinfo.portid = 0,
3567 .fc_nlinfo.nlh = NULL,
3568 .fc_nlinfo.nl_net = net,
3571 cfg.fc_gateway = *gwaddr;
3573 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3574 struct fib6_table *table;
3576 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3578 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3581 return rt6_get_dflt_router(net, gwaddr, dev);
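/* __rt6_purge_dflt_routers() below deletes RA-learned default routers
 * (RTF_ADDRCONF | RTF_DEFAULT) from one table, skipping interfaces with
 * accept_ra == 2, i.e. those that keep accepting RAs even when forwarding.
 */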
3584 static void __rt6_purge_dflt_routers(struct net *net,
3585 struct fib6_table *table)
3587 struct fib6_info *rt;
3591 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3592 struct net_device *dev = fib6_info_nh_dev(rt);
3593 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3595 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3596 (!idev || idev->cnf.accept_ra != 2) &&
3597 fib6_info_hold_safe(rt)) {
3599 ip6_del_rt(net, rt);
3605 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3608 void rt6_purge_dflt_routers(struct net *net)
3610 struct fib6_table *table;
3611 struct hlist_head *head;
3616 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3617 head = &net->ipv6.fib_table_hash[h];
3618 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3619 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3620 __rt6_purge_dflt_routers(net, table);
3627 static void rtmsg_to_fib6_config(struct net *net,
3628 struct in6_rtmsg *rtmsg,
3629 struct fib6_config *cfg)
3631 memset(cfg, 0, sizeof(*cfg));
3633 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3635 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3636 cfg->fc_metric = rtmsg->rtmsg_metric;
3637 cfg->fc_expires = rtmsg->rtmsg_info;
3638 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3639 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3640 cfg->fc_flags = rtmsg->rtmsg_flags;
3641 cfg->fc_type = rtmsg->rtmsg_type;
3643 cfg->fc_nlinfo.nl_net = net;
3645 cfg->fc_dst = rtmsg->rtmsg_dst;
3646 cfg->fc_src = rtmsg->rtmsg_src;
3647 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3650 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3652 struct fib6_config cfg;
3653 struct in6_rtmsg rtmsg;
3657 case SIOCADDRT: /* Add a route */
3658 case SIOCDELRT: /* Delete a route */
3659 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3661 err = copy_from_user(&rtmsg, arg,
3662 sizeof(struct in6_rtmsg));
3666 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3671 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3674 err = ip6_route_del(&cfg, NULL);
3688 * Drop the packet on the floor
3691 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3694 struct dst_entry *dst = skb_dst(skb);
3695 switch (ipstats_mib_noroutes) {
3696 case IPSTATS_MIB_INNOROUTES:
3697 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3698 if (type == IPV6_ADDR_ANY) {
3699 IP6_INC_STATS(dev_net(dst->dev),
3700 __in6_dev_get_safely(skb->dev),
3701 IPSTATS_MIB_INADDRERRORS);
3705 case IPSTATS_MIB_OUTNOROUTES:
3706 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3707 ipstats_mib_noroutes);
3710 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3715 static int ip6_pkt_discard(struct sk_buff *skb)
3717 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3720 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3722 skb->dev = skb_dst(skb)->dev;
3723 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3726 static int ip6_pkt_prohibit(struct sk_buff *skb)
3728 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3731 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3733 skb->dev = skb_dst(skb)->dev;
3734 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3738 * Allocate a dst for local (unicast / anycast) address.
3741 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3742 struct inet6_dev *idev,
3743 const struct in6_addr *addr,
3744 bool anycast, gfp_t gfp_flags)
3747 struct net_device *dev = idev->dev;
3748 struct fib6_info *f6i;
3750 f6i = fib6_info_alloc(gfp_flags);
3752 return ERR_PTR(-ENOMEM);
3754 f6i->dst_nocount = true;
3755 f6i->dst_host = true;
3756 f6i->fib6_protocol = RTPROT_KERNEL;
3757 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3759 f6i->fib6_type = RTN_ANYCAST;
3760 f6i->fib6_flags |= RTF_ANYCAST;
3762 f6i->fib6_type = RTN_LOCAL;
3763 f6i->fib6_flags |= RTF_LOCAL;
3766 f6i->fib6_nh.nh_gw = *addr;
3768 f6i->fib6_nh.nh_dev = dev;
3769 f6i->fib6_dst.addr = *addr;
3770 f6i->fib6_dst.plen = 128;
3771 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3772 f6i->fib6_table = fib6_get_table(net, tb_id);
3777 /* remove deleted ip from prefsrc entries */
3778 struct arg_dev_net_ip {
3779 struct net_device *dev;
3781 struct in6_addr *addr;
3784 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3786 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3787 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3788 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3790 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3791 rt != net->ipv6.fib6_null_entry &&
3792 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3793 spin_lock_bh(&rt6_exception_lock);
3794 /* remove prefsrc entry */
3795 rt->fib6_prefsrc.plen = 0;
3796 /* need to update cache as well */
3797 rt6_exceptions_remove_prefsrc(rt);
3798 spin_unlock_bh(&rt6_exception_lock);
3803 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3805 struct net *net = dev_net(ifp->idev->dev);
3806 struct arg_dev_net_ip adni = {
3807 .dev = ifp->idev->dev,
3811 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3814 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3816 /* Remove routers and update dst entries when a gateway turns into a host. */
3817 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3819 struct in6_addr *gateway = (struct in6_addr *)arg;
3821 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3822 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3826 /* Further clean up cached routes in exception table.
3827 * This is needed because cached route may have a different
3828 * gateway than its 'parent' in the case of an ip redirect.
3830 rt6_exceptions_clean_tohost(rt, gateway);
3835 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3837 fib6_clean_all(net, fib6_clean_tohost, gateway);
3840 struct arg_netdev_event {
3841 const struct net_device *dev;
3843 unsigned int nh_flags;
3844 unsigned long event;
3848 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3850 struct fib6_info *iter;
3851 struct fib6_node *fn;
3853 fn = rcu_dereference_protected(rt->fib6_node,
3854 lockdep_is_held(&rt->fib6_table->tb6_lock));
3855 iter = rcu_dereference_protected(fn->leaf,
3856 lockdep_is_held(&rt->fib6_table->tb6_lock));
3858 if (iter->fib6_metric == rt->fib6_metric &&
3859 rt6_qualify_for_ecmp(iter))
3861 iter = rcu_dereference_protected(iter->fib6_next,
3862 lockdep_is_held(&rt->fib6_table->tb6_lock));
3868 static bool rt6_is_dead(const struct fib6_info *rt)
3870 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3871 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3872 fib6_ignore_linkdown(rt)))
3878 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3880 struct fib6_info *iter;
3883 if (!rt6_is_dead(rt))
3884 total += rt->fib6_nh.nh_weight;
3886 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3887 if (!rt6_is_dead(iter))
3888 total += iter->fib6_nh.nh_weight;
3894 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3896 int upper_bound = -1;
3898 if (!rt6_is_dead(rt)) {
3899 *weight += rt->fib6_nh.nh_weight;
3900 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3901 total) - 1;
3903 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3906 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3908 struct fib6_info *iter;
3911 rt6_upper_bound_set(rt, &weight, total);
3913 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3914 rt6_upper_bound_set(iter, &weight, total);
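/* Illustrative example of the weighting above: for two live nexthops with
 * weights 1 and 3 the cumulative bounds become roughly 0x1fffffff and
 * 0x7fffffff, so a 31-bit flow hash selects the first nexthop for about a
 * quarter of flows and the second for the rest; dead nexthops keep the
 * bound of -1 and are never selected.
 */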
3917 void rt6_multipath_rebalance(struct fib6_info *rt)
3919 struct fib6_info *first;
3922 /* In case the entire multipath route was marked for flushing,
3923 * then there is no need to rebalance upon the removal of every
3924 * sibling route.
3926 if (!rt->fib6_nsiblings || rt->should_flush)
3929 /* During lookup routes are evaluated in order, so we need to
3930 * make sure upper bounds are assigned from the first sibling
3931 * onwards.
3933 first = rt6_multipath_first_sibling(rt);
3934 if (WARN_ON_ONCE(!first))
3937 total = rt6_multipath_total_weight(first);
3938 rt6_multipath_upper_bound_set(first, total);
3941 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3943 const struct arg_netdev_event *arg = p_arg;
3944 struct net *net = dev_net(arg->dev);
3946 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3947 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3948 fib6_update_sernum_upto_root(net, rt);
3949 rt6_multipath_rebalance(rt);
3955 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3957 struct arg_netdev_event arg = {
3960 .nh_flags = nh_flags,
3964 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3965 arg.nh_flags |= RTNH_F_LINKDOWN;
3967 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3970 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3971 const struct net_device *dev)
3973 struct fib6_info *iter;
3975 if (rt->fib6_nh.nh_dev == dev)
3977 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3978 if (iter->fib6_nh.nh_dev == dev)
3984 static void rt6_multipath_flush(struct fib6_info *rt)
3986 struct fib6_info *iter;
3988 rt->should_flush = 1;
3989 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3990 iter->should_flush = 1;
3993 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3994 const struct net_device *down_dev)
3996 struct fib6_info *iter;
3997 unsigned int dead = 0;
3999 if (rt->fib6_nh.nh_dev == down_dev ||
4000 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4002 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4003 if (iter->fib6_nh.nh_dev == down_dev ||
4004 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4010 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4011 const struct net_device *dev,
4012 unsigned int nh_flags)
4014 struct fib6_info *iter;
4016 if (rt->fib6_nh.nh_dev == dev)
4017 rt->fib6_nh.nh_flags |= nh_flags;
4018 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4019 if (iter->fib6_nh.nh_dev == dev)
4020 iter->fib6_nh.nh_flags |= nh_flags;
4023 /* called with write lock held for table with rt */
4024 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4026 const struct arg_netdev_event *arg = p_arg;
4027 const struct net_device *dev = arg->dev;
4028 struct net *net = dev_net(dev);
4030 if (rt == net->ipv6.fib6_null_entry)
4033 switch (arg->event) {
4034 case NETDEV_UNREGISTER:
4035 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4037 if (rt->should_flush)
4039 if (!rt->fib6_nsiblings)
4040 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4041 if (rt6_multipath_uses_dev(rt, dev)) {
4044 count = rt6_multipath_dead_count(rt, dev);
4045 if (rt->fib6_nsiblings + 1 == count) {
4046 rt6_multipath_flush(rt);
4049 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4051 fib6_update_sernum(net, rt);
4052 rt6_multipath_rebalance(rt);
4056 if (rt->fib6_nh.nh_dev != dev ||
4057 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4059 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4060 rt6_multipath_rebalance(rt);
4067 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4069 struct arg_netdev_event arg = {
4076 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4079 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4081 rt6_sync_down_dev(dev, event);
4082 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4083 neigh_ifdown(&nd_tbl, dev);
4086 struct rt6_mtu_change_arg {
4087 struct net_device *dev;
4091 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4093 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4094 struct inet6_dev *idev;
4096 /* In IPv6 pmtu discovery is not optional,
4097 so the RTAX_MTU lock cannot disable it.
4098 We still use this lock to block changes
4099 caused by addrconf/ndisc.
4102 idev = __in6_dev_get(arg->dev);
4106 /* For administrative MTU increases there is no way to discover
4107 an IPv6 PMTU increase, so the PMTU must be updated here.
4108 Since RFC 1981 doesn't cover administrative MTU increases,
4109 updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
4111 if (rt->fib6_nh.nh_dev == arg->dev &&
4112 !fib6_metric_locked(rt, RTAX_MTU)) {
4113 u32 mtu = rt->fib6_pmtu;
4115 if (mtu >= arg->mtu ||
4116 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4117 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4119 spin_lock_bh(&rt6_exception_lock);
4120 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4121 spin_unlock_bh(&rt6_exception_lock);
4126 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4128 struct rt6_mtu_change_arg arg = {
4133 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4136 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4137 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4138 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4139 [RTA_OIF] = { .type = NLA_U32 },
4140 [RTA_IIF] = { .type = NLA_U32 },
4141 [RTA_PRIORITY] = { .type = NLA_U32 },
4142 [RTA_METRICS] = { .type = NLA_NESTED },
4143 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4144 [RTA_PREF] = { .type = NLA_U8 },
4145 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4146 [RTA_ENCAP] = { .type = NLA_NESTED },
4147 [RTA_EXPIRES] = { .type = NLA_U32 },
4148 [RTA_UID] = { .type = NLA_U32 },
4149 [RTA_MARK] = { .type = NLA_U32 },
4150 [RTA_TABLE] = { .type = NLA_U32 },
4151 [RTA_IP_PROTO] = { .type = NLA_U8 },
4152 [RTA_SPORT] = { .type = NLA_U16 },
4153 [RTA_DPORT] = { .type = NLA_U16 },
4156 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4157 struct fib6_config *cfg,
4158 struct netlink_ext_ack *extack)
4161 struct nlattr *tb[RTA_MAX+1];
4165 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4171 rtm = nlmsg_data(nlh);
4172 memset(cfg, 0, sizeof(*cfg));
4174 cfg->fc_table = rtm->rtm_table;
4175 cfg->fc_dst_len = rtm->rtm_dst_len;
4176 cfg->fc_src_len = rtm->rtm_src_len;
4177 cfg->fc_flags = RTF_UP;
4178 cfg->fc_protocol = rtm->rtm_protocol;
4179 cfg->fc_type = rtm->rtm_type;
4181 if (rtm->rtm_type == RTN_UNREACHABLE ||
4182 rtm->rtm_type == RTN_BLACKHOLE ||
4183 rtm->rtm_type == RTN_PROHIBIT ||
4184 rtm->rtm_type == RTN_THROW)
4185 cfg->fc_flags |= RTF_REJECT;
4187 if (rtm->rtm_type == RTN_LOCAL)
4188 cfg->fc_flags |= RTF_LOCAL;
4190 if (rtm->rtm_flags & RTM_F_CLONED)
4191 cfg->fc_flags |= RTF_CACHE;
4193 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4195 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4196 cfg->fc_nlinfo.nlh = nlh;
4197 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4199 if (tb[RTA_GATEWAY]) {
4200 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4201 cfg->fc_flags |= RTF_GATEWAY;
4205 int plen = (rtm->rtm_dst_len + 7) >> 3;
4207 if (nla_len(tb[RTA_DST]) < plen)
4210 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4214 int plen = (rtm->rtm_src_len + 7) >> 3;
4216 if (nla_len(tb[RTA_SRC]) < plen)
4219 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4222 if (tb[RTA_PREFSRC])
4223 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4226 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4228 if (tb[RTA_PRIORITY])
4229 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4231 if (tb[RTA_METRICS]) {
4232 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4233 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4237 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4239 if (tb[RTA_MULTIPATH]) {
4240 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4241 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4243 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4244 cfg->fc_mp_len, extack);
4250 pref = nla_get_u8(tb[RTA_PREF]);
4251 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4252 pref != ICMPV6_ROUTER_PREF_HIGH)
4253 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4254 cfg->fc_flags |= RTF_PREF(pref);
4258 cfg->fc_encap = tb[RTA_ENCAP];
4260 if (tb[RTA_ENCAP_TYPE]) {
4261 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4263 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4268 if (tb[RTA_EXPIRES]) {
4269 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4271 if (addrconf_finite_timeout(timeout)) {
4272 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4273 cfg->fc_flags |= RTF_EXPIRES;
4283 struct fib6_info *fib6_info;
4284 struct fib6_config r_cfg;
4285 struct list_head next;
4288 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4292 list_for_each_entry(nh, rt6_nh_list, next) {
4293 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4294 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4295 nh->r_cfg.fc_ifindex);
4299 static int ip6_route_info_append(struct net *net,
4300 struct list_head *rt6_nh_list,
4301 struct fib6_info *rt,
4302 struct fib6_config *r_cfg)
4307 list_for_each_entry(nh, rt6_nh_list, next) {
4308 /* check if fib6_info already exists */
4309 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4313 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4317 err = ip6_convert_metrics(net, rt, r_cfg);
4322 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4323 list_add_tail(&nh->next, rt6_nh_list);
4328 static void ip6_route_mpath_notify(struct fib6_info *rt,
4329 struct fib6_info *rt_last,
4330 struct nl_info *info,
4333 /* if this is an APPEND route, then rt points to the first route
4334 * inserted and rt_last points to last route inserted. Userspace
4335 * wants a consistent dump of the route which starts at the first
4336 * nexthop. Since sibling routes are always added at the end of
4337 * the list, find the first sibling of the last route appended
4339 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4340 rt = list_first_entry(&rt_last->fib6_siblings,
4346 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4349 static int ip6_route_multipath_add(struct fib6_config *cfg,
4350 struct netlink_ext_ack *extack)
4352 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4353 struct nl_info *info = &cfg->fc_nlinfo;
4354 struct fib6_config r_cfg;
4355 struct rtnexthop *rtnh;
4356 struct fib6_info *rt;
4357 struct rt6_nh *err_nh;
4358 struct rt6_nh *nh, *nh_safe;
4364 int replace = (cfg->fc_nlinfo.nlh &&
4365 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4366 LIST_HEAD(rt6_nh_list);
4368 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4369 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4370 nlflags |= NLM_F_APPEND;
4372 remaining = cfg->fc_mp_len;
4373 rtnh = (struct rtnexthop *)cfg->fc_mp;
4375 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4376 * fib6_info structs per nexthop
4378 while (rtnh_ok(rtnh, remaining)) {
4379 memcpy(&r_cfg, cfg, sizeof(*cfg));
4380 if (rtnh->rtnh_ifindex)
4381 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4383 attrlen = rtnh_attrlen(rtnh);
4385 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4387 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4389 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4390 r_cfg.fc_flags |= RTF_GATEWAY;
4392 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4393 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4395 r_cfg.fc_encap_type = nla_get_u16(nla);
4398 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4399 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4405 if (!rt6_qualify_for_ecmp(rt)) {
4407 NL_SET_ERR_MSG(extack,
4408 "Device only routes can not be added for IPv6 using the multipath API.");
4409 fib6_info_release(rt);
4413 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4415 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4418 fib6_info_release(rt);
4422 rtnh = rtnh_next(rtnh, &remaining);
4425 /* for add and replace send one notification with all nexthops.
4426 * Skip the notification in fib6_add_rt2node and send one with
4427 * the full route when done
4429 info->skip_notify = 1;
4432 list_for_each_entry(nh, &rt6_nh_list, next) {
4433 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4434 fib6_info_release(nh->fib6_info);
4437 /* save reference to last route successfully inserted */
4438 rt_last = nh->fib6_info;
4440 /* save reference to first route for notification */
4442 rt_notif = nh->fib6_info;
4445 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4446 nh->fib6_info = NULL;
4449 ip6_print_replace_route_err(&rt6_nh_list);
4454 /* Because each route is added like a single route we remove
4455 * these flags after the first nexthop: if there is a collision,
4456 * we have already failed to add the first nexthop:
4457 * fib6_add_rt2node() has rejected it; when replacing, old
4458 * nexthops have been replaced by first new, the rest should
4461 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4466 /* success ... tell user about new route */
4467 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4471 /* send notification for routes that were added so that
4472 * the delete notifications sent by ip6_route_del are
4473 * coherent.
4476 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4478 /* Delete routes that were already added */
4479 list_for_each_entry(nh, &rt6_nh_list, next) {
4482 ip6_route_del(&nh->r_cfg, extack);
4486 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4488 fib6_info_release(nh->fib6_info);
4489 list_del(&nh->next);
4496 static int ip6_route_multipath_del(struct fib6_config *cfg,
4497 struct netlink_ext_ack *extack)
4499 struct fib6_config r_cfg;
4500 struct rtnexthop *rtnh;
4503 int err = 1, last_err = 0;
4505 remaining = cfg->fc_mp_len;
4506 rtnh = (struct rtnexthop *)cfg->fc_mp;
4508 /* Parse a Multipath Entry */
4509 while (rtnh_ok(rtnh, remaining)) {
4510 memcpy(&r_cfg, cfg, sizeof(*cfg));
4511 if (rtnh->rtnh_ifindex)
4512 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4514 attrlen = rtnh_attrlen(rtnh);
4516 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4518 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4520 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4521 r_cfg.fc_flags |= RTF_GATEWAY;
4524 err = ip6_route_del(&r_cfg, extack);
4528 rtnh = rtnh_next(rtnh, &remaining);
4534 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4535 struct netlink_ext_ack *extack)
4537 struct fib6_config cfg;
4540 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4545 return ip6_route_multipath_del(&cfg, extack);
4547 cfg.fc_delete_all_nh = 1;
4548 return ip6_route_del(&cfg, extack);
4552 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4553 struct netlink_ext_ack *extack)
4555 struct fib6_config cfg;
4558 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4563 return ip6_route_multipath_add(&cfg, extack);
4565 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4568 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4570 int nexthop_len = 0;
4572 if (rt->fib6_nsiblings) {
4573 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4574 + NLA_ALIGN(sizeof(struct rtnexthop))
4575 + nla_total_size(16) /* RTA_GATEWAY */
4576 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4578 nexthop_len *= rt->fib6_nsiblings;
4581 return NLMSG_ALIGN(sizeof(struct rtmsg))
4582 + nla_total_size(16) /* RTA_SRC */
4583 + nla_total_size(16) /* RTA_DST */
4584 + nla_total_size(16) /* RTA_GATEWAY */
4585 + nla_total_size(16) /* RTA_PREFSRC */
4586 + nla_total_size(4) /* RTA_TABLE */
4587 + nla_total_size(4) /* RTA_IIF */
4588 + nla_total_size(4) /* RTA_OIF */
4589 + nla_total_size(4) /* RTA_PRIORITY */
4590 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4591 + nla_total_size(sizeof(struct rta_cacheinfo))
4592 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4593 + nla_total_size(1) /* RTA_PREF */
4594 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4598 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4599 unsigned int *flags, bool skip_oif)
4601 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4602 *flags |= RTNH_F_DEAD;
4604 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4605 *flags |= RTNH_F_LINKDOWN;
4608 if (fib6_ignore_linkdown(rt))
4609 *flags |= RTNH_F_DEAD;
4613 if (rt->fib6_flags & RTF_GATEWAY) {
4614 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4615 goto nla_put_failure;
4618 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4619 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4620 *flags |= RTNH_F_OFFLOAD;
4622 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4623 if (!skip_oif && rt->fib6_nh.nh_dev &&
4624 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4625 goto nla_put_failure;
4627 if (rt->fib6_nh.nh_lwtstate &&
4628 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4629 goto nla_put_failure;
4637 /* add multipath next hop */
4638 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4640 const struct net_device *dev = rt->fib6_nh.nh_dev;
4641 struct rtnexthop *rtnh;
4642 unsigned int flags = 0;
4644 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4646 goto nla_put_failure;
4648 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4649 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4651 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4652 goto nla_put_failure;
4654 rtnh->rtnh_flags = flags;
4656 /* length of rtnetlink header + attributes */
4657 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
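/* rt6_fill_node() below assembles the RTM_NEWROUTE/RTM_DELROUTE message:
 * when a dst is supplied the cached entry (exact destination, cloned flag,
 * expiry) is reported, otherwise the fib6_info is dumped, with sibling
 * nexthops emitted inside an RTA_MULTIPATH nest.
 */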
4665 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4666 struct fib6_info *rt, struct dst_entry *dst,
4667 struct in6_addr *dest, struct in6_addr *src,
4668 int iif, int type, u32 portid, u32 seq,
4671 struct rt6_info *rt6 = (struct rt6_info *)dst;
4672 struct rt6key *rt6_dst, *rt6_src;
4673 u32 *pmetrics, table, rt6_flags;
4674 struct nlmsghdr *nlh;
4678 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4683 rt6_dst = &rt6->rt6i_dst;
4684 rt6_src = &rt6->rt6i_src;
4685 rt6_flags = rt6->rt6i_flags;
4687 rt6_dst = &rt->fib6_dst;
4688 rt6_src = &rt->fib6_src;
4689 rt6_flags = rt->fib6_flags;
4692 rtm = nlmsg_data(nlh);
4693 rtm->rtm_family = AF_INET6;
4694 rtm->rtm_dst_len = rt6_dst->plen;
4695 rtm->rtm_src_len = rt6_src->plen;
4698 table = rt->fib6_table->tb6_id;
4700 table = RT6_TABLE_UNSPEC;
4701 rtm->rtm_table = table;
4702 if (nla_put_u32(skb, RTA_TABLE, table))
4703 goto nla_put_failure;
4705 rtm->rtm_type = rt->fib6_type;
4707 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4708 rtm->rtm_protocol = rt->fib6_protocol;
4710 if (rt6_flags & RTF_CACHE)
4711 rtm->rtm_flags |= RTM_F_CLONED;
4714 if (nla_put_in6_addr(skb, RTA_DST, dest))
4715 goto nla_put_failure;
4716 rtm->rtm_dst_len = 128;
4717 } else if (rtm->rtm_dst_len)
4718 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4719 goto nla_put_failure;
4720 #ifdef CONFIG_IPV6_SUBTREES
4722 if (nla_put_in6_addr(skb, RTA_SRC, src))
4723 goto nla_put_failure;
4724 rtm->rtm_src_len = 128;
4725 } else if (rtm->rtm_src_len &&
4726 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4727 goto nla_put_failure;
4730 #ifdef CONFIG_IPV6_MROUTE
4731 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4732 int err = ip6mr_get_route(net, skb, rtm, portid);
4737 goto nla_put_failure;
4740 if (nla_put_u32(skb, RTA_IIF, iif))
4741 goto nla_put_failure;
4743 struct in6_addr saddr_buf;
4744 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4745 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4746 goto nla_put_failure;
4749 if (rt->fib6_prefsrc.plen) {
4750 struct in6_addr saddr_buf;
4751 saddr_buf = rt->fib6_prefsrc.addr;
4752 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4753 goto nla_put_failure;
4756 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4757 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4758 goto nla_put_failure;
4760 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4761 goto nla_put_failure;
4763 /* For multipath routes, walk the siblings list and add
4764 * each as a nexthop within RTA_MULTIPATH.
4767 if (rt6_flags & RTF_GATEWAY &&
4768 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4769 goto nla_put_failure;
4771 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4772 goto nla_put_failure;
4773 } else if (rt->fib6_nsiblings) {
4774 struct fib6_info *sibling, *next_sibling;
4777 mp = nla_nest_start(skb, RTA_MULTIPATH);
4779 goto nla_put_failure;
4781 if (rt6_add_nexthop(skb, rt) < 0)
4782 goto nla_put_failure;
4784 list_for_each_entry_safe(sibling, next_sibling,
4785 &rt->fib6_siblings, fib6_siblings) {
4786 if (rt6_add_nexthop(skb, sibling) < 0)
4787 goto nla_put_failure;
4790 nla_nest_end(skb, mp);
4792 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4793 goto nla_put_failure;
4796 if (rt6_flags & RTF_EXPIRES) {
4797 expires = dst ? dst->expires : rt->expires;
4801 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4802 goto nla_put_failure;
4804 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4805 goto nla_put_failure;
4808 nlmsg_end(skb, nlh);
4812 nlmsg_cancel(skb, nlh);
4816 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4818 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4819 struct net *net = arg->net;
4821 if (rt == net->ipv6.fib6_null_entry)
4824 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4825 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4827 /* user wants prefix routes only */
4828 if (rtm->rtm_flags & RTM_F_PREFIX &&
4829 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4830 /* success since this is not a prefix route */
4835 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4836 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4837 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4840 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4841 struct netlink_ext_ack *extack)
4843 struct net *net = sock_net(in_skb->sk);
4844 struct nlattr *tb[RTA_MAX+1];
4845 int err, iif = 0, oif = 0;
4846 struct fib6_info *from;
4847 struct dst_entry *dst;
4848 struct rt6_info *rt;
4849 struct sk_buff *skb;
4854 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4860 memset(&fl6, 0, sizeof(fl6));
4861 rtm = nlmsg_data(nlh);
4862 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4863 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4866 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4869 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4873 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4876 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4880 iif = nla_get_u32(tb[RTA_IIF]);
4883 oif = nla_get_u32(tb[RTA_OIF]);
4886 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4889 fl6.flowi6_uid = make_kuid(current_user_ns(),
4890 nla_get_u32(tb[RTA_UID]));
4892 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4895 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4898 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4900 if (tb[RTA_IP_PROTO]) {
4901 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4902 &fl6.flowi6_proto, extack);
4908 struct net_device *dev;
4913 dev = dev_get_by_index_rcu(net, iif);
4920 fl6.flowi6_iif = iif;
4922 if (!ipv6_addr_any(&fl6.saddr))
4923 flags |= RT6_LOOKUP_F_HAS_SADDR;
4925 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4929 fl6.flowi6_oif = oif;
4931 dst = ip6_route_output(net, NULL, &fl6);
4935 rt = container_of(dst, struct rt6_info, dst);
4936 if (rt->dst.error) {
4937 err = rt->dst.error;
4942 if (rt == net->ipv6.ip6_null_entry) {
4943 err = rt->dst.error;
4948 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4955 skb_dst_set(skb, &rt->dst);
4958 from = rcu_dereference(rt->from);
4961 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4962 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4965 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4966 &fl6.saddr, iif, RTM_NEWROUTE,
4967 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4976 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4981 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4982 unsigned int nlm_flags)
4984 struct sk_buff *skb;
4985 struct net *net = info->nl_net;
4990 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4992 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4996 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4997 event, info->portid, seq, nlm_flags);
4999 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5000 WARN_ON(err == -EMSGSIZE);
5004 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5005 info->nlh, gfp_any());
5009 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
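/* ip6_route_dev_notify() below only cares about loopback: the per-netns
 * null/prohibit/blackhole template entries are bound to it on
 * NETDEV_REGISTER and their idev references are dropped exactly once on
 * NETDEV_UNREGISTER.
 */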
5012 static int ip6_route_dev_notify(struct notifier_block *this,
5013 unsigned long event, void *ptr)
5015 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5016 struct net *net = dev_net(dev);
5018 if (!(dev->flags & IFF_LOOPBACK))
5021 if (event == NETDEV_REGISTER) {
5022 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5023 net->ipv6.ip6_null_entry->dst.dev = dev;
5024 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5025 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5026 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5027 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5028 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5029 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5031 } else if (event == NETDEV_UNREGISTER &&
5032 dev->reg_state != NETREG_UNREGISTERED) {
5033 /* NETDEV_UNREGISTER could be fired multiple times by
5034 * netdev_wait_allrefs(). Make sure we only call this once.
5036 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5037 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5038 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5039 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5050 #ifdef CONFIG_PROC_FS
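/* /proc/net/rt6_stats prints seven hex fields: fib nodes, route nodes,
 * allocated route entries, total route entries, cached routes, dst entries
 * in use, and discarded routes, in that order.
 */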
5051 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5053 struct net *net = (struct net *)seq->private;
5054 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5055 net->ipv6.rt6_stats->fib_nodes,
5056 net->ipv6.rt6_stats->fib_route_nodes,
5057 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5058 net->ipv6.rt6_stats->fib_rt_entries,
5059 net->ipv6.rt6_stats->fib_rt_cache,
5060 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5061 net->ipv6.rt6_stats->fib_discarded_routes);
5065 #endif /* CONFIG_PROC_FS */
5067 #ifdef CONFIG_SYSCTL
5070 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5071 void __user *buffer, size_t *lenp, loff_t *ppos)
5078 net = (struct net *)ctl->extra1;
5079 delay = net->ipv6.sysctl.flush_delay;
5080 proc_dointvec(ctl, write, buffer, lenp, ppos);
5081 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname = "flush",
		.data = &init_net.ipv6.sysctl.flush_delay,
		.maxlen = sizeof(int),
		.proc_handler = ipv6_sysctl_rtcache_flush
	},
	{
		.procname = "gc_thresh",
		.data = &ip6_dst_ops_template.gc_thresh,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "max_size",
		.data = &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_min_interval",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_interval",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_elasticity",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "mtu_expires",
		.data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_adv_mss",
		.data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_min_interval_ms",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.proc_handler = proc_dointvec_ms_jiffies,
	},
};
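/* Duplicate the template for a new namespace and rewire each entry's .data to
 * the per-netns storage. For namespaces owned by an unprivileged user the
 * table is truncated at entry 0, so none of these sysctls are exported there.
 */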
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),

	table[0].data = &net->ipv6.sysctl.flush_delay;
	table[0].extra1 = net;
	table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
	table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
	table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
	table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
	table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
	table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
	table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
	table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
	table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

	/* Don't export sysctls to unprivileged users */
	if (net->user_ns != &init_user_ns)
		table[0].procname = NULL;
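/* Per-namespace init: clone the dst_ops template, allocate the fib6/dst
 * "null" (and, with policy routing, "prohibit" and "blackhole") entries and
 * seed the route garbage-collection sysctl defaults.
 */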
static int __net_init ip6_route_net_init(struct net *net)
{
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
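/* Per-namespace teardown: release everything ip6_route_net_init() allocated. */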
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
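/* Late per-namespace init/exit: create and remove the /proc/net/ipv6_route
 * and /proc/net/rt6_stats entries.
 */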
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	inet_peer_base_init(bp);
	net->ipv6.peers = bp;

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
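/* Priority sits below ADDRCONF_NOTIFY_PRIORITY so the addrconf notifier
 * handles device events before this one does.
 */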
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * so the loopback reference in rt6_info will not be taken;
	 * do it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
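/* Boot-time init: create the rt6_info slab cache, register the pernet
 * subsystems, FIB rules and RTM_NEWROUTE/DELROUTE/GETROUTE handlers, hook the
 * netdevice notifier and set up the per-CPU uncached route lists. Error paths
 * unwind in reverse order of registration.
 */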
int __init ip6_route_init(void)
{
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

		goto out_register_subsys;

	ret = fib6_rules_init();

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();

out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
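/* Undo ip6_route_init() in reverse order. */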
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();

	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}