2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
67 #include <linux/uaccess.h>
70 #include <linux/sysctl.h>
73 static int ip6_rt_type_to_error(u8 fib6_type);
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
81 RT6_NUD_FAIL_HARD = -3,
82 RT6_NUD_FAIL_PROBE = -2,
83 RT6_NUD_FAIL_DO_RR = -1,
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(struct dst_ops *ops);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int ip6_pkt_prohibit(struct sk_buff *skb);
99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void ip6_link_failure(struct sk_buff *skb);
101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 struct sk_buff *skb, u32 mtu);
103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108 struct fib6_info *rt, struct dst_entry *dst,
109 struct in6_addr *dest, struct in6_addr *src,
110 int iif, int type, u32 portid, u32 seq,
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 struct in6_addr *daddr,
114 struct in6_addr *saddr);
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118 const struct in6_addr *prefix, int prefixlen,
119 const struct in6_addr *gwaddr,
120 struct net_device *dev,
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123 const struct in6_addr *prefix, int prefixlen,
124 const struct in6_addr *gwaddr,
125 struct net_device *dev);
/* Per-CPU list head (plus lock — missing in this truncated extract) for
 * rt6_info entries that are not attached to a fib6 node ("uncached" dsts).
 */
128 struct uncached_list {
130 struct list_head head;
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Queue an uncached rt6_info on this CPU's uncached list under the
 * list spinlock (bh-safe). The backpointer lets deletion find the list.
 * NOTE(review): extract appears truncated (braces/blank lines missing).
 */
135 void rt6_uncached_list_add(struct rt6_info *rt)
137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 rt->rt6i_uncached_list = ul;
141 spin_lock_bh(&ul->lock);
142 list_add_tail(&rt->rt6i_uncached, &ul->head);
143 spin_unlock_bh(&ul->lock);
/* Unlink rt from its per-CPU uncached list, if it was ever queued, and
 * decrement the per-netns uncached-route counter under the list lock.
 */
146 void rt6_uncached_list_del(struct rt6_info *rt)
148 if (!list_empty(&rt->rt6i_uncached)) {
149 struct uncached_list *ul = rt->rt6i_uncached_list;
150 struct net *net = dev_net(rt->dst.dev);
152 spin_lock_bh(&ul->lock);
153 list_del(&rt->rt6i_uncached);
154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 spin_unlock_bh(&ul->lock);
/* On device teardown, walk every CPU's uncached list and repoint any
 * rt6_info still referencing @dev (idev or dst.dev) at the loopback
 * device, transferring the references accordingly. No-op for loopback
 * itself since its routes are handled at netns destruction.
 */
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 struct net_device *loopback_dev = net->loopback_dev;
164 if (dev == loopback_dev)
167 for_each_possible_cpu(cpu) {
168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
171 spin_lock_bh(&ul->lock);
172 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173 struct inet6_dev *rt_idev = rt->rt6i_idev;
174 struct net_device *rt_dev = rt->dst.dev;
176 if (rt_idev->dev == dev) {
177 rt->rt6i_idev = in6_dev_get(loopback_dev);
178 in6_dev_put(rt_idev);
182 rt->dst.dev = loopback_dev;
183 dev_hold(rt->dst.dev);
187 spin_unlock_bh(&ul->lock);
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
195 if (!ipv6_addr_any(p))
196 return (const void *) p;
198 return &ipv6_hdr(skb)->daddr;
/* Look up (or create) the ndisc neighbour entry for @gw/@daddr on @dev.
 * NOTE(review): signature lines and the lookup-hit fast path appear
 * truncated in this extract; falls through to neigh_create() on miss.
 */
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203 struct net_device *dev,
209 daddr = choose_neigh_daddr(gw, skb, daddr);
210 n = __ipv6_neigh_lookup(dev, daddr);
213 return neigh_create(&nd_tbl, daddr, dev);
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
222 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227 struct net_device *dev = dst->dev;
228 struct rt6_info *rt = (struct rt6_info *)dst;
230 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237 __ipv6_confirm_neigh(dev, daddr);
/* dst_ops vtable for regular IPv6 routes. NOTE(review): several fields
 * (.family, .gc, .gc_thresh, .mtu) appear to be missing from this
 * truncated extract — compare against upstream before relying on it.
 */
240 static struct dst_ops ip6_dst_ops_template = {
244 .check = ip6_dst_check,
245 .default_advmss = ip6_default_advmss,
247 .cow_metrics = dst_cow_metrics_generic,
248 .destroy = ip6_dst_destroy,
249 .ifdown = ip6_dst_ifdown,
250 .negative_advice = ip6_negative_advice,
251 .link_failure = ip6_link_failure,
252 .update_pmtu = ip6_rt_update_pmtu,
253 .redirect = rt6_do_redirect,
254 .local_out = __ip6_local_out,
255 .neigh_lookup = ip6_dst_neigh_lookup,
256 .confirm_neigh = ip6_confirm_neigh,
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263 return mtu ? : dst->dev->mtu;
/* Deliberate no-op PMTU/redirect handlers for blackhole routes.
 * NOTE(review): the redirect handler's parameter lines and the empty
 * bodies are truncated in this extract.
 */
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267 struct sk_buff *skb, u32 mtu)
271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops vtable for blackhole dsts: same lookup/metrics plumbing as the
 * regular template but PMTU updates and redirects are swallowed.
 */
276 static struct dst_ops ip6_dst_blackhole_ops = {
278 .destroy = ip6_dst_destroy,
279 .check = ip6_dst_check,
280 .mtu = ip6_blackhole_mtu,
281 .default_advmss = ip6_default_advmss,
282 .update_pmtu = ip6_rt_blackhole_update_pmtu,
283 .redirect = ip6_rt_blackhole_redirect,
284 .cow_metrics = dst_cow_metrics_generic,
285 .neigh_lookup = ip6_dst_neigh_lookup,
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-netns fib6 null entry: a REJECT route of type
 * RTN_UNREACHABLE with worst-possible metric, used as lookup fallback.
 */
292 static const struct fib6_info fib6_null_entry_template = {
293 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
294 .fib6_protocol = RTPROT_KERNEL,
295 .fib6_metric = ~(u32)0,
296 .fib6_ref = ATOMIC_INIT(1),
297 .fib6_type = RTN_UNREACHABLE,
298 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
/* Template for the per-netns null rt6_info: discards traffic and reports
 * -ENETUNREACH to the caller.
 */
301 static const struct rt6_info ip6_null_entry_template = {
303 .__refcnt = ATOMIC_INIT(1),
305 .obsolete = DST_OBSOLETE_FORCE_CHK,
306 .error = -ENETUNREACH,
307 .input = ip6_pkt_discard,
308 .output = ip6_pkt_discard_out,
310 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
/* Policy-routing templates (only with multiple tables): "prohibit"
 * answers with an administratively-prohibited error, "blackhole"
 * silently drops. NOTE(review): .error fields truncated in this extract.
 */
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
315 static const struct rt6_info ip6_prohibit_entry_template = {
317 .__refcnt = ATOMIC_INIT(1),
319 .obsolete = DST_OBSOLETE_FORCE_CHK,
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
327 static const struct rt6_info ip6_blk_hole_entry_template = {
329 .__refcnt = ATOMIC_INIT(1),
331 .obsolete = DST_OBSOLETE_FORCE_CHK,
333 .input = dst_discard,
334 .output = dst_discard_out,
336 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
341 static void rt6_info_init(struct rt6_info *rt)
343 struct dst_entry *dst = &rt->dst;
345 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346 INIT_LIST_HEAD(&rt->rt6i_uncached);
349 /* allocate dst with ip6_dst_ops */
/* Allocate and initialise a rt6_info from the netns dst_ops pool,
 * bumping the per-netns allocation counter on success.
 * NOTE(review): the rt6_info_init() call and return appear truncated.
 */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354 1, DST_OBSOLETE_FORCE_CHK, flags);
358 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
363 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops->destroy hook: release metrics, unlink from the uncached list,
 * drop the idev reference and (under RCU) the fib6_info backpointer.
 */
365 static void ip6_dst_destroy(struct dst_entry *dst)
367 struct rt6_info *rt = (struct rt6_info *)dst;
368 struct fib6_info *from;
369 struct inet6_dev *idev;
371 dst_destroy_metrics_generic(dst);
372 rt6_uncached_list_del(rt);
374 idev = rt->rt6i_idev;
376 rt->rt6i_idev = NULL;
381 from = rcu_dereference(rt->from);
382 rcu_assign_pointer(rt->from, NULL);
383 fib6_info_release(from);
/* dst_ops->ifdown hook: when the route's device goes away, repoint the
 * idev reference at the loopback device of the same netns.
 */
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
390 struct rt6_info *rt = (struct rt6_info *)dst;
391 struct inet6_dev *idev = rt->rt6i_idev;
392 struct net_device *loopback_dev =
393 dev_net(dev)->loopback_dev;
395 if (idev && idev->dev != loopback_dev) {
396 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
398 rt->rt6i_idev = loopback_idev;
404 static bool __rt6_check_expired(const struct rt6_info *rt)
406 if (rt->rt6i_flags & RTF_EXPIRES)
407 return time_after(jiffies, rt->dst.expires);
/* Expiry check that also consults the originating fib6_info: expired if
 * the rt itself timed out, or if the dst is stale/obsolete and the
 * fib6_info it was cloned from has expired.
 */
412 static bool rt6_check_expired(const struct rt6_info *rt)
414 struct fib6_info *from;
416 from = rcu_dereference(rt->from);
418 if (rt->rt6i_flags & RTF_EXPIRES) {
419 if (time_after(jiffies, rt->dst.expires))
422 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423 fib6_check_expired(from);
/* ECMP sibling selection: compute (or reuse) the flow hash and walk the
 * sibling list, choosing the first route whose nexthop upper bound
 * covers the hash and whose score is acceptable.
 */
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429 struct fib6_info *match,
430 struct flowi6 *fl6, int oif,
431 const struct sk_buff *skb,
434 struct fib6_info *sibling, *next_sibling;
436 /* We might have already computed the hash for ICMPv6 errors. In such
437 * case it will always be non-zero. Otherwise now is the time to do it.
440 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
442 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
445 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
449 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450 if (fl6->mp_hash > nh_upper_bound)
452 if (rt6_score_route(sibling, oif, strict) < 0)
462 * Route lookup. rcu_read_lock() should be held.
/* Walk the fib6 leaf chain for a route matching the requested outgoing
 * interface and/or source address; returns the netns null entry when a
 * strict interface match is required but none is found, and skips
 * RTNH_F_DEAD nexthops throughout.
 */
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466 struct fib6_info *rt,
467 const struct in6_addr *saddr,
471 struct fib6_info *sprt;
473 if (!oif && ipv6_addr_any(saddr) &&
474 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
477 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478 const struct net_device *dev = sprt->fib6_nh.nh_dev;
480 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
484 if (dev->ifindex == oif)
487 if (ipv6_chk_addr(net, saddr, dev,
488 flags & RT6_LOOKUP_F_IFACE))
493 if (oif && flags & RT6_LOOKUP_F_IFACE)
494 return net->ipv6.fib6_null_entry;
496 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
499 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context and handler for router reachability probing:
 * sends a neighbour solicitation to the target's solicited-node
 * multicast address from workqueue context.
 */
500 struct __rt6_probe_work {
501 struct work_struct work;
502 struct in6_addr target;
503 struct net_device *dev;
506 static void rt6_probe_deferred(struct work_struct *w)
508 struct in6_addr mcaddr;
509 struct __rt6_probe_work *work =
510 container_of(w, struct __rt6_probe_work, work);
512 addrconf_addr_solict_mult(&work->target, &mcaddr);
513 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
/* Router Reachability Probing (RFC 4861 6.3.6 context): if the gateway's
 * neighbour entry is missing or not NUD_VALID and the per-idev probe
 * interval has elapsed, schedule a deferred NS probe. Rate limiting is
 * enforced via neigh->updated and __neigh_set_probe_once().
 * NOTE(review): several locking/ratelimit lines are truncated here.
 */
518 static void rt6_probe(struct fib6_info *rt)
520 struct __rt6_probe_work *work;
521 const struct in6_addr *nh_gw;
522 struct neighbour *neigh;
523 struct net_device *dev;
526 * Okay, this does not seem to be appropriate
527 * for now, however, we need to check if it
528 * is really so; aka Router Reachability Probing.
530 * Router Reachability Probe MUST be rate-limited
531 * to no more than one per minute.
533 if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
536 nh_gw = &rt->fib6_nh.nh_gw;
537 dev = rt->fib6_nh.nh_dev;
539 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
541 struct inet6_dev *idev;
543 if (neigh->nud_state & NUD_VALID)
546 idev = __in6_dev_get(dev);
548 write_lock(&neigh->lock);
549 if (!(neigh->nud_state & NUD_VALID) &&
551 neigh->updated + idev->cnf.rtr_probe_interval)) {
552 work = kmalloc(sizeof(*work), GFP_ATOMIC);
554 __neigh_set_probe_once(neigh);
556 write_unlock(&neigh->lock);
558 work = kmalloc(sizeof(*work), GFP_ATOMIC);
562 INIT_WORK(&work->work, rt6_probe_deferred);
563 work->target = *nh_gw;
566 schedule_work(&work->work);
570 rcu_read_unlock_bh();
573 static inline void rt6_probe(struct fib6_info *rt)
579 * Default Router Selection (RFC 2461 6.3.6)
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
583 const struct net_device *dev = rt->fib6_nh.nh_dev;
585 if (!oif || dev->ifindex == oif)
/* Map the gateway's neighbour cache state to a NUD verdict: non-gateway
 * routes always succeed; NUD_VALID succeeds; with router-pref enabled a
 * non-FAILED state also succeeds while FAILED requests a probe; no
 * neighbour entry at all either succeeds (router-pref) or asks for
 * round-robin.
 */
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
592 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593 struct neighbour *neigh;
595 if (rt->fib6_flags & RTF_NONEXTHOP ||
596 !(rt->fib6_flags & RTF_GATEWAY))
597 return RT6_NUD_SUCCEED;
600 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
603 read_lock(&neigh->lock);
604 if (neigh->nud_state & NUD_VALID)
605 ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607 else if (!(neigh->nud_state & NUD_FAILED))
608 ret = RT6_NUD_SUCCEED;
610 ret = RT6_NUD_FAIL_PROBE;
612 read_unlock(&neigh->lock);
614 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
617 rcu_read_unlock_bh();
/* Compose a route score from device match, router preference bits and
 * (when RT6_LOOKUP_F_REACHABLE) neighbour reachability; negative values
 * are RT6_NUD_* failure codes.
 */
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
626 m = rt6_check_dev(rt, oif);
627 if (!m && (strict & RT6_LOOKUP_F_IFACE))
628 return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
632 if (strict & RT6_LOOKUP_F_REACHABLE) {
633 int n = rt6_check_neigh(rt);
640 /* called with rc_read_lock held */
/* Whether the route's idev asks to ignore routes whose link is down
 * (per-device ignore_routes_with_linkdown sysctl).
 */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
643 const struct net_device *dev = fib6_info_nh_dev(f6i);
647 const struct inet6_dev *idev = __in6_dev_get(dev);
649 rc = !!idev->cnf.ignore_routes_with_linkdown;
/* Compare one candidate route against the current best (*mpri/match):
 * skip dead, link-down (when policy says so) and expired routes, score
 * the rest, and flag round-robin when the score asks for it.
 */
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656 int *mpri, struct fib6_info *match,
660 bool match_do_rr = false;
662 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
665 if (fib6_ignore_linkdown(rt) &&
666 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
670 if (fib6_check_expired(rt))
673 m = rt6_score_route(rt, oif, strict);
674 if (m == RT6_NUD_FAIL_DO_RR) {
676 m = 0; /* lowest valid score */
677 } else if (m == RT6_NUD_FAIL_HARD) {
681 if (strict & RT6_LOOKUP_F_REACHABLE)
684 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
686 *do_rr = match_do_rr;
/* Round-robin scan of a leaf chain at a given metric: first from rr_head
 * to the end, then from the leaf back up to rr_head, then any deferred
 * continuation (routes at other metrics).
 */
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695 struct fib6_info *leaf,
696 struct fib6_info *rr_head,
697 u32 metric, int oif, int strict,
700 struct fib6_info *rt, *match, *cont;
705 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706 if (rt->fib6_metric != metric) {
711 match = find_match(rt, oif, strict, &mpri, match, do_rr);
714 for (rt = leaf; rt && rt != rr_head;
715 rt = rcu_dereference(rt->fib6_next)) {
716 if (rt->fib6_metric != metric) {
721 match = find_match(rt, oif, strict, &mpri, match, do_rr);
727 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Select the best route under fib6 node @fn, honouring the stored
 * round-robin pointer; advances fn->rr_ptr (under the table lock) when
 * the scan requested round-robin. Falls back to the netns null entry
 * for empty/intermediate nodes.
 */
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
736 struct fib6_info *leaf = rcu_dereference(fn->leaf);
737 struct fib6_info *match, *rt0;
741 if (!leaf || leaf == net->ipv6.fib6_null_entry)
742 return net->ipv6.fib6_null_entry;
744 rt0 = rcu_dereference(fn->rr_ptr);
748 /* Double check to make sure fn is not an intermediate node
749 * and fn->leaf does not points to its child's leaf
750 * (This might happen if all routes under fn are deleted from
751 * the tree and fib6_repair_tree() is called on the node.)
753 key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755 if (rt0->fib6_src.plen)
756 key_plen = rt0->fib6_src.plen;
758 if (fn->fn_bit != key_plen)
759 return net->ipv6.fib6_null_entry;
761 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
765 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
767 /* no entries matched; do round-robin */
768 if (!next || next->fib6_metric != rt0->fib6_metric)
772 spin_lock_bh(&leaf->fib6_table->tb6_lock);
773 /* make sure next is not being deleted from the tree */
775 rcu_assign_pointer(fn->rr_ptr, next);
776 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
780 return match ? match : net->ipv6.fib6_null_entry;
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
785 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RA Route Information option (RFC 4191): validate lengths
 * against prefix_len, normalise invalid preference, then add, update or
 * (lifetime 0) delete the corresponding route-info route, setting or
 * clearing its expiry from the advertised lifetime.
 */
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790 const struct in6_addr *gwaddr)
792 struct net *net = dev_net(dev);
793 struct route_info *rinfo = (struct route_info *) opt;
794 struct in6_addr prefix_buf, *prefix;
796 unsigned long lifetime;
797 struct fib6_info *rt;
799 if (len < sizeof(struct route_info)) {
803 /* Sanity check for prefix_len and length */
804 if (rinfo->length > 3) {
806 } else if (rinfo->prefix_len > 128) {
808 } else if (rinfo->prefix_len > 64) {
809 if (rinfo->length < 2) {
812 } else if (rinfo->prefix_len > 0) {
813 if (rinfo->length < 1) {
818 pref = rinfo->route_pref;
819 if (pref == ICMPV6_ROUTER_PREF_INVALID)
822 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
824 if (rinfo->length == 3)
825 prefix = (struct in6_addr *)rinfo->prefix;
827 /* this function is safe */
828 ipv6_addr_prefix(&prefix_buf,
829 (struct in6_addr *)rinfo->prefix,
831 prefix = &prefix_buf;
834 if (rinfo->prefix_len == 0)
835 rt = rt6_get_dflt_router(net, gwaddr, dev);
837 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
840 if (rt && !lifetime) {
846 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
849 rt->fib6_flags = RTF_ROUTEINFO |
850 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
853 if (!addrconf_finite_timeout(lifetime))
854 fib6_clean_expires(rt);
856 fib6_set_expires(rt, jiffies + HZ * lifetime);
858 fib6_info_release(rt);
865 * Misc support functions
868 /* called with rcu_lock held */
/* Device to use for a dst cloned from @rt: for local/anycast routes on
 * an L3 slave use its master; otherwise (non-master) fall back to the
 * netns loopback device; an L3 master device is returned as-is.
 */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
871 struct net_device *dev = rt->fib6_nh.nh_dev;
873 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874 /* for copies of local routes, dst->dev needs to be the
875 * device if it is a master device, the master device if
876 * device is enslaved, and the loopback as the default
878 if (netif_is_l3_slave(dev) &&
879 !rt6_need_strict(&rt->fib6_dst.addr))
880 dev = l3mdev_master_dev_rcu(dev);
881 else if (!netif_is_l3_master(dev))
882 dev = dev_net(dev)->loopback_dev;
883 /* last case is netif_is_l3_master(dev) is true in which
884 * case we want dev returned to be dev
/* Map fib6 route types (RTN_*) to the errno reported to senders.
 * NOTE(review): entries for RTN_UNSPEC/UNICAST/LOCAL etc. are truncated
 * from this extract.
 */
891 static const int fib6_prop[RTN_MAX + 1] = {
898 [RTN_BLACKHOLE] = -EINVAL,
899 [RTN_UNREACHABLE] = -EHOSTUNREACH,
900 [RTN_PROHIBIT] = -EACCES,
901 [RTN_THROW] = -EAGAIN,
903 [RTN_XRESOLVE] = -EINVAL,
906 static int ip6_rt_type_to_error(u8 fib6_type)
908 return fib6_prop[fib6_type];
/* Derive DST_* flags for dsts cloned from this fib6_info (nocount and
 * nopolicy propagation). NOTE(review): the dst_nocount test line is
 * truncated in this extract.
 */
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
913 unsigned short flags = 0;
916 flags |= DST_NOCOUNT;
917 if (rt->dst_nopolicy)
918 flags |= DST_NOPOLICY;
/* Wire up input/output handlers for REJECT routes based on the fib6
 * type (blackhole drops silently, prohibit answers with admin-prohibited,
 * everything else discards with the mapped error).
 */
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
927 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
929 switch (ort->fib6_type) {
931 rt->dst.output = dst_discard_out;
932 rt->dst.input = dst_discard;
935 rt->dst.output = ip6_pkt_prohibit_out;
936 rt->dst.input = ip6_pkt_prohibit;
939 case RTN_UNREACHABLE:
941 rt->dst.output = ip6_pkt_discard_out;
942 rt->dst.input = ip6_pkt_discard;
/* Initialise a cloned dst from its fib6_info: reject routes get error
 * handlers, otherwise input is chosen by route type (local/multicast/
 * forward) and any lwtunnel state is attached.
 */
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
949 rt->dst.flags |= fib6_info_dst_flags(ort);
951 if (ort->fib6_flags & RTF_REJECT) {
952 ip6_rt_init_dst_reject(rt, ort);
957 rt->dst.output = ip6_output;
959 if (ort->fib6_type == RTN_LOCAL) {
960 rt->dst.input = ip6_input;
961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962 rt->dst.input = ip6_mc_input;
964 rt->dst.input = ip6_forward;
967 if (ort->fib6_nh.nh_lwtstate) {
968 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969 lwtunnel_set_redirect(&rt->dst);
972 rt->dst.lastuse = jiffies;
/* Link a cloned rt6_info back to its originating fib6_info: take a
 * reference, publish the pointer under RCU, and share the fib6 metrics
 * (ref-counted when they are not the defaults).
 */
975 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
977 rt->rt6i_flags &= ~RTF_EXPIRES;
978 fib6_info_hold(from);
979 rcu_assign_pointer(rt->from, from);
980 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
981 if (from->fib6_metrics != &dst_default_metrics) {
982 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
983 refcount_inc(&from->fib6_metrics->refcnt);
/* Copy routing state from a fib6_info into a freshly allocated rt6_info:
 * dst handlers, destination/source keys, gateway, flags, idev reference
 * and lwtunnel state.
 */
987 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
989 struct net_device *dev = fib6_info_nh_dev(ort);
991 ip6_rt_init_dst(rt, ort);
993 rt->rt6i_dst = ort->fib6_dst;
994 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
995 rt->rt6i_gateway = ort->fib6_nh.nh_gw;
996 rt->rt6i_flags = ort->fib6_flags;
997 rt6_set_from(rt, ort);
998 #ifdef CONFIG_IPV6_SUBTREES
999 rt->rt6i_src = ort->fib6_src;
1001 rt->rt6i_prefsrc = ort->fib6_prefsrc;
1002 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
/* Walk back up the fib6 tree from @fn looking for the next node with
 * route info, descending into source-keyed subtrees along the way;
 * stops at the tree root.
 */
1005 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1006 struct in6_addr *saddr)
1008 struct fib6_node *pn, *sn;
1010 if (fn->fn_flags & RTN_TL_ROOT)
1012 pn = rcu_dereference(fn->parent)
1013 sn = FIB6_SUBTREE(pn);
1015 fn = fib6_node_lookup(sn, NULL, saddr);
1018 if (fn->fn_flags & RTN_RTINFO)
/* Try to take a reference on *prt; if the dst is going away, optionally
 * substitute the netns null entry (when @null_fallback) so the caller
 * always ends up with a held dst.
 */
1023 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1026 struct rt6_info *rt = *prt;
1028 if (dst_hold_safe(&rt->dst))
1030 if (null_fallback) {
1031 rt = net->ipv6.ip6_null_entry;
1040 /* called with rcu_lock held */
/* Clone a fib6_info into a standalone rt6_info (no fib attachment),
 * inheriting its dst flags. NOTE(review): the allocation-failure
 * fallback and return are truncated in this extract.
 */
1041 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1043 unsigned short flags = fib6_info_dst_flags(rt);
1044 struct net_device *dev = rt->fib6_nh.nh_dev;
1045 struct rt6_info *nrt;
1047 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1049 ip6_rt_copy_init(nrt, rt);
/* Core policy-table lookup: find the fib6 node for (daddr, saddr), match
 * the device/source constraints, pick an ECMP sibling when applicable,
 * backtrack on miss, then return either a cached exception route, the
 * null entry, or a fresh RCU clone of the fib6_info — always held.
 */
1054 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1055 struct fib6_table *table,
1057 const struct sk_buff *skb,
1060 struct fib6_info *f6i;
1061 struct fib6_node *fn;
1062 struct rt6_info *rt;
1064 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1065 flags &= ~RT6_LOOKUP_F_IFACE;
1068 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1070 f6i = rcu_dereference(fn->leaf);
1072 f6i = net->ipv6.fib6_null_entry;
1074 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1075 fl6->flowi6_oif, flags);
1076 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1077 f6i = fib6_multipath_select(net, f6i, fl6,
1078 fl6->flowi6_oif, skb,
1081 if (f6i == net->ipv6.fib6_null_entry) {
1082 fn = fib6_backtrack(fn, &fl6->saddr);
1087 trace_fib6_table_lookup(net, f6i, table, fl6);
1089 /* Search through exception table */
1090 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1092 if (ip6_hold_safe(net, &rt, true))
1093 dst_use_noref(&rt->dst, jiffies);
1094 } else if (f6i == net->ipv6.fib6_null_entry) {
1095 rt = net->ipv6.ip6_null_entry;
1098 rt = ip6_create_rt_rcu(f6i);
1100 rt = net->ipv6.ip6_null_entry;
1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1111 const struct sk_buff *skb, int flags)
1113 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1115 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by (daddr, saddr, oif): builds a flowi6, performs
 * the rule lookup, and returns the rt6_info on success. @strict forces
 * an interface match. NOTE(review): the error-path dst_release and
 * return NULL lines are truncated in this extract.
 */
1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1118 const struct in6_addr *saddr, int oif,
1119 const struct sk_buff *skb, int strict)
1121 struct flowi6 fl6 = {
1125 struct dst_entry *dst;
1126 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1129 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1130 flags |= RT6_LOOKUP_F_HAS_SADDR;
1133 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1134 if (dst->error == 0)
1135 return (struct rt6_info *) dst;
1141 EXPORT_SYMBOL(rt6_lookup);
1143 /* ip6_ins_rt is called with FREE table->tb6_lock.
1144 * It takes new route entry, the addition fails by any reason the
1145 * route is released.
1146 * Caller must hold dst before calling it.
/* Insert a fib6_info into its table under the table spinlock; the
 * comment block above (orig 1143-1146) documents that the route is
 * released on failure and the caller must hold a dst reference.
 */
1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1150 struct netlink_ext_ack *extack)
1153 struct fib6_table *table;
1155 table = rt->fib6_table;
1156 spin_lock_bh(&table->tb6_lock);
1157 err = fib6_add(&table->tb6_root, rt, info, extack);
1158 spin_unlock_bh(&table->tb6_lock);
1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1165 struct nl_info info = { .nl_net = net, };
1167 return __ip6_ins_rt(rt, &info, NULL);
/* Allocate a host-route (/128) RTF_CACHE clone of @ort for a specific
 * (daddr, saddr): used for PMTU/redirect exceptions. Marks anycast when
 * daddr equals a non-host on-link prefix address.
 */
1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1171 const struct in6_addr *daddr,
1172 const struct in6_addr *saddr)
1174 struct net_device *dev;
1175 struct rt6_info *rt;
1181 dev = ip6_rt_get_dev_rcu(ort);
1182 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1186 ip6_rt_copy_init(rt, ort);
1187 rt->rt6i_flags |= RTF_CACHE;
1188 rt->dst.flags |= DST_HOST;
1189 rt->rt6i_dst.addr = *daddr;
1190 rt->rt6i_dst.plen = 128;
1192 if (!rt6_is_gw_or_nonexthop(ort)) {
1193 if (ort->fib6_dst.plen != 128 &&
1194 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1195 rt->rt6i_flags |= RTF_ANYCAST;
1196 #ifdef CONFIG_IPV6_SUBTREES
1197 if (rt->rt6i_src.plen && saddr) {
1198 rt->rt6i_src.addr = *saddr;
1199 rt->rt6i_src.plen = 128;
/* Allocate a per-cpu RTF_PCPU clone of @rt, inheriting its dst flags.
 * NOTE(review): the NULL-check and return lines are truncated here.
 */
1207 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1209 unsigned short flags = fib6_info_dst_flags(rt);
1210 struct net_device *dev;
1211 struct rt6_info *pcpu_rt;
1214 dev = ip6_rt_get_dev_rcu(rt);
1215 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1219 ip6_rt_copy_init(pcpu_rt, rt);
1220 pcpu_rt->rt6i_flags |= RTF_PCPU;
1224 /* It should be called with rcu_read_lock() acquired */
/* Fetch this CPU's cached clone of @rt, taking a safe hold (no null
 * fallback) before handing it back.
 */
1225 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1227 struct rt6_info *pcpu_rt, **p;
1229 p = this_cpu_ptr(rt->rt6i_pcpu);
1233 ip6_hold_safe(NULL, &pcpu_rt, false);
/* Create and publish the per-cpu clone for @rt using cmpxchg so a
 * concurrent writer's entry wins; on allocation failure return the held
 * netns null entry.
 */
1238 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1239 struct fib6_info *rt)
1241 struct rt6_info *pcpu_rt, *prev, **p;
1243 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1245 dst_hold(&net->ipv6.ip6_null_entry->dst);
1246 return net->ipv6.ip6_null_entry;
1249 dst_hold(&pcpu_rt->dst);
1250 p = this_cpu_ptr(rt->rt6i_pcpu);
1251 prev = cmpxchg(p, NULL, pcpu_rt);
1257 /* exception hash table implementation
1259 static DEFINE_SPINLOCK(rt6_exception_lock);
1261 /* Remove rt6_ex from hash table and free the memory
1262 * Caller must hold rt6_exception_lock
/* Drops the exception's dst reference, frees the entry via RCU, and
 * updates bucket depth and the per-netns cache counter.
 */
1264 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1265 struct rt6_exception *rt6_ex)
1269 if (!bucket || !rt6_ex)
1272 net = dev_net(rt6_ex->rt6i->dst.dev);
1273 hlist_del_rcu(&rt6_ex->hlist);
1274 dst_release(&rt6_ex->rt6i->dst);
1275 kfree_rcu(rt6_ex, rcu);
1276 WARN_ON_ONCE(!bucket->depth);
1278 net->ipv6.rt6_stats->fib_rt_cache--;
1281 /* Remove oldest rt6_ex in bucket and free the memory
1282 * Caller must hold rt6_exception_lock
/* Evict the least-recently stamped exception from an over-full bucket. */
1284 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1286 struct rt6_exception *rt6_ex, *oldest = NULL;
1291 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1292 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1295 rt6_remove_exception(bucket, oldest);
/* Hash (dst[, src]) with a boot-time random seed into an exception
 * bucket index; src only contributes with IPv6 subtrees enabled.
 */
1298 static u32 rt6_exception_hash(const struct in6_addr *dst,
1299 const struct in6_addr *src)
1301 static u32 seed __read_mostly;
1304 net_get_random_once(&seed, sizeof(seed));
1305 val = jhash(dst, sizeof(*dst), seed);
1307 #ifdef CONFIG_IPV6_SUBTREES
1309 val = jhash(src, sizeof(*src), val);
1311 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1314 /* Helper function to find the cached rt in the hash table
1315 * and update bucket pointer to point to the bucket for this
1316 * (daddr, saddr) pair
1317 * Caller must hold rt6_exception_lock
/* Spinlock-protected variant of the exception lookup: advance *bucket
 * to the hashed slot for (daddr, saddr) and scan its chain for an
 * address match (src considered only with subtrees enabled).
 */
1319 static struct rt6_exception *
1320 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1321 const struct in6_addr *daddr,
1322 const struct in6_addr *saddr)
1324 struct rt6_exception *rt6_ex;
1327 if (!(*bucket) || !daddr)
1330 hval = rt6_exception_hash(daddr, saddr);
1333 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1334 struct rt6_info *rt6 = rt6_ex->rt6i;
1335 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1337 #ifdef CONFIG_IPV6_SUBTREES
1338 if (matched && saddr)
1339 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1347 /* Helper function to find the cached rt in the hash table
1348 * and update bucket pointer to point to the bucket for this
1349 * (daddr, saddr) pair
1350 * Caller must hold rcu_read_lock()
/* RCU-protected variant of the exception lookup — same matching logic
 * as the spinlock variant but traverses the chain with
 * hlist_for_each_entry_rcu() and asserts rcu_read_lock is held.
 */
1352 static struct rt6_exception *
1353 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1354 const struct in6_addr *daddr,
1355 const struct in6_addr *saddr)
1357 struct rt6_exception *rt6_ex;
1360 WARN_ON_ONCE(!rcu_read_lock_held());
1362 if (!(*bucket) || !daddr)
1365 hval = rt6_exception_hash(daddr, saddr);
1368 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1369 struct rt6_info *rt6 = rt6_ex->rt6i;
1370 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1372 #ifdef CONFIG_IPV6_SUBTREES
1373 if (matched && saddr)
1374 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
/* Effective MTU of a fib6 route: explicit PMTU if set, otherwise the
 * nexthop device's idev MTU; clamped to IP6_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */
1382 static unsigned int fib6_mtu(const struct fib6_info *rt)
1386 if (rt->fib6_pmtu) {
1387 mtu = rt->fib6_pmtu;
1389 struct net_device *dev = fib6_info_nh_dev(rt);
1390 struct inet6_dev *idev;
1393 idev = __in6_dev_get(dev);
1394 mtu = idev->cnf.mtu6;
1398 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1400 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
/* Insert cached route @nrt as a PMTU/redirect exception of @ort:
 * allocate the bucket array on first use, refuse insertion after a
 * flush or when nrt's MTU is not below ort's, replace any existing
 * entry for the same (daddr[, saddr]) key, evict the oldest entry when
 * the bucket overflows, then bump the fib serial number so stale dsts
 * are invalidated. All under rt6_exception_lock.
 */
1403 static int rt6_insert_exception(struct rt6_info *nrt,
1404 struct fib6_info *ort)
1406 struct net *net = dev_net(nrt->dst.dev);
1407 struct rt6_exception_bucket *bucket;
1408 struct in6_addr *src_key = NULL;
1409 struct rt6_exception *rt6_ex;
1412 spin_lock_bh(&rt6_exception_lock);
1414 if (ort->exception_bucket_flushed) {
1419 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1420 lockdep_is_held(&rt6_exception_lock));
1422 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1428 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1431 #ifdef CONFIG_IPV6_SUBTREES
1432 /* rt6i_src.plen != 0 indicates ort is in subtree
1433 * and exception table is indexed by a hash of
1434 * both rt6i_dst and rt6i_src.
1435 * Otherwise, the exception table is indexed by
1436 * a hash of only rt6i_dst.
1438 if (ort->fib6_src.plen)
1439 src_key = &nrt->rt6i_src.addr;
1442 /* Update rt6i_prefsrc as it could be changed
1443 * in rt6_remove_prefsrc()
1445 nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1446 /* rt6_mtu_change() might lower mtu on ort.
1447 * Only insert this exception route if its mtu
1448 * is less than ort's mtu value.
1450 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1455 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1458 rt6_remove_exception(bucket, rt6_ex);
1460 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1466 rt6_ex->stamp = jiffies;
1467 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1469 net->ipv6.rt6_stats->fib_rt_cache++;
1471 if (bucket->depth > FIB6_MAX_DEPTH)
1472 rt6_exception_remove_oldest(bucket);
1475 spin_unlock_bh(&rt6_exception_lock);
1477 /* Update fn->fn_sernum to invalidate all cached dst */
1479 spin_lock_bh(&ort->fib6_table->tb6_lock);
1480 fib6_update_sernum(net, ort);
1481 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1482 fib6_force_start_gc(net);
/* rt6_flush_exceptions - drop every cached exception route hanging off @rt.
 * Marks the route flushed first (under rt6_exception_lock) so that a
 * concurrent rt6_insert_exception() cannot recreate the bucket list.
 */
1488 void rt6_flush_exceptions(struct fib6_info *rt)
1490 struct rt6_exception_bucket *bucket;
1491 struct rt6_exception *rt6_ex;
1492 struct hlist_node *tmp;
1495 spin_lock_bh(&rt6_exception_lock);
1496 /* Prevent rt6_insert_exception() to recreate the bucket list */
1497 rt->exception_bucket_flushed = 1;
1499 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1500 lockdep_is_held(&rt6_exception_lock));
/* Walk all buckets; _safe iteration because entries are unlinked. */
1504 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1505 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1506 rt6_remove_exception(bucket, rt6_ex);
1507 WARN_ON_ONCE(bucket->depth);
1512 spin_unlock_bh(&rt6_exception_lock);
1515 /* Find cached rt in the hash table inside passed in rt
1516 * Caller has to hold rcu_read_lock()
/* Looks up a non-expired exception entry keyed by @daddr (plus @saddr
 * when the route lives in a source-routed subtree); returns the cached
 * rt6_info or NULL. Pure RCU reader -- no locks taken.
 */
1518 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1519 struct in6_addr *daddr,
1520 struct in6_addr *saddr)
1522 struct rt6_exception_bucket *bucket;
1523 struct in6_addr *src_key = NULL;
1524 struct rt6_exception *rt6_ex;
1525 struct rt6_info *res = NULL;
1527 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1529 #ifdef CONFIG_IPV6_SUBTREES
1530 /* rt6i_src.plen != 0 indicates rt is in subtree
1531 * and exception table is indexed by a hash of
1532 * both rt6i_dst and rt6i_src.
1533 * Otherwise, the exception table is indexed by
1534 * a hash of only rt6i_dst.
1536 if (rt->fib6_src.plen)
1539 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key)
;
/* Expired entries are treated as a miss; GC will reap them later. */
1541 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1547 /* Remove the passed in cached rt from the hash table that contains it */
/* Only RTF_CACHE routes with a live 'from' and an allocated bucket are
 * eligible; the entry is unlinked under rt6_exception_lock.
 * Returns 0 on removal, negative errno otherwise (elided here).
 */
1548 static int rt6_remove_exception_rt(struct rt6_info *rt)
1550 struct rt6_exception_bucket *bucket;
1551 struct in6_addr *src_key = NULL;
1552 struct rt6_exception *rt6_ex;
1553 struct fib6_info *from;
1556 from = rcu_dereference(rt->from);
1558 !(rt->rt6i_flags & RTF_CACHE))
/* Cheap RCU peek before taking the spinlock. */
1561 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1564 spin_lock_bh(&rt6_exception_lock);
1565 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1566 lockdep_is_held(&rt6_exception_lock));
1567 #ifdef CONFIG_IPV6_SUBTREES
1568 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1569 * and exception table is indexed by a hash of
1570 * both rt6i_dst and rt6i_src.
1571 * Otherwise, the exception table is indexed by
1572 * a hash of only rt6i_dst.
1574 if (from->fib6_src.plen)
1575 src_key = &rt->rt6i_src.addr;
1577 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1581 rt6_remove_exception(bucket, rt6_ex);
1587 spin_unlock_bh(&rt6_exception_lock);
1591 /* Find rt6_ex which contains the passed in rt cache and
/* ... refresh its LRU stamp to now (jiffies) so exception-cache aging
 * and oldest-entry eviction see it as recently used. RCU lookup only.
 */
1594 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1596 struct rt6_exception_bucket *bucket;
1597 struct fib6_info *from = rt->from;
1598 struct in6_addr *src_key = NULL;
1599 struct rt6_exception *rt6_ex;
/* Only cached clones live in the exception table. */
1602 !(rt->rt6i_flags & RTF_CACHE))
1606 bucket = rcu_dereference(from->rt6i_exception_bucket);
1608 #ifdef CONFIG_IPV6_SUBTREES
1609 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1610 * and exception table is indexed by a hash of
1611 * both rt6i_dst and rt6i_src.
1612 * Otherwise, the exception table is indexed by
1613 * a hash of only rt6i_dst.
1615 if (from->fib6_src.plen)
1616 src_key = &rt->rt6i_src.addr;
1618 rt6_ex = __rt6_find_exception_rcu(&bucket,
1622 rt6_ex->stamp = jiffies;
/* rt6_exceptions_remove_prefsrc - clear the preferred-source address on
 * every cached exception of @rt (plen = 0 marks it unset). Caller is
 * expected to hold rt6_exception_lock, per the _protected dereference.
 */
1627 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1629 struct rt6_exception_bucket *bucket;
1630 struct rt6_exception *rt6_ex;
1633 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1634 lockdep_is_held(&rt6_exception_lock));
1637 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1638 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1639 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
/* rt6_mtu_change_route_allowed - may a device MTU change of @mtu be
 * propagated onto cached route @rt? True when the change is a decrease,
 * or when the route's current PMTU equals the local link MTU (meaning
 * the local link was the path bottleneck).
 */
1646 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1647 struct rt6_info *rt, int mtu)
1649 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1650 * lowest MTU in the path: always allow updating the route PMTU to
1651 * reflect PMTU decreases.
1653 * If the new MTU is higher, and the route PMTU is equal to the local
1654 * MTU, this means the old MTU is the lowest in the path, so allow
1655 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1659 if (dst_mtu(&rt->dst) >= mtu)
1662 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
/* rt6_exceptions_update_pmtu - on a device MTU change, update the RTAX_MTU
 * metric of each cached exception of @rt where the change is permitted by
 * rt6_mtu_change_route_allowed(). Uses the lock-protected bucket access.
 */
1668 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1669 struct fib6_info *rt, int mtu)
1671 struct rt6_exception_bucket *bucket;
1672 struct rt6_exception *rt6_ex;
1675 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1676 lockdep_is_held(&rt6_exception_lock));
1681 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1682 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1683 struct rt6_info *entry = rt6_ex->rt6i;
1685 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1686 * route), the metrics of its rt->from have already
/* Only entries with their own raw MTU metric set are touched. */
1689 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1690 rt6_mtu_change_route_allowed(idev, entry, mtu))
1691 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
/* Combined flag mask: a cached clone that also routes via a gateway. */
1697 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
/* rt6_exceptions_clean_tohost - remove every cached gateway exception of
 * @rt whose gateway equals @gateway (e.g. when that router goes away).
 */
1699 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1700 struct in6_addr *gateway)
1702 struct rt6_exception_bucket *bucket;
1703 struct rt6_exception *rt6_ex;
1704 struct hlist_node *tmp;
1707 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1710 spin_lock_bh(&rt6_exception_lock);
1711 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1712 lockdep_is_held(&rt6_exception_lock));
1715 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1716 hlist_for_each_entry_safe(rt6_ex, tmp,
1717 &bucket->chain, hlist) {
1718 struct rt6_info *entry = rt6_ex->rt6i;
/* Match only entries that are both cached AND gatewayed. */
1720 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1721 RTF_CACHE_GATEWAY &&
1722 ipv6_addr_equal(gateway,
1723 &entry->rt6i_gateway)) {
1724 rt6_remove_exception(bucket, rt6_ex);
1731 spin_unlock_bh(&rt6_exception_lock);
/* rt6_age_examine_exception - GC policy for one exception entry:
 * prune non-gateway clones idle longer than gc_args->timeout, prune
 * RTF_EXPIRES entries past dst.expires (RFC 8201 sec. 4), and prune
 * gateway clones whose neighbour lost the NTF_ROUTER flag.
 */
1734 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1735 struct rt6_exception *rt6_ex,
1736 struct fib6_gc_args *gc_args,
1739 struct rt6_info *rt = rt6_ex->rt6i;
1741 /* we are pruning and obsoleting aged-out and non gateway exceptions
1742 * even if others have still references to them, so that on next
1743 * dst_check() such references can be dropped.
1744 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1745 * expired, independently from their aging, as per RFC 8201 section 4
1747 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1748 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1749 RT6_TRACE("aging clone %p\n", rt);
1750 rt6_remove_exception(bucket, rt6_ex);
1753 } else if (time_after(jiffies, rt->dst.expires)) {
1754 RT6_TRACE("purging expired route %p\n", rt);
1755 rt6_remove_exception(bucket, rt6_ex);
1759 if (rt->rt6i_flags & RTF_GATEWAY) {
1760 struct neighbour *neigh;
1761 __u8 neigh_flags = 0;
/* _noref lookup: caller context holds rcu_read_lock_bh (see
 * rt6_age_exceptions). */
1763 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1765 neigh_flags = neigh->flags;
1767 if (!(neigh_flags & NTF_ROUTER)) {
1768 RT6_TRACE("purging route %p via non-router but gateway\n",
1770 rt6_remove_exception(bucket, rt6_ex);
/* rt6_age_exceptions - walk @rt's exception table and apply
 * rt6_age_examine_exception() to each entry. Takes rt6_exception_lock
 * inside an rcu_read_lock_bh section (the unlock is visible at 1806).
 */
1778 void rt6_age_exceptions(struct fib6_info *rt,
1779 struct fib6_gc_args *gc_args,
1782 struct rt6_exception_bucket *bucket;
1783 struct rt6_exception *rt6_ex;
1784 struct hlist_node *tmp;
1787 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1791 spin_lock(&rt6_exception_lock);
1792 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1793 lockdep_is_held(&rt6_exception_lock));
1796 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1797 hlist_for_each_entry_safe(rt6_ex, tmp,
1798 &bucket->chain, hlist) {
1799 rt6_age_examine_exception(bucket, rt6_ex,
1805 spin_unlock(&rt6_exception_lock);
1806 rcu_read_unlock_bh();
1809 /* must be called with rcu lock held */
/* fib6_table_lookup - core FIB lookup in one table: find the matching
 * node, select a route, and on a null-entry result backtrack up the tree,
 * finally retrying once without the REACHABLE strictness.
 */
1810 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1811 int oif, struct flowi6 *fl6, int strict)
1813 struct fib6_node *fn, *saved_fn;
1814 struct fib6_info *f6i;
1816 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
/* VRF/l3mdev lookups may skip the nexthop oif check. */
1819 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1823 f6i = rt6_select(net, fn, oif, strict);
1824 if (f6i == net->ipv6.fib6_null_entry) {
1825 fn = fib6_backtrack(fn, &fl6->saddr);
1827 goto redo_rt6_select;
1828 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1829 /* also consider unreachable route */
1830 strict &= ~RT6_LOOKUP_F_REACHABLE;
1832 goto redo_rt6_select;
1836 trace_fib6_table_lookup(net, f6i, table, fl6);
/* ip6_pol_route - policy-routing lookup entry point. Resolves a
 * fib6_info via fib6_table_lookup() (multipath-selected if siblings
 * exist), then materializes a dst: a cached exception, an uncached
 * RTF_CACHE clone for the KNOWN_NH special case, or a per-cpu copy.
 */
1841 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1842 int oif, struct flowi6 *fl6,
1843 const struct sk_buff *skb, int flags)
1845 struct fib6_info *f6i;
1846 struct rt6_info *rt;
1849 strict |= flags & RT6_LOOKUP_F_IFACE;
1850 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* On a pure host (forwarding off) prefer reachable routers. */
1851 if (net->ipv6.devconf_all->forwarding == 0)
1852 strict |= RT6_LOOKUP_F_REACHABLE;
1856 f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1857 if (f6i->fib6_nsiblings)
1858 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1860 if (f6i == net->ipv6.fib6_null_entry) {
1861 rt = net->ipv6.ip6_null_entry;
1867 /*Search through exception table */
1868 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1870 if (ip6_hold_safe(net, &rt, true))
1871 dst_use_noref(&rt->dst, jiffies);
1875 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1876 !(f6i->fib6_flags & RTF_GATEWAY))) {
1877 /* Create a RTF_CACHE clone which will not be
1878 * owned by the fib6 tree. It is for the special case where
1879 * the daddr in the skb during the neighbor look-up is different
1880 * from the fl6->daddr used to look-up route here.
1882 struct rt6_info *uncached_rt;
1884 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1889 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1890 * No need for another dst_hold()
1892 rt6_uncached_list_add(uncached_rt);
1893 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
/* Allocation failure path: fall back to the null entry. */
1895 uncached_rt = net->ipv6.ip6_null_entry;
1896 dst_hold(&uncached_rt->dst);
1901 /* Get a percpu copy */
1903 struct rt6_info *pcpu_rt;
1906 pcpu_rt = rt6_get_pcpu_route(f6i);
1909 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1917 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* ip6_pol_route_input - input-path wrapper: routes on the incoming
 * interface (flowi6_iif). */
1919 static struct rt6_info *ip6_pol_route_input(struct net *net,
1920 struct fib6_table *table,
1922 const struct sk_buff *skb,
1925 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
/* ip6_route_input_lookup - input route lookup through policy rules.
 * Forces strict interface matching for link-local/scoped destinations
 * (except on PIM register pseudo-devices).
 */
1928 struct dst_entry *ip6_route_input_lookup(struct net *net,
1929 struct net_device *dev,
1931 const struct sk_buff *skb,
1934 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1935 flags |= RT6_LOOKUP_F_IFACE;
1937 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1939 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* ip6_multipath_l3_keys - fill L3 flow keys for multipath hashing.
 * For ICMPv6 error messages, hash on the *inner* (offending) header so
 * errors follow the same path as the flow they refer to; otherwise use
 * the outer header (or pre-dissected @flkeys when available).
 */
1941 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1942 struct flow_keys *keys,
1943 struct flow_keys *flkeys)
1945 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1946 const struct ipv6hdr *key_iph = outer_iph;
1947 struct flow_keys *_flkeys = flkeys;
1948 const struct ipv6hdr *inner_iph;
1949 const struct icmp6hdr *icmph;
1950 struct ipv6hdr _inner_iph;
1951 struct icmp6hdr _icmph;
/* Common case: not ICMPv6 -- keep the outer header. */
1953 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1956 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1957 sizeof(_icmph), &_icmph);
/* Only ICMPv6 *error* types carry an embedded offending packet. */
1961 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1962 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1963 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1964 icmph->icmp6_type != ICMPV6_PARAMPROB)
1967 inner_iph = skb_header_pointer(skb,
1968 skb_transport_offset(skb) + sizeof(*icmph),
1969 sizeof(_inner_iph), &_inner_iph);
1973 key_iph = inner_iph;
/* If a dissected key set was supplied, copy from it... */
1977 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1978 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1979 keys->tags.flow_label = _flkeys->tags.flow_label;
1980 keys->basic.ip_proto = _flkeys->basic.ip_proto;
/* ...otherwise read the fields straight from the chosen header. */
1982 keys->addrs.v6addrs.src = key_iph->saddr;
1983 keys->addrs.v6addrs.dst = key_iph->daddr;
1984 keys->tags.flow_label = ip6_flowlabel(key_iph);
1985 keys->basic.ip_proto = key_iph->nexthdr;
1989 /* if skb is set it will be used and fl6 can be NULL */
/* rt6_multipath_hash - compute the ECMP hash per the netns policy:
 * L3 (addresses + flow label + proto) or L4 (5-tuple). Sources keys
 * from skb (via dissection), pre-computed @flkeys, or @fl6.
 */
1990 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1991 const struct sk_buff *skb, struct flow_keys *flkeys)
1993 struct flow_keys hash_keys;
1996 switch (ip6_multipath_hash_policy(net)) {
/* Policy 0: L3 hash. */
1998 memset(&hash_keys, 0, sizeof(hash_keys));
1999 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2001 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2003 hash_keys.addrs.v6addrs.src = fl6->saddr;
2004 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2005 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2006 hash_keys.basic.ip_proto = fl6->flowi6_proto;
/* Policy 1: L4 hash. */
2011 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2012 struct flow_keys keys;
2014 /* short-circuit if we already have L4 hash present */
/* >> 1 below matches the final hash's "reserve bit 0" convention. */
2016 return skb_get_hash_raw(skb) >> 1;
2018 memset(&hash_keys, 0, sizeof(hash_keys));
2021 skb_flow_dissect_flow_keys(skb, &keys, flag);
2024 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2025 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2026 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2027 hash_keys.ports.src = flkeys->ports.src;
2028 hash_keys.ports.dst = flkeys->ports.dst;
2029 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
/* No skb: build the L4 tuple from the flow descriptor. */
2031 memset(&hash_keys, 0, sizeof(hash_keys));
2032 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2033 hash_keys.addrs.v6addrs.src = fl6->saddr;
2034 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2035 hash_keys.ports.src = fl6->fl6_sport;
2036 hash_keys.ports.dst = fl6->fl6_dport;
2037 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2041 mhash = flow_hash_from_keys(&hash_keys);
/* ip6_route_input - set the input route (skb dst) for a received packet.
 * Builds a flowi6 from the IPv6 header, honors collect_md tunnel info,
 * optionally pre-dissects flow keys, and computes the multipath hash for
 * ICMPv6 so errors follow their flow.
 */
2046 void ip6_route_input(struct sk_buff *skb)
2048 const struct ipv6hdr *iph = ipv6_hdr(skb);
2049 struct net *net = dev_net(skb->dev);
2050 int flags = RT6_LOOKUP_F_HAS_SADDR;
2051 struct ip_tunnel_info *tun_info;
2052 struct flowi6 fl6 = {
2053 .flowi6_iif = skb->dev->ifindex,
2054 .daddr = iph->daddr,
2055 .saddr = iph->saddr,
2056 .flowlabel = ip6_flowinfo(iph),
2057 .flowi6_mark = skb->mark,
2058 .flowi6_proto = iph->nexthdr,
2060 struct flow_keys *flkeys = NULL, _flkeys;
/* Carry the tunnel id for RX metadata-collect tunnels. */
2062 tun_info = skb_tunnel_info(skb);
2063 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2064 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2066 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2069 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2070 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2073 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
/* ip6_pol_route_output - output-path wrapper: routes on the outgoing
 * interface (flowi6_oif). */
2076 static struct rt6_info *ip6_pol_route_output(struct net *net,
2077 struct fib6_table *table,
2079 const struct sk_buff *skb,
2082 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
/* ip6_route_output_flags - main output route lookup. Link-local/scoped
 * destinations may be resolved by an l3mdev; otherwise set strictness
 * flags (bound device, unspecified source with oif, source preferences)
 * and go through the policy rules.
 */
2085 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2086 struct flowi6 *fl6, int flags)
2090 if (rt6_need_strict(&fl6->daddr)) {
2091 struct dst_entry *dst;
2093 dst = l3mdev_link_scope_lookup(net, fl6);
2098 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2100 any_src = ipv6_addr_any(&fl6->saddr);
2101 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2102 (fl6->flowi6_oif && any_src))
2103 flags |= RT6_LOOKUP_F_IFACE;
2106 flags |= RT6_LOOKUP_F_HAS_SADDR;
/* Fold the socket's RFC 5014 source-address preferences in. */
2108 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2110 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2112 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* ip6_blackhole_route - clone @dst_orig into a blackhole dst (used e.g.
 * while xfrm resolution is pending): input/output discard, loopback
 * device, metrics and keys copied from the original. Always releases
 * @dst_orig; returns the clone or ERR_PTR(-ENOMEM).
 */
2114 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2116 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2117 struct net_device *loopback_dev = net->loopback_dev;
2118 struct dst_entry *new = NULL;
2120 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2121 DST_OBSOLETE_DEAD, 0);
2124 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
/* Swallow all traffic: both directions are discard hooks. */
2128 new->input = dst_discard;
2129 new->output = dst_discard_out;
2131 dst_copy_metrics(new, &ort->dst);
2133 rt->rt6i_idev = in6_dev_get(loopback_dev);
2134 rt->rt6i_gateway = ort->rt6i_gateway;
/* RTF_PCPU is dropped: the clone is not a per-cpu cache entry. */
2135 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2137 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2138 #ifdef CONFIG_IPV6_SUBTREES
2139 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2143 dst_release(dst_orig);
2144 return new ? new : ERR_PTR(-ENOMEM);
2148 * Destination cache support functions
/* fib6_check - is @f6i still valid for a dst bearing @cookie?
 * False when the fib node's serial number moved on or the route expired.
 */
2151 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2155 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2158 if (fib6_check_expired(f6i))
/* rt6_check - dst_check helper for tree-owned dsts: valid only while the
 * originating fib6_info's cookie matches and the clone hasn't expired.
 * Returns the dst on success, NULL when the caller must re-resolve.
 */
2164 static struct dst_entry *rt6_check(struct rt6_info *rt,
2165 struct fib6_info *from,
2170 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2171 rt_cookie != cookie)
2174 if (rt6_check_expired(rt))
/* rt6_dst_from_check - dst_check helper for pcpu/uncached dsts: validity
 * is tied to the 'from' route via fib6_check(), plus the dst's own
 * expiry and DST_OBSOLETE_FORCE_CHK marker.
 */
2180 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2181 struct fib6_info *from,
2184 if (!__rt6_check_expired(rt) &&
2185 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2186 fib6_check(from, cookie)
/* ip6_dst_check - the dst_ops->check hook: decide whether a cached dst is
 * still usable for @cookie, dispatching to the pcpu/uncached or the
 * tree-owned validation path.
 */
2192 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2194 struct dst_entry *dst_ret;
2195 struct fib6_info *from;
2196 struct rt6_info *rt;
2198 rt = container_of(dst, struct rt6_info, dst);
2202 /* All IPV6 dsts are created with ->obsolete set to the value
2203 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2204 * into this function always.
2207 from = rcu_dereference(rt->from);
/* pcpu copies and uncached-list members validate via 'from'. */
2209 if (from && (rt->rt6i_flags & RTF_PCPU ||
2210 unlikely(!list_empty(&rt->rt6i_uncached))))
2211 dst_ret = rt6_dst_from_check(rt, from, cookie);
2213 dst_ret = rt6_check(rt, from, cookie);
/* ip6_negative_advice - dst_ops->negative_advice hook: when the stack
 * reports trouble with a cached (RTF_CACHE) route that has expired,
 * drop it from its exception table.
 */
2220 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2222 struct rt6_info *rt = (struct rt6_info *) dst;
2225 if (rt->rt6i_flags & RTF_CACHE) {
2227 if (rt6_check_expired(rt)) {
2228 rt6_remove_exception_rt(rt);
/* ip6_link_failure - dst_ops->link_failure hook: report address
 * unreachable via ICMPv6, then invalidate the failing route -- remove a
 * cached clone, or (for RTF_DEFAULT tree routes) bump the node serial.
 */
2240 static void ip6_link_failure(struct sk_buff *skb)
2242 struct rt6_info *rt;
2244 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2246 rt = (struct rt6_info *) skb_dst(skb);
2249 if (rt->rt6i_flags & RTF_CACHE) {
/* Hold a ref across removal to keep rt alive. */
2250 if (dst_hold_safe(&rt->dst))
2251 rt6_remove_exception_rt(rt);
2253 struct fib6_info *from;
2254 struct fib6_node *fn;
2256 from = rcu_dereference(rt->from);
2258 fn = rcu_dereference(from->fib6_node);
2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* rt6_update_expires - arm an expiry on cached route @rt0 @timeout from
 * now, marking it RTF_EXPIRES. If it wasn't already expiring, seed
 * dst.expires from the parent route's expiry first.
 */
2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2270 struct fib6_info *from;
2273 from = rcu_dereference(rt0->from);
2275 rt0->dst.expires = from->expires;
2279 dst_set_expires(&rt0->dst, timeout);
2280 rt0->rt6i_flags |= RTF_EXPIRES;
/* rt6_do_update_pmtu - record a new PMTU on @rt: set the RTAX_MTU metric,
 * mark the route modified, and start the sysctl-controlled expiry timer
 * (ip6_rt_mtu_expires), per PMTU-discovery aging rules.
 */
2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2285 struct net *net = dev_net(rt->dst.dev);
2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2288 rt->rt6i_flags |= RTF_MODIFIED;
2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* rt6_cache_allowed_for_pmtu - true when a PMTU update on @rt should go
 * through a new exception-cache clone: rt is not itself a cached clone,
 * and is either a pcpu copy or still linked to a 'from' route.
 */
2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2297 from_set = !!rcu_dereference(rt->from);
2300 return !(rt->rt6i_flags & RTF_CACHE) &&
2301 (rt->rt6i_flags & RTF_PCPU || from_set);
/* __ip6_rt_update_pmtu - core PMTU update. Addresses come from @iph or,
 * if absent, the socket. Only MTU *decreases* are applied (clamped to
 * IPV6_MIN_MTU); either update the dst in place (refreshing its
 * exception stamp when cached) or insert a new RTF_CACHE clone.
 */
2304 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2305 const struct ipv6hdr *iph, u32 mtu)
2307 const struct in6_addr *daddr, *saddr;
2308 struct rt6_info *rt6 = (struct rt6_info *)dst;
/* A locked MTU metric must never be changed by PMTU discovery. */
2310 if (dst_metric_locked(dst, RTAX_MTU))
2314 daddr = &iph->daddr;
2315 saddr = &iph->saddr;
2317 daddr = &sk->sk_v6_daddr;
2318 saddr = &inet6_sk(sk)->saddr;
2323 dst_confirm_neigh(dst, daddr);
2324 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2325 if (mtu >= dst_mtu(dst))
2328 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2329 rt6_do_update_pmtu(rt6, mtu);
2330 /* update rt6_ex->stamp for cache */
2331 if (rt6->rt6i_flags & RTF_CACHE)
2332 rt6_update_exception_stamp_rt(rt6);
2334 struct fib6_info *from;
2335 struct rt6_info *nrt6;
2338 from = rcu_dereference(rt6->from);
2339 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2341 rt6_do_update_pmtu(nrt6, mtu);
/* On insert failure the clone is ours to free immediately. */
2342 if (rt6_insert_exception(nrt6, from))
2343 dst_release_immediate(&nrt6->dst);
/* ip6_rt_update_pmtu - dst_ops->update_pmtu hook; thin wrapper passing
 * the skb's IPv6 header (or NULL) to __ip6_rt_update_pmtu(). */
2349 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2350 struct sk_buff *skb, u32 mtu)
2352 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* ip6_update_pmtu - apply a PMTU update for the flow described by the
 * (inner) IPv6 header at skb->data: re-resolve the output route and feed
 * it to __ip6_rt_update_pmtu(). @mtu arrives in network byte order.
 */
2355 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2356 int oif, u32 mark, kuid_t uid)
2358 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2359 struct dst_entry *dst;
2362 memset(&fl6, 0, sizeof(fl6));
2363 fl6.flowi6_oif = oif;
/* Fall back to the netns reply-mark policy when no mark was given. */
2364 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2365 fl6.daddr = iph->daddr;
2366 fl6.saddr = iph->saddr;
2367 fl6.flowlabel = ip6_flowinfo(iph);
2368 fl6.flowi6_uid = uid;
2370 dst = ip6_route_output(net, NULL, &fl6);
2372 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2375 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* ip6_sk_update_pmtu - PMTU update on behalf of a connected socket; after
 * updating the route, refresh the socket's cached dst if it became
 * invalid (skipping v4-mapped peers and lock-held sockets).
 */
2377 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2379 struct dst_entry *dst;
2381 ip6_update_pmtu(skb, sock_net(sk), mtu,
2382 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2384 dst = __sk_dst_get(sk);
2385 if (!dst || !dst->obsolete ||
2386 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2390 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2391 ip6_datagram_dst_update(sk, false);
2394 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* ip6_sk_dst_store_flow - cache @dst on @sk, recording the destination
 * (and, with subtrees, source) only when it matches the socket's own
 * addresses, so the cookie-based validation stays correct.
 */
2396 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2397 const struct flowi6 *fl6)
2399 #ifdef CONFIG_IPV6_SUBTREES
2400 struct ipv6_pinfo *np = inet6_sk(sk);
2403 ip6_dst_store(sk, dst,
2404 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2405 &sk->sk_v6_daddr : NULL,
2406 #ifdef CONFIG_IPV6_SUBTREES
2407 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2413 /* Handle redirects */
/* Flow descriptor extended with the redirecting gateway's address, so
 * the redirect lookup can verify the sender is the current nexthop. */
2414 struct ip6rd_flowi {
2416 struct in6_addr gateway;
/* __ip6_route_redirect - find the route a redirect applies to. Per
 * RFC 4861 a redirect is only accepted from the current nexthop, so the
 * walk skips dead/expired/reject/non-gateway routes and also consults
 * the exception table, whose clones may carry a different gateway.
 */
2419 static struct rt6_info *__ip6_route_redirect(struct net *net,
2420 struct fib6_table *table,
2422 const struct sk_buff *skb,
2425 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2426 struct rt6_info *ret = NULL, *rt_cache;
2427 struct fib6_info *rt;
2428 struct fib6_node *fn;
2430 /* Get the "current" route for this destination and
2431 * check if the redirect has come from appropriate router.
2433 * RFC 4861 specifies that redirects should only be
2434 * accepted if they come from the nexthop to the target.
2435 * Due to the way the routes are chosen, this notion
2436 * is a bit fuzzy and one might need to check all possible
2441 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2443 for_each_fib6_node_rt_rcu(fn) {
2444 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2446 if (fib6_check_expired(rt))
2448 if (rt->fib6_flags & RTF_REJECT)
2450 if (!(rt->fib6_flags & RTF_GATEWAY))
2452 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2454 /* rt_cache's gateway might be different from its 'parent'
2455 * in the case of an ip redirect.
2456 * So we keep searching in the exception table if the gateway
2459 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2460 rt_cache = rt6_find_cached_rt(rt,
2464 ipv6_addr_equal(&rdfl->gateway,
2465 &rt_cache->rt6i_gateway)) {
2475 rt = net->ipv6.fib6_null_entry;
2476 else if (rt->fib6_flags & RTF_REJECT) {
2477 ret = net->ipv6.ip6_null_entry;
/* Nothing matched at this node: backtrack up the fib tree. */
2481 if (rt == net->ipv6.fib6_null_entry) {
2482 fn = fib6_backtrack(fn, &fl6->saddr);
2489 dst_hold(&ret->dst);
2491 ret = ip6_create_rt_rcu(rt);
2495 trace_fib6_table_lookup(net, rt, table, fl6);
/* ip6_route_redirect - wrap @fl6 in an ip6rd_flowi carrying @gateway and
 * resolve it through the policy rules with __ip6_route_redirect(). */
2499 static struct dst_entry *ip6_route_redirect(struct net *net,
2500 const struct flowi6 *fl6,
2501 const struct sk_buff *skb,
2502 const struct in6_addr *gateway)
2504 int flags = RT6_LOOKUP_F_HAS_SADDR;
2505 struct ip6rd_flowi rdfl;
2508 rdfl.gateway = *gateway;
2510 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2511 flags, __ip6_route_redirect);
/* ip6_redirect - apply a received redirect for the packet's flow: rebuild
 * the flow from the embedded IPv6 header (at skb->data), look up the
 * affected route treating the outer source as the redirecting gateway,
 * and hand it to rt6_do_redirect().
 */
2514 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2517 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2518 struct dst_entry *dst;
2521 memset(&fl6, 0, sizeof(fl6));
2522 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2523 fl6.flowi6_oif = oif;
2524 fl6.flowi6_mark = mark;
2525 fl6.daddr = iph->daddr;
2526 fl6.saddr = iph->saddr;
2527 fl6.flowlabel = ip6_flowinfo(iph);
2528 fl6.flowi6_uid = uid;
2530 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2531 rt6_do_redirect(dst, NULL, skb);
2534 EXPORT_SYMBOL_GPL(ip6_redirect);
/* ip6_redirect_no_header - redirect variant for ND redirect messages that
 * lack the redirected header option: the target comes from the rd_msg
 * destination and the flow source from the outer header's daddr.
 */
2536 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2539 const struct ipv6hdr *iph = ipv6_hdr(skb);
2540 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2541 struct dst_entry *dst;
2544 memset(&fl6, 0, sizeof(fl6));
2545 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2546 fl6.flowi6_oif = oif;
2547 fl6.flowi6_mark = mark;
2548 fl6.daddr = msg->dest;
2549 fl6.saddr = iph->daddr;
2550 fl6.flowi6_uid = sock_net_uid(net, NULL);
2552 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2553 rt6_do_redirect(dst, NULL, skb);
/* ip6_sk_redirect - socket-scoped redirect: delegates to ip6_redirect()
 * with the socket's bound device, mark (and uid, elided here). */
2557 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2559 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2562 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* ip6_default_advmss - dst_ops->default_advmss hook: advertised TCP MSS
 * = path MTU minus IPv6+TCP headers, floored at the ip6_rt_min_advmss
 * sysctl and capped for non-jumbo payloads.
 */
2564 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2566 struct net_device *dev = dst->dev;
2567 unsigned int mtu = dst_mtu(dst);
2568 struct net *net = dev_net(dev);
2570 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2572 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2573 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2576 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2577 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2578 * IPV6_MAXPLEN is also valid and means: "any MSS,
2579 * rely only on pmtu discovery"
2581 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* ip6_mtu - dst_ops->mtu hook: the raw RTAX_MTU metric when set, else the
 * device's idev->cnf.mtu6; capped at IP6_MAX_MTU and reduced by lwtunnel
 * encap headroom.
 */
2586 static unsigned int ip6_mtu(const struct dst_entry *dst)
2588 struct inet6_dev *idev;
2591 mtu = dst_metric_raw(dst, RTAX_MTU);
2598 idev = __in6_dev_get(dst->dev);
2600 mtu = idev->cnf.mtu6;
2604 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2606 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2610 * 1. mtu on route is locked - use it
2611 * 2. mtu from nexthop exception
2612 * 3. mtu from egress device
2614 * based on ip6_dst_mtu_forward and exception logic of
2615 * rt6_find_cached_rt; called with rcu_read_lock
/* ip6_mtu_from_fib6 - MTU for a fib6_info without materializing a dst,
 * following the precedence order documented above; clamped to
 * IP6_MAX_MTU and reduced by lwtunnel headroom.
 */
2617 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2618 struct in6_addr *saddr)
2620 struct rt6_exception_bucket *bucket;
2621 struct rt6_exception *rt6_ex;
2622 struct in6_addr *src_key;
2623 struct inet6_dev *idev;
/* 1. A locked route MTU wins outright. */
2626 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2627 mtu = f6i->fib6_pmtu;
2633 #ifdef CONFIG_IPV6_SUBTREES
2634 if (f6i->fib6_src.plen)
/* 2. A live exception entry supplies its own cached MTU. */
2638 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2639 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2640 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2641 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
/* 3. Otherwise fall back to the egress device MTU. */
2644 struct net_device *dev = fib6_info_nh_dev(f6i);
2647 idev = __in6_dev_get(dev);
2648 if (idev && idev->cnf.mtu6 > mtu)
2649 mtu = idev->cnf.mtu6;
2652 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2654 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
/* icmp6_dst_alloc - build a throwaway host dst for sending an ICMPv6
 * message toward fl6->daddr on @dev, kept on the uncached list so device
 * teardown can release it; the result goes through xfrm_lookup().
 * Returns the dst or an ERR_PTR.
 */
2657 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2660 struct dst_entry *dst;
2661 struct rt6_info *rt;
2662 struct inet6_dev *idev = in6_dev_get(dev);
2663 struct net *net = dev_net(dev);
2665 if (unlikely(!idev))
2666 return ERR_PTR(-ENODEV);
2668 rt = ip6_dst_alloc(net, dev, 0);
2669 if (unlikely(!rt)) {
2671 dst = ERR_PTR(-ENOMEM);
2675 rt->dst.flags |= DST_HOST;
2676 rt->dst.input = ip6_input;
2677 rt->dst.output = ip6_output;
/* /128 host route straight at the destination; no gateway hop. */
2678 rt->rt6i_gateway = fl6->daddr;
2679 rt->rt6i_dst.addr = fl6->daddr;
2680 rt->rt6i_dst.plen = 128;
2681 rt->rt6i_idev = idev;
2682 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2684 /* Add this dst into uncached_list so that rt6_disable_ip() can
2685 * do proper release of the net_device
2687 rt6_uncached_list_add(rt);
2688 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2690 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* ip6_dst_gc - dst_ops->gc hook: rate-limited garbage collection of IPv6
 * dsts. Skips GC when under the entry cap and inside the min interval;
 * otherwise runs fib6 GC with an adaptively-decaying expire time.
 * Non-zero return tells the dst layer allocation should fail.
 */
2696 static int ip6_dst_gc(struct dst_ops *ops)
2698 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2699 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2700 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2701 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2702 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2703 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2706 entries = dst_entries_get_fast(ops);
2707 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2708 entries <= rt_max_size)
2711 net->ipv6.ip6_rt_gc_expire++;
2712 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2713 entries = dst_entries_get_slow(ops);
/* Under threshold: reset the expire window; else decay it. */
2714 if (entries < ops->gc_thresh)
2715 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2717 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2718 return entries > rt_max_size;
/* ip6_convert_metrics - allocate a refcounted dst_metrics block for @rt
 * and fill it from the netlink-supplied RTAX attributes in @cfg.
 * Returns 0 or a negative errno from ip_metrics_convert().
 */
2721 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2722 struct fib6_config *cfg)
2724 struct dst_metrics *p;
2729 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2733 refcount_set(&p->refcnt, 1);
2734 rt->fib6_metrics = p;
2736 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
/* ip6_nh_lookup_table - resolve nexthop gateway @gw_addr within a
 * specific table @tbid (used when validating a new route's nexthop).
 * Falls back to a full lookup elsewhere if this table yields the null
 * entry. (Elided: fl6.daddr is presumably set to gw_addr -- confirm.)
 */
2739 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2740 struct fib6_config *cfg,
2741 const struct in6_addr *gw_addr,
2742 u32 tbid, int flags)
2744 struct flowi6 fl6 = {
2745 .flowi6_oif = cfg->fc_ifindex,
2747 .saddr = cfg->fc_prefsrc,
2749 struct fib6_table *table;
2750 struct rt6_info *rt;
2752 table = fib6_get_table(net, tbid);
2756 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2757 flags |= RT6_LOOKUP_F_HAS_SADDR;
2759 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2760 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2762 /* if table lookup failed, fall back to full lookup */
2763 if (rt == net->ipv6.ip6_null_entry) {
/* ip6_route_check_nh_onlink - validate an RTNH_F_ONLINK nexthop: the
 * gateway must not resolve (in the device's table) to a local/anycast/
 * reject route or to a different egress device. extack carries the
 * user-visible error.
 */
2771 static int ip6_route_check_nh_onlink(struct net *net,
2772 struct fib6_config *cfg,
2773 const struct net_device *dev,
2774 struct netlink_ext_ack *extack)
2776 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2777 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2778 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2779 struct rt6_info *grt;
2783 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2785 if (!grt->dst.error &&
2786 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2787 NL_SET_ERR_MSG(extack,
2788 "Nexthop has invalid gateway or device mismatch");
/* ip6_route_check_nh - resolve a (non-onlink) nexthop gateway and infer
 * the egress device/idev when not supplied. A gateway that itself
 * resolves through another gateway, or via a mismatched device, is
 * rejected. Defaults to -EHOSTUNREACH until a valid route is found.
 */
2798 static int ip6_route_check_nh(struct net *net,
2799 struct fib6_config *cfg,
2800 struct net_device **_dev,
2801 struct inet6_dev **idev)
2803 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2804 struct net_device *dev = _dev ? *_dev : NULL;
2805 struct rt6_info *grt = NULL;
2806 int err = -EHOSTUNREACH;
/* Explicit table given: confine the gateway lookup to it. */
2808 if (cfg->fc_table) {
2809 int flags = RT6_LOOKUP_F_IFACE;
2811 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2812 cfg->fc_table, flags);
2814 if (grt->rt6i_flags & RTF_GATEWAY ||
2815 (dev && dev != grt->dst.dev)) {
/* Otherwise a plain rt6_lookup over all tables. */
2823 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2829 if (dev != grt->dst.dev) {
/* Adopt the device/idev the gateway route resolved to. */
2834 *_dev = dev = grt->dst.dev;
2835 *idev = grt->rt6i_idev;
2837 in6_dev_hold(grt->rt6i_idev);
2840 if (!(grt->rt6i_flags & RTF_GATEWAY))
/* ip6_validate_gw - full gateway validation for a new route: reject local
 * addresses (including re-checking after device resolution, because DAD
 * may still be in progress on the first check), require link-local or
 * (IPv4-mapped, RFC 4798-style) unicast nexthops, resolve the egress
 * device via the onlink or normal path, and forbid loopback egress.
 */
2849 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2850 struct net_device **_dev, struct inet6_dev **idev,
2851 struct netlink_ext_ack *extack)
2853 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2854 int gwa_type = ipv6_addr_type(gw_addr);
/* Non-link-local gateways may match an address on any device. */
2855 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2856 const struct net_device *dev = *_dev;
2857 bool need_addr_check = !dev;
2860 /* if gw_addr is local we will fail to detect this in case
2861 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2862 * will return already-added prefix route via interface that
2863 * prefix route was assigned to, which might be non-loopback.
2866 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2867 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2871 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2872 /* IPv6 strictly inhibits using not link-local
2873 * addresses as nexthop address.
2874 * Otherwise, router will not able to send redirects.
2875 * It is very good, but in some (rare!) circumstances
2876 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2877 * some exceptions. --ANK
2878 * We allow IPv4-mapped nexthops to support RFC4798-type
2881 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2882 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2886 if (cfg->fc_flags & RTNH_F_ONLINK)
2887 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2889 err = ip6_route_check_nh(net, cfg, _dev, idev);
2895 /* reload in case device was changed */
2900 NL_SET_ERR_MSG(extack, "Egress device not specified");
2902 } else if (dev->flags & IFF_LOOPBACK) {
2903 NL_SET_ERR_MSG(extack,
2904 "Egress device can not be loopback device for this route");
2908 /* if we did not check gw_addr above, do so now that the
2909 * egress device has been resolved.
2911 if (need_addr_check &&
2912 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2913 NL_SET_ERR_MSG(extack, "Gateway can not be a local address")
;
/* Build a fib6_info from a fib6_config, without inserting it.
 * Validates the config (flags, prefix lengths, route type), resolves the
 * table and egress device, converts metrics/expiry/lwtunnel state, and
 * returns the new entry or ERR_PTR(err). The caller inserts it (see
 * ip6_route_add()) and drops the reference with fib6_info_release().
 * NOTE(review): elided view — many intermediate lines (gotos, error
 * labels, reference drops) of the original are missing here.
 */
2922 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2924 struct netlink_ext_ack *extack)
2926 struct net *net = cfg->fc_nlinfo.nl_net;
2927 struct fib6_info *rt = NULL;
2928 struct net_device *dev = NULL;
2929 struct inet6_dev *idev = NULL;
2930 struct fib6_table *table;
2934 /* RTF_PCPU is an internal flag; can not be set by userspace */
2935 if (cfg->fc_flags & RTF_PCPU) {
2936 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2940 /* RTF_CACHE is an internal flag; can not be set by userspace */
2941 if (cfg->fc_flags & RTF_CACHE) {
2942 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2946 if (cfg->fc_type > RTN_MAX) {
2947 NL_SET_ERR_MSG(extack, "Invalid route type");
/* IPv6 prefix lengths are at most 128 bits. */
2951 if (cfg->fc_dst_len > 128) {
2952 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2955 if (cfg->fc_src_len > 128) {
2956 NL_SET_ERR_MSG(extack, "Invalid source address length");
2959 #ifndef CONFIG_IPV6_SUBTREES
/* Source routing needs subtree support compiled in. */
2960 if (cfg->fc_src_len) {
2961 NL_SET_ERR_MSG(extack,
2962 "Specifying source address requires IPV6_SUBTREES to be enabled");
2966 if (cfg->fc_ifindex) {
2968 dev = dev_get_by_index(net, cfg->fc_ifindex);
2971 idev = in6_dev_get(dev);
/* Userspace metric 0 means "use the default user priority". */
2976 if (cfg->fc_metric == 0)
2977 cfg->fc_metric = IP6_RT_PRIO_USER;
2979 if (cfg->fc_flags & RTNH_F_ONLINK) {
2981 NL_SET_ERR_MSG(extack,
2982 "Nexthop device required for onlink");
2987 if (!(dev->flags & IFF_UP)) {
2988 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
/* Without NLM_F_CREATE only look up an existing table; warn (but still
 * create) if the table does not exist, for legacy compatibility.
 */
2995 if (cfg->fc_nlinfo.nlh &&
2996 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2997 table = fib6_get_table(net, cfg->fc_table);
2999 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3000 table = fib6_new_table(net, cfg->fc_table);
3003 table = fib6_new_table(net, cfg->fc_table);
3010 rt = fib6_info_alloc(gfp_flags);
/* Addrconf-generated routes are not charged to the dst accounting. */
3014 if (cfg->fc_flags & RTF_ADDRCONF)
3015 rt->dst_nocount = true;
3017 err = ip6_convert_metrics(net, rt, cfg);
3021 if (cfg->fc_flags & RTF_EXPIRES)
3022 fib6_set_expires(rt, jiffies +
3023 clock_t_to_jiffies(cfg->fc_expires));
3025 fib6_clean_expires(rt);
3027 if (cfg->fc_protocol == RTPROT_UNSPEC)
3028 cfg->fc_protocol = RTPROT_BOOT;
3029 rt->fib6_protocol = cfg->fc_protocol;
3031 addr_type = ipv6_addr_type(&cfg->fc_dst);
3033 if (cfg->fc_encap) {
3034 struct lwtunnel_state *lwtstate;
3036 err = lwtunnel_build_state(cfg->fc_encap_type,
3037 cfg->fc_encap, AF_INET6, cfg,
3041 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3044 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3045 rt->fib6_dst.plen = cfg->fc_dst_len;
/* /128 routes are host routes. */
3046 if (rt->fib6_dst.plen == 128)
3047 rt->dst_host = true;
3049 #ifdef CONFIG_IPV6_SUBTREES
3050 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3051 rt->fib6_src.plen = cfg->fc_src_len;
3054 rt->fib6_metric = cfg->fc_metric;
3055 rt->fib6_nh.nh_weight = 1;
3057 rt->fib6_type = cfg->fc_type;
3059 /* We cannot add true routes via loopback here,
3060 they would result in kernel looping; promote them to reject routes
3062 if ((cfg->fc_flags & RTF_REJECT) ||
3063 (dev && (dev->flags & IFF_LOOPBACK) &&
3064 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3065 !(cfg->fc_flags & RTF_LOCAL))) {
3066 /* hold loopback dev/idev if we haven't done so. */
3067 if (dev != net->loopback_dev) {
3072 dev = net->loopback_dev;
3074 idev = in6_dev_get(dev);
3080 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3084 if (cfg->fc_flags & RTF_GATEWAY) {
3085 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3089 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3096 if (idev->cnf.disable_ipv6) {
3097 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3102 if (!(dev->flags & IFF_UP)) {
3103 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
/* Preferred source address must be configured on the egress device. */
3108 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3109 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3110 NL_SET_ERR_MSG(extack, "Invalid source address");
3114 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3115 rt->fib6_prefsrc.plen = 128;
3117 rt->fib6_prefsrc.plen = 0;
3119 rt->fib6_flags = cfg->fc_flags;
/* Mark the nexthop LINKDOWN when carrier is absent (except for
 * local/anycast routes, which are valid regardless of carrier).
 */
3122 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3123 !netif_carrier_ok(dev))
3124 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3125 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3126 rt->fib6_nh.nh_dev = dev;
3127 rt->fib6_table = table;
3129 cfg->fc_nlinfo.nl_net = dev_net(dev);
/* Error path: drop the partially-built entry and report the error. */
3141 fib6_info_release(rt);
3142 return ERR_PTR(err);
/* Create a fib6_info from @cfg and insert it into its FIB table.
 * Returns 0 on success or a negative errno. The local reference taken by
 * ip6_route_info_create() is always released; the table holds its own.
 */
3145 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3146 struct netlink_ext_ack *extack)
3148 struct fib6_info *rt;
3151 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3155 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3156 fib6_info_release(rt);
/* Delete a single fib6_info from its table under the table lock and drop
 * the caller's reference. The null entry is never deletable.
 */
3161 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3163 struct net *net = info->nl_net;
3164 struct fib6_table *table;
3167 if (rt == net->ipv6.fib6_null_entry) {
3172 table = rt->fib6_table;
3173 spin_lock_bh(&table->tb6_lock);
3174 err = fib6_del(rt, info);
3175 spin_unlock_bh(&table->tb6_lock);
/* Consumes the reference the caller held on @rt. */
3178 fib6_info_release(rt);
/* Public wrapper around __ip6_del_rt() with default netlink info. */
3182 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3184 struct nl_info info = { .nl_net = net };
3186 return __ip6_del_rt(rt, &info);
/* Delete a multipath route and (when fc_delete_all_nh is set) all of its
 * sibling nexthops, emitting one combined RTM_DELROUTE notification
 * instead of one per hop.
 * NOTE(review): elided view — some branch/cleanup lines are missing.
 */
3189 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3191 struct nl_info *info = &cfg->fc_nlinfo;
3192 struct net *net = info->nl_net;
3193 struct sk_buff *skb = NULL;
3194 struct fib6_table *table;
3197 if (rt == net->ipv6.fib6_null_entry)
3199 table = rt->fib6_table;
3200 spin_lock_bh(&table->tb6_lock);
3202 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3203 struct fib6_info *sibling, *next_sibling;
3205 /* prefer to send a single notification with all hops */
3206 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3208 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3210 if (rt6_fill_node(net, skb, rt, NULL,
3211 NULL, NULL, 0, RTM_DELROUTE,
3212 info->portid, seq, 0) < 0) {
/* Combined skb built: suppress the per-route notifications. */
3216 info->skip_notify = 1;
3219 list_for_each_entry_safe(sibling, next_sibling,
3222 err = fib6_del(sibling, info);
3228 err = fib6_del(rt, info);
3230 spin_unlock_bh(&table->tb6_lock);
3232 fib6_info_release(rt);
/* Send the single combined notification, if one was prepared. */
3235 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3236 info->nlh, gfp_any());
/* Remove a cached (exception-table) route if it matches the config's
 * device and gateway constraints. dst_hold_safe() guards against the
 * entry being freed concurrently.
 */
3241 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3245 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3248 if (cfg->fc_flags & RTF_GATEWAY &&
3249 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3251 if (dst_hold_safe(&rt->dst))
3252 rc = rt6_remove_exception_rt(rt);
/* Delete a route described by @cfg: locate the FIB node for dst/src,
 * then walk its routes matching on device, gateway, metric and protocol.
 * RTF_CACHE deletes a cached exception route instead of a FIB entry;
 * with RTF_GATEWAY only the single matching hop is removed, otherwise
 * all siblings go (see __ip6_del_rt_siblings()).
 * NOTE(review): elided view — rcu lock/unlock and some continue/return
 * lines of the original are missing here.
 */
3257 static int ip6_route_del(struct fib6_config *cfg,
3258 struct netlink_ext_ack *extack)
3260 struct rt6_info *rt_cache;
3261 struct fib6_table *table;
3262 struct fib6_info *rt;
3263 struct fib6_node *fn;
3266 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3268 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3274 fn = fib6_locate(&table->tb6_root,
3275 &cfg->fc_dst, cfg->fc_dst_len,
3276 &cfg->fc_src, cfg->fc_src_len,
3277 !(cfg->fc_flags & RTF_CACHE));
3280 for_each_fib6_node_rt_rcu(fn) {
3281 if (cfg->fc_flags & RTF_CACHE) {
3284 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3287 rc = ip6_del_cached_rt(rt_cache, cfg);
/* Match filters: device, gateway, metric, protocol. */
3295 if (cfg->fc_ifindex &&
3296 (!rt->fib6_nh.nh_dev ||
3297 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3299 if (cfg->fc_flags & RTF_GATEWAY &&
3300 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3302 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3304 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3309 /* if gateway was specified only delete the one hop */
3310 if (cfg->fc_flags & RTF_GATEWAY)
3311 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3313 return __ip6_del_rt_siblings(rt, cfg);
/* Handle an ICMPv6 Redirect (RFC 4861 §8): validate the message and ND
 * options, update the neighbour cache for the new first hop, clone the
 * parent route into a cached RTF_DYNAMIC exception entry pointing at the
 * new gateway, and fire the NETEVENT_REDIRECT notifier.
 * NOTE(review): elided view — several validation/cleanup lines of the
 * original (e.g. on_link assignment, rcu lock/unlock) are missing here.
 */
3321 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3323 struct netevent_redirect netevent;
3324 struct rt6_info *rt, *nrt = NULL;
3325 struct ndisc_options ndopts;
3326 struct inet6_dev *in6_dev;
3327 struct neighbour *neigh;
3328 struct fib6_info *from;
3330 int optlen, on_link;
/* ND option length = transport payload minus the fixed rd_msg header. */
3333 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3334 optlen -= sizeof(*msg);
3337 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3341 msg = (struct rd_msg *)icmp6_hdr(skb);
3343 if (ipv6_addr_is_multicast(&msg->dest)) {
3344 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination itself is on-link. */
3349 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3351 } else if (ipv6_addr_type(&msg->target) !=
3352 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3353 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3357 in6_dev = __in6_dev_get(skb->dev);
/* Routers forwarding, or admin-disabled redirects: ignore. */
3360 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3364 * The IP source address of the Redirect MUST be the same as the current
3365 * first-hop router for the specified ICMP Destination Address.
3368 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3369 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3374 if (ndopts.nd_opts_tgt_lladdr) {
3375 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3378 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3383 rt = (struct rt6_info *) dst;
3384 if (rt->rt6i_flags & RTF_REJECT) {
3385 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3389 /* Redirect received -> path was valid.
3390 * Look, redirects are sent only in response to data packets,
3391 * so that this nexthop apparently is reachable. --ANK
3393 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3395 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3400 * We have finally decided to accept it.
/* Record the new first hop in the neighbour cache; ISROUTER only when
 * the target is a router (not the on-link destination itself).
 */
3403 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3404 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3405 NEIGH_UPDATE_F_OVERRIDE|
3406 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3407 NEIGH_UPDATE_F_ISROUTER)),
3408 NDISC_REDIRECT, &ndopts);
3411 from = rcu_dereference(rt->from);
3412 fib6_info_hold(from);
3415 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3419 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3421 nrt->rt6i_flags &= ~RTF_GATEWAY;
3423 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3425 /* No need to remove rt from the exception table if rt is
3426 * a cached route because rt6_insert_exception() will
3429 if (rt6_insert_exception(nrt, from)) {
3430 dst_release_immediate(&nrt->dst);
3434 netevent.old = &rt->dst;
3435 netevent.new = &nrt->dst;
3436 netevent.daddr = &msg->dest;
3437 netevent.neigh = neigh;
3438 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3441 fib6_info_release(from);
3442 neigh_release(neigh);
3445 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA route-information route for @prefix via @gwaddr on
 * @dev in the RT6_TABLE_INFO (or l3mdev) table. Returns the entry or
 * NULL; caller holds a reference on success (elided in this view).
 */
3446 static struct fib6_info *rt6_get_route_info(struct net *net,
3447 const struct in6_addr *prefix, int prefixlen,
3448 const struct in6_addr *gwaddr,
3449 struct net_device *dev)
3451 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3452 int ifindex = dev->ifindex;
3453 struct fib6_node *fn;
3454 struct fib6_info *rt = NULL;
3455 struct fib6_table *table;
3457 table = fib6_get_table(net, tb_id);
3462 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3466 for_each_fib6_node_rt_rcu(fn) {
/* Match device, ROUTEINFO+GATEWAY flags, and gateway address. */
3467 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3469 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3471 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
/* Install a route learned from an RA Route Information option and return
 * the resulting table entry (looked up again via rt6_get_route_info()).
 */
3481 static struct fib6_info *rt6_add_route_info(struct net *net,
3482 const struct in6_addr *prefix, int prefixlen,
3483 const struct in6_addr *gwaddr,
3484 struct net_device *dev,
3487 struct fib6_config cfg = {
3488 .fc_metric = IP6_RT_PRIO_USER,
3489 .fc_ifindex = dev->ifindex,
3490 .fc_dst_len = prefixlen,
3491 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3492 RTF_UP | RTF_PREF(pref),
3493 .fc_protocol = RTPROT_RA,
3494 .fc_type = RTN_UNICAST,
3495 .fc_nlinfo.portid = 0,
3496 .fc_nlinfo.nlh = NULL,
3497 .fc_nlinfo.nl_net = net,
/* NOTE(review): this statement ends with ',' (comma operator chaining
 * into the next assignment). Behavior is identical to ';' but it should
 * be a semicolon for clarity.
 */
3500 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3501 cfg.fc_dst = *prefix;
3502 cfg.fc_gateway = *gwaddr;
3504 /* We should treat it as a default route if prefix length is 0. */
3506 cfg.fc_flags |= RTF_DEFAULT;
/* Ignore the add result; re-lookup below reports the actual state. */
3508 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3510 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-learned default-router entry for gateway @addr on @dev in
 * the default-route table (RT6_TABLE_DFLT or the device's l3mdev table).
 */
3514 struct fib6_info *rt6_get_dflt_router(struct net *net,
3515 const struct in6_addr *addr,
3516 struct net_device *dev)
3518 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3519 struct fib6_info *rt;
3520 struct fib6_table *table;
3522 table = fib6_get_table(net, tb_id);
3527 for_each_fib6_node_rt_rcu(&table->tb6_root) {
/* Must be an addrconf default route via @addr on @dev. */
3528 if (dev == rt->fib6_nh.nh_dev &&
3529 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3530 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
/* Install a default router learned from a Router Advertisement and mark
 * its table as holding a default route. Returns the entry found by a
 * fresh rt6_get_dflt_router() lookup.
 */
3539 struct fib6_info *rt6_add_dflt_router(struct net *net,
3540 const struct in6_addr *gwaddr,
3541 struct net_device *dev,
3544 struct fib6_config cfg = {
3545 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3546 .fc_metric = IP6_RT_PRIO_USER,
3547 .fc_ifindex = dev->ifindex,
3548 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3549 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3550 .fc_protocol = RTPROT_RA,
3551 .fc_type = RTN_UNICAST,
3552 .fc_nlinfo.portid = 0,
3553 .fc_nlinfo.nlh = NULL,
3554 .fc_nlinfo.nl_net = net,
3557 cfg.fc_gateway = *gwaddr;
3559 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3560 struct fib6_table *table;
3562 table = fib6_get_table(dev_net(dev), cfg.fc_table);
/* Flag lets rt6_purge_dflt_routers() skip tables with no RA routes. */
3564 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3567 return rt6_get_dflt_router(net, gwaddr, dev);
/* Remove RA-learned default routes from one table, except on interfaces
 * where accept_ra == 2 (accept RAs even when forwarding). Clears the
 * table's HAS_DFLT_ROUTER flag when done.
 * NOTE(review): elided view — the restart/goto loop structure of the
 * original is not visible here.
 */
3570 static void __rt6_purge_dflt_routers(struct net *net,
3571 struct fib6_table *table)
3573 struct fib6_info *rt;
3577 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3578 struct net_device *dev = fib6_info_nh_dev(rt);
3579 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3581 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3582 (!idev || idev->cnf.accept_ra != 2)) {
3585 ip6_del_rt(net, rt);
3591 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Purge RA default routers from every FIB table that is flagged as
 * containing one (see RT6_TABLE_HAS_DFLT_ROUTER).
 */
3594 void rt6_purge_dflt_routers(struct net *net)
3596 struct fib6_table *table;
3597 struct hlist_head *head;
3602 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3603 head = &net->ipv6.fib_table_hash[h];
3604 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3605 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3606 __rt6_purge_dflt_routers(net, table);
/* Translate a legacy ioctl in6_rtmsg into a fib6_config (field-by-field
 * copy after zeroing; table chosen via l3mdev when applicable).
 */
3613 static void rtmsg_to_fib6_config(struct net *net,
3614 struct in6_rtmsg *rtmsg,
3615 struct fib6_config *cfg)
3617 memset(cfg, 0, sizeof(*cfg));
3619 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3621 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3622 cfg->fc_metric = rtmsg->rtmsg_metric;
3623 cfg->fc_expires = rtmsg->rtmsg_info;
3624 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3625 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3626 cfg->fc_flags = rtmsg->rtmsg_flags;
3627 cfg->fc_type = rtmsg->rtmsg_type;
3629 cfg->fc_nlinfo.nl_net = net;
3631 cfg->fc_dst = rtmsg->rtmsg_dst;
3632 cfg->fc_src = rtmsg->rtmsg_src;
3633 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN,
 * copies the in6_rtmsg from userspace, converts it to a fib6_config and
 * dispatches to ip6_route_add()/ip6_route_del().
 */
3636 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3638 struct fib6_config cfg;
3639 struct in6_rtmsg rtmsg;
3643 case SIOCADDRT: /* Add a route */
3644 case SIOCDELRT: /* Delete a route */
3645 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3647 err = copy_from_user(&rtmsg, arg,
3648 sizeof(struct in6_rtmsg));
3652 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3657 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3660 err = ip6_route_del(&cfg, NULL);
3674 * Drop the packet on the floor
/* Common drop path for reject-type routes: bump the appropriate SNMP
 * no-route counter (in/out direction via @ipstats_mib_noroutes) and send
 * an ICMPv6 Destination Unreachable with @code back to the sender.
 */
3677 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3680 struct dst_entry *dst = skb_dst(skb);
3681 switch (ipstats_mib_noroutes) {
3682 case IPSTATS_MIB_INNOROUTES:
3683 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
/* Unspecified destination counts as an address error instead. */
3684 if (type == IPV6_ADDR_ANY) {
3685 IP6_INC_STATS(dev_net(dst->dev),
3686 __in6_dev_get_safely(skb->dev),
3687 IPSTATS_MIB_INADDRERRORS);
3691 case IPSTATS_MIB_OUTNOROUTES:
3692 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3693 ipstats_mib_noroutes);
3696 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* Input handler for blackhole/unreachable routes: drop with NOROUTE. */
3701 static int ip6_pkt_discard(struct sk_buff *skb)
3703 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* Output-direction counterpart of ip6_pkt_discard(). */
3706 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3708 skb->dev = skb_dst(skb)->dev;
3709 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* Input handler for prohibit routes: drop with ADM_PROHIBITED. */
3712 static int ip6_pkt_prohibit(struct sk_buff *skb)
3714 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* Output-direction counterpart of ip6_pkt_prohibit(). */
3717 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3719 skb->dev = skb_dst(skb)->dev;
3720 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3724 * Allocate a dst for local (unicast / anycast) address.
/* Allocate a fib6_info for a local unicast or anycast address: /128 host
 * entry, RTPROT_KERNEL, placed in the local table (RT6_TABLE_LOCAL or
 * the device's l3mdev table). Returns the entry or ERR_PTR(-ENOMEM).
 */
3727 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3728 struct inet6_dev *idev,
3729 const struct in6_addr *addr,
3730 bool anycast, gfp_t gfp_flags)
3733 struct net_device *dev = idev->dev;
3734 struct fib6_info *f6i;
3736 f6i = fib6_info_alloc(gfp_flags);
3738 return ERR_PTR(-ENOMEM);
/* Local routes are not charged to dst accounting. */
3740 f6i->dst_nocount = true;
3741 f6i->dst_host = true;
3742 f6i->fib6_protocol = RTPROT_KERNEL;
3743 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3745 f6i->fib6_type = RTN_ANYCAST;
3746 f6i->fib6_flags |= RTF_ANYCAST;
3748 f6i->fib6_type = RTN_LOCAL;
3749 f6i->fib6_flags |= RTF_LOCAL;
3752 f6i->fib6_nh.nh_gw = *addr;
3754 f6i->fib6_nh.nh_dev = dev;
3755 f6i->fib6_dst.addr = *addr;
3756 f6i->fib6_dst.plen = 128;
3757 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3758 f6i->fib6_table = fib6_get_table(net, tb_id);
3763 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(): the device (or NULL for
 * any) and the address being removed.
 */
3764 struct arg_dev_net_ip {
3765 struct net_device *dev;
3767 struct in6_addr *addr;
/* fib6_clean_all() callback: clear the preferred-source address from any
 * route that references the removed address (matching device when one
 * was given), and scrub it from the route's cached exceptions too.
 */
3770 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3772 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3773 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3774 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3776 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3777 rt != net->ipv6.fib6_null_entry &&
3778 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3779 spin_lock_bh(&rt6_exception_lock);
3780 /* remove prefsrc entry */
3781 rt->fib6_prefsrc.plen = 0;
3782 /* need to update cache as well */
3783 rt6_exceptions_remove_prefsrc(rt);
3784 spin_unlock_bh(&rt6_exception_lock);
/* Walk all FIB tables and drop @ifp's address as a preferred source
 * (called when the address is deleted from its interface).
 */
3789 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3791 struct net *net = dev_net(ifp->idev->dev);
3792 struct arg_dev_net_ip adni = {
3793 .dev = ifp->idev->dev,
3797 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3800 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3802 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: when @gateway stopped being a router, drop
 * RA default routes through it and purge matching cached exceptions.
 */
3803 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3805 struct in6_addr *gateway = (struct in6_addr *)arg;
3807 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3808 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3812 /* Further clean up cached routes in exception table.
3813 * This is needed because cached route may have a different
3814 * gateway than its 'parent' in the case of an ip redirect.
3816 rt6_exceptions_clean_tohost(rt, gateway);
/* Public wrapper: run fib6_clean_tohost() over all FIB tables. */
3821 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3823 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Walker argument for the device up/down FIB callbacks: the device, the
 * nexthop flags to set/clear, and the netdev notifier event.
 */
3826 struct arg_netdev_event {
3827 const struct net_device *dev;
3829 unsigned int nh_flags;
3830 unsigned long event;
/* Return the first ECMP-eligible sibling of @rt within its FIB node, by
 * scanning the node's route list for an entry with the same metric that
 * qualifies for ECMP. Requires the table lock (lockdep-checked derefs).
 */
3834 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3836 struct fib6_info *iter;
3837 struct fib6_node *fn;
3839 fn = rcu_dereference_protected(rt->fib6_node,
3840 lockdep_is_held(&rt->fib6_table->tb6_lock));
3841 iter = rcu_dereference_protected(fn->leaf,
3842 lockdep_is_held(&rt->fib6_table->tb6_lock));
3844 if (iter->fib6_metric == rt->fib6_metric &&
3845 rt6_qualify_for_ecmp(iter))
3847 iter = rcu_dereference_protected(iter->fib6_next,
3848 lockdep_is_held(&rt->fib6_table->tb6_lock));
/* A nexthop is "dead" for ECMP purposes when flagged DEAD, or flagged
 * LINKDOWN on a route configured to ignore linkdown nexthops.
 */
3854 static bool rt6_is_dead(const struct fib6_info *rt)
3856 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3857 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3858 fib6_ignore_linkdown(rt)))
/* Sum the nexthop weights of @rt and all its siblings, skipping dead
 * nexthops (see rt6_is_dead()).
 */
3864 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3866 struct fib6_info *iter;
3869 if (!rt6_is_dead(rt))
3870 total += rt->fib6_nh.nh_weight;
3872 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3873 if (!rt6_is_dead(iter))
3874 total += iter->fib6_nh.nh_weight;
/* Assign one nexthop's hash upper bound: the cumulative weight scaled
 * into 31-bit hash space (weight/total << 31). Dead nexthops get -1 so
 * the multipath hash never selects them.
 */
3880 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3882 int upper_bound = -1;
3884 if (!rt6_is_dead(rt)) {
3885 *weight += rt->fib6_nh.nh_weight;
3886 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3889 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
/* Recompute hash upper bounds for @rt and every sibling, accumulating
 * the running weight so bounds partition the hash space in list order.
 */
3892 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3894 struct fib6_info *iter;
3897 rt6_upper_bound_set(rt, &weight, total);
3899 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3900 rt6_upper_bound_set(iter, &weight, total);
/* Rebalance a multipath route's hash bounds after a nexthop changed
 * state. No-op for single-path routes or routes marked for flushing.
 */
3903 void rt6_multipath_rebalance(struct fib6_info *rt)
3905 struct fib6_info *first;
3908 /* In case the entire multipath route was marked for flushing,
3909 * then there is no need to rebalance upon the removal of every
3912 if (!rt->fib6_nsiblings || rt->should_flush)
3915 /* During lookup routes are evaluated in order, so we need to
3916 * make sure upper bounds are assigned from the first sibling
3919 first = rt6_multipath_first_sibling(rt);
3920 if (WARN_ON_ONCE(!first))
3923 total = rt6_multipath_total_weight(first);
3924 rt6_multipath_upper_bound_set(first, total);
/* fib6_clean_all() callback for device-up: clear the requested nexthop
 * flags on routes using the device, bump the sernum so cached lookups
 * revalidate, and rebalance multipath weights.
 */
3927 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3929 const struct arg_netdev_event *arg = p_arg;
3930 struct net *net = dev_net(arg->dev);
3932 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3933 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3934 fib6_update_sernum_upto_root(net, rt);
3935 rt6_multipath_rebalance(rt);
/* Device came (partially) up: clear @nh_flags on its routes. When the
 * device is administratively up but has no carrier, also clear LINKDOWN
 * so that flag tracks carrier state correctly.
 */
3941 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3943 struct arg_netdev_event arg = {
3946 .nh_flags = nh_flags,
3950 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3951 arg.nh_flags |= RTNH_F_LINKDOWN;
3953 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
/* True when @rt or any of its siblings has a nexthop on @dev. */
3956 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3957 const struct net_device *dev)
3959 struct fib6_info *iter;
3961 if (rt->fib6_nh.nh_dev == dev)
3963 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3964 if (iter->fib6_nh.nh_dev == dev)
/* Mark @rt and all siblings for flushing, so the whole ECMP route is
 * removed together by the ifdown walker.
 */
3970 static void rt6_multipath_flush(struct fib6_info *rt)
3972 struct fib6_info *iter;
3974 rt->should_flush = 1;
3975 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976 iter->should_flush = 1;
/* Count nexthops of @rt's multipath group that are on @down_dev or
 * already flagged DEAD.
 */
3979 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3980 const struct net_device *down_dev)
3982 struct fib6_info *iter;
3983 unsigned int dead = 0;
3985 if (rt->fib6_nh.nh_dev == down_dev ||
3986 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3988 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3989 if (iter->fib6_nh.nh_dev == down_dev ||
3990 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
/* Set @nh_flags on every nexthop of @rt's multipath group whose device
 * is @dev.
 */
3996 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3997 const struct net_device *dev,
3998 unsigned int nh_flags)
4000 struct fib6_info *iter;
4002 if (rt->fib6_nh.nh_dev == dev)
4003 rt->fib6_nh.nh_flags |= nh_flags;
4004 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4005 if (iter->fib6_nh.nh_dev == dev)
4006 iter->fib6_nh.nh_flags |= nh_flags;
4009 /* called with write lock held for table with rt */
/* fib6_clean_all() callback for device-down events. Return -1 to delete
 * the route, 0 to keep it:
 * - UNREGISTER: delete any route whose nexthop is on the device.
 * - DOWN (multipath): mark affected nexthops DEAD; if that would kill
 *   every hop, flush the whole group. Single-path routes are deleted.
 * - CHANGE (carrier loss): flag the nexthop LINKDOWN (local/anycast
 *   routes exempt) and rebalance.
 * NOTE(review): elided view — the NETDEV_DOWN/NETDEV_CHANGE case labels
 * and some returns of the original are not visible here.
 */
4010 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4012 const struct arg_netdev_event *arg = p_arg;
4013 const struct net_device *dev = arg->dev;
4014 struct net *net = dev_net(dev);
4016 if (rt == net->ipv6.fib6_null_entry)
4019 switch (arg->event) {
4020 case NETDEV_UNREGISTER:
4021 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4023 if (rt->should_flush)
4025 if (!rt->fib6_nsiblings)
4026 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4027 if (rt6_multipath_uses_dev(rt, dev)) {
4030 count = rt6_multipath_dead_count(rt, dev);
/* All hops (including this one) would be dead: flush the group. */
4031 if (rt->fib6_nsiblings + 1 == count) {
4032 rt6_multipath_flush(rt);
4035 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4037 fib6_update_sernum(net, rt);
4038 rt6_multipath_rebalance(rt);
4042 if (rt->fib6_nh.nh_dev != dev ||
4043 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4045 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4046 rt6_multipath_rebalance(rt);
/* Run the fib6_ifdown() walker over all tables for a device-down or
 * unregister event.
 */
4053 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4055 struct arg_netdev_event arg = {
4062 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
/* Full IPv6 teardown for a device: sync routes down, flush uncached
 * routes referencing it, and purge its ndisc neighbour entries.
 */
4065 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4067 rt6_sync_down_dev(dev, event);
4068 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4069 neigh_ifdown(&nd_tbl, dev);
/* Walker argument for rt6_mtu_change_route(): the device whose MTU
 * changed (new MTU field elided in this view).
 */
4072 struct rt6_mtu_change_arg {
4073 struct net_device *dev;
/* fib6_clean_all() callback for a device MTU change: update the route's
 * RTAX_MTU metric (unless locked) and propagate the new PMTU into its
 * cached exception routes.
 */
4077 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4079 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4080 struct inet6_dev *idev;
4082 /* In IPv6 pmtu discovery is not optional,
4083 so that RTAX_MTU lock cannot disable it.
4084 We still use this lock to block changes
4085 caused by addrconf/ndisc.
4088 idev = __in6_dev_get(arg->dev);
4092 /* For administrative MTU increase, there is no way to discover
4093 IPv6 PMTU increase, so PMTU increase should be updated here.
4094 Since RFC 1981 doesn't include administrative MTU increase
4095 update PMTU increase is a MUST. (i.e. jumbo frame)
4097 if (rt->fib6_nh.nh_dev == arg->dev &&
4098 !fib6_metric_locked(rt, RTAX_MTU)) {
4099 u32 mtu = rt->fib6_pmtu;
/* Shrink when above the new MTU, or grow if it tracked the old one. */
4101 if (mtu >= arg->mtu ||
4102 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4103 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4105 spin_lock_bh(&rt6_exception_lock);
4106 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4107 spin_unlock_bh(&rt6_exception_lock);
/* Apply a device MTU change to all routes via rt6_mtu_change_route(). */
4112 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4114 struct rt6_mtu_change_arg arg = {
4119 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE:
 * exact lengths for address attributes, integer types for the rest.
 */
4122 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4123 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4124 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4125 [RTA_OIF] = { .type = NLA_U32 },
4126 [RTA_IIF] = { .type = NLA_U32 },
4127 [RTA_PRIORITY] = { .type = NLA_U32 },
4128 [RTA_METRICS] = { .type = NLA_NESTED },
4129 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4130 [RTA_PREF] = { .type = NLA_U8 },
4131 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4132 [RTA_ENCAP] = { .type = NLA_NESTED },
4133 [RTA_EXPIRES] = { .type = NLA_U32 },
4134 [RTA_UID] = { .type = NLA_U32 },
4135 [RTA_MARK] = { .type = NLA_U32 },
4136 [RTA_TABLE] = { .type = NLA_U32 },
4137 [RTA_IP_PROTO] = { .type = NLA_U8 },
4138 [RTA_SPORT] = { .type = NLA_U16 },
4139 [RTA_DPORT] = { .type = NLA_U16 },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config:
 * validate attributes against rtm_ipv6_policy, map rtm_type to route
 * flags, and copy each present attribute into the config.
 * NOTE(review): elided view — several `if (tb[...])` guards and error
 * returns of the original are not visible here.
 */
4142 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4143 struct fib6_config *cfg,
4144 struct netlink_ext_ack *extack)
4147 struct nlattr *tb[RTA_MAX+1];
4151 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4157 rtm = nlmsg_data(nlh);
4158 memset(cfg, 0, sizeof(*cfg));
4160 cfg->fc_table = rtm->rtm_table;
4161 cfg->fc_dst_len = rtm->rtm_dst_len;
4162 cfg->fc_src_len = rtm->rtm_src_len;
4163 cfg->fc_flags = RTF_UP;
4164 cfg->fc_protocol = rtm->rtm_protocol;
4165 cfg->fc_type = rtm->rtm_type;
/* Reject-style route types all become RTF_REJECT; the specific ICMP
 * behavior is recovered from fc_type at lookup time.
 */
4167 if (rtm->rtm_type == RTN_UNREACHABLE ||
4168 rtm->rtm_type == RTN_BLACKHOLE ||
4169 rtm->rtm_type == RTN_PROHIBIT ||
4170 rtm->rtm_type == RTN_THROW)
4171 cfg->fc_flags |= RTF_REJECT;
4173 if (rtm->rtm_type == RTN_LOCAL)
4174 cfg->fc_flags |= RTF_LOCAL;
4176 if (rtm->rtm_flags & RTM_F_CLONED)
4177 cfg->fc_flags |= RTF_CACHE;
4179 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4181 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4182 cfg->fc_nlinfo.nlh = nlh;
4183 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4185 if (tb[RTA_GATEWAY]) {
4186 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4187 cfg->fc_flags |= RTF_GATEWAY;
/* Destination/source prefixes are sent truncated to plen bytes. */
4191 int plen = (rtm->rtm_dst_len + 7) >> 3;
4193 if (nla_len(tb[RTA_DST]) < plen)
4196 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4200 int plen = (rtm->rtm_src_len + 7) >> 3;
4202 if (nla_len(tb[RTA_SRC]) < plen)
4205 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4208 if (tb[RTA_PREFSRC])
4209 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4212 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4214 if (tb[RTA_PRIORITY])
4215 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4217 if (tb[RTA_METRICS]) {
4218 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4219 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4223 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4225 if (tb[RTA_MULTIPATH]) {
4226 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4227 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4229 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4230 cfg->fc_mp_len, extack);
/* Unknown router preference values fall back to MEDIUM (RFC 4191). */
4236 pref = nla_get_u8(tb[RTA_PREF]);
4237 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4238 pref != ICMPV6_ROUTER_PREF_HIGH)
4239 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4240 cfg->fc_flags |= RTF_PREF(pref);
4244 cfg->fc_encap = tb[RTA_ENCAP];
4246 if (tb[RTA_ENCAP_TYPE]) {
4247 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4249 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4254 if (tb[RTA_EXPIRES]) {
4255 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4257 if (addrconf_finite_timeout(timeout)) {
4258 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4259 cfg->fc_flags |= RTF_EXPIRES;
/* One pending nexthop while building a multipath route: the created
 * fib6_info, its per-hop config, and the list linkage.
 * NOTE(review): the `struct rt6_nh {` opening line is elided in this
 * view; these are its members.
 */
4269 struct fib6_info *fib6_info;
4270 struct fib6_config r_cfg;
4271 struct list_head next;
/* Log every nexthop of a failed multipath replace so the admin can
 * audit what may have been partially installed.
 */
4274 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4278 list_for_each_entry(nh, rt6_nh_list, next) {
4279 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4280 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4281 nh->r_cfg.fc_ifindex);
/* Append a nexthop to the pending multipath list, rejecting duplicates
 * (same gateway/device/weight) and converting metrics for the entry.
 * NOTE(review): elided view — duplicate/-ENOMEM returns are missing.
 */
4285 static int ip6_route_info_append(struct net *net,
4286 struct list_head *rt6_nh_list,
4287 struct fib6_info *rt,
4288 struct fib6_config *r_cfg)
4293 list_for_each_entry(nh, rt6_nh_list, next) {
4294 /* check if fib6_info already exists */
4295 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4299 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4303 err = ip6_convert_metrics(net, rt, r_cfg);
4308 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4309 list_add_tail(&nh->next, rt6_nh_list);
/* Send a single RTM_NEWROUTE notification covering a whole multipath
 * operation.  For APPEND, rewind to the first sibling of the last route
 * inserted so userspace sees a dump that starts at the first nexthop.
 */
4314 static void ip6_route_mpath_notify(struct fib6_info *rt,
4315 				   struct fib6_info *rt_last,
4316 				   struct nl_info *info,
4319 	/* if this is an APPEND route, then rt points to the first route
4320 	 * inserted and rt_last points to last route inserted. Userspace
4321 	 * wants a consistent dump of the route which starts at the first
4322 	 * nexthop. Since sibling routes are always added at the end of
4323 	 * the list, find the first sibling of the last route appended
4325 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4326 		rt = list_first_entry(&rt_last->fib6_siblings,
4332 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Add (or replace/append, per the nlmsg flags) an IPv6 multipath route.
 *
 * Two-phase approach: first parse every RTA_MULTIPATH rtnexthop entry and
 * build a private list (rt6_nh_list) of fully-created fib6_info structs;
 * only then insert them into the FIB one by one.  On a mid-insert failure
 * the already-inserted routes are deleted again (and, for replace, a
 * consistency warning is printed), so the operation is as atomic as the
 * FIB allows.  A single netlink notification is sent for the whole route.
 */
4335 static int ip6_route_multipath_add(struct fib6_config *cfg,
4336 				   struct netlink_ext_ack *extack)
4338 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4339 	struct nl_info *info = &cfg->fc_nlinfo;
4340 	struct fib6_config r_cfg;
4341 	struct rtnexthop *rtnh;
4342 	struct fib6_info *rt;
4343 	struct rt6_nh *err_nh;
4344 	struct rt6_nh *nh, *nh_safe;
4350 	int replace = (cfg->fc_nlinfo.nlh &&
4351 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4352 	LIST_HEAD(rt6_nh_list);
4354 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4355 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4356 		nlflags |= NLM_F_APPEND;
4358 	remaining = cfg->fc_mp_len;
4359 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4361 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4362 	 * fib6_info structs per nexthop
4364 	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config starts as a copy of the route-level one */
4365 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4366 		if (rtnh->rtnh_ifindex)
4367 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4369 		attrlen = rtnh_attrlen(rtnh);
4371 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4373 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4375 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4376 				r_cfg.fc_flags |= RTF_GATEWAY;
4378 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4379 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4381 				r_cfg.fc_encap_type = nla_get_u16(nla);
4384 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4385 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		/* device-only (no-gateway) routes cannot be ECMP members */
4391 		if (!rt6_qualify_for_ecmp(rt)) {
4393 			NL_SET_ERR_MSG(extack,
4394 				       "Device only routes can not be added for IPv6 using the multipath API.");
4395 			fib6_info_release(rt);
		/* rtnh_hops is weight-1 on the wire */
4399 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4401 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4404 			fib6_info_release(rt);
4408 		rtnh = rtnh_next(rtnh, &remaining);
4411 	/* for add and replace send one notification with all nexthops.
4412 	 * Skip the notification in fib6_add_rt2node and send one with
4413 	 * the full route when done
4415 	info->skip_notify = 1;
4418 	list_for_each_entry(nh, &rt6_nh_list, next) {
4419 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4420 		fib6_info_release(nh->fib6_info);
4423 			/* save reference to last route successfully inserted */
4424 			rt_last = nh->fib6_info;
4426 			/* save reference to first route for notification */
4428 				rt_notif = nh->fib6_info;
4431 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
4432 		nh->fib6_info = NULL;
4435 			ip6_print_replace_route_err(&rt6_nh_list);
4440 		/* Because each route is added like a single route we remove
4441 		 * these flags after the first nexthop: if there is a collision,
4442 		 * we have already failed to add the first nexthop:
4443 		 * fib6_add_rt2node() has rejected it; when replacing, old
4444 		 * nexthops have been replaced by first new, the rest should
4447 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4452 	/* success ... tell user about new route */
4453 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4457 	/* send notification for routes that were added so that
4458 	 * the delete notifications sent by ip6_route_del are
4462 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4464 	/* Delete routes that were already added */
4465 	list_for_each_entry(nh, &rt6_nh_list, next) {
4468 		ip6_route_del(&nh->r_cfg, extack);
	/* free the bookkeeping list itself (safe variant: entries are freed) */
4472 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4474 			fib6_info_release(nh->fib6_info);
4475 		list_del(&nh->next);
/* Delete each nexthop of a multipath route: walk the RTA_MULTIPATH
 * rtnexthop entries, derive a per-nexthop fib6_config (ifindex/gateway
 * overrides) and call ip6_route_del() for each.  The last failing errno
 * is remembered in last_err and becomes the return value.
 */
4482 static int ip6_route_multipath_del(struct fib6_config *cfg,
4483 				   struct netlink_ext_ack *extack)
4485 	struct fib6_config r_cfg;
4486 	struct rtnexthop *rtnh;
4489 	int err = 1, last_err = 0;
4491 	remaining = cfg->fc_mp_len;
4492 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4494 	/* Parse a Multipath Entry */
4495 	while (rtnh_ok(rtnh, remaining)) {
4496 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4497 		if (rtnh->rtnh_ifindex)
4498 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4500 		attrlen = rtnh_attrlen(rtnh);
4502 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4504 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			/* 16 bytes == sizeof(struct in6_addr) */
4506 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4507 				r_cfg.fc_flags |= RTF_GATEWAY;
4510 		err = ip6_route_del(&r_cfg, extack);
4514 		rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE doit handler: convert the netlink message into a
 * fib6_config, then dispatch to the multipath or single-route delete path.
 */
4520 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4521 			      struct netlink_ext_ack *extack)
4523 	struct fib6_config cfg;
4526 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4531 		return ip6_route_multipath_del(&cfg, extack);
	/* single-route delete removes all nexthops of a multipath route */
4533 	cfg.fc_delete_all_nh = 1;
4534 	return ip6_route_del(&cfg, extack);
/* RTM_NEWROUTE doit handler: parse the message into a fib6_config and
 * dispatch to the multipath or single-route add path.
 */
4538 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4539 			      struct netlink_ext_ack *extack)
4541 	struct fib6_config cfg;
4544 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4549 		return ip6_route_multipath_add(&cfg, extack);
4551 	return ip6_route_add(&cfg, GFP_KERNEL, extack);
/* Worst-case netlink message size for a route notification about @rt,
 * used to size the skb in inet6_rt_notify().  For multipath routes the
 * per-nexthop cost is multiplied by the sibling count.  Must stay in sync
 * with what rt6_fill_node() actually emits, or the -EMSGSIZE WARN fires.
 */
4554 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4556 	int nexthop_len = 0;
4558 	if (rt->fib6_nsiblings) {
4559 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4560 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4561 			    + nla_total_size(16) /* RTA_GATEWAY */
4562 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4564 		nexthop_len *= rt->fib6_nsiblings;
4567 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4568 	       + nla_total_size(16) /* RTA_SRC */
4569 	       + nla_total_size(16) /* RTA_DST */
4570 	       + nla_total_size(16) /* RTA_GATEWAY */
4571 	       + nla_total_size(16) /* RTA_PREFSRC */
4572 	       + nla_total_size(4) /* RTA_TABLE */
4573 	       + nla_total_size(4) /* RTA_IIF */
4574 	       + nla_total_size(4) /* RTA_OIF */
4575 	       + nla_total_size(4) /* RTA_PRIORITY */
4576 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4577 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4578 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4579 	       + nla_total_size(1) /* RTA_PREF */
4580 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
/* Emit the nexthop attributes (gateway, oif, lwtunnel encap) for @rt into
 * @skb and accumulate RTNH_F_* status bits into *@flags.  @skip_oif is set
 * by the multipath encoder, where the ifindex lives in struct rtnexthop.
 * Returns 0 on success, -EMSGSIZE on attribute put failure.
 */
4584 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4585 			    unsigned int *flags, bool skip_oif)
4587 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4588 		*flags |= RTNH_F_DEAD;
4590 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4591 		*flags |= RTNH_F_LINKDOWN;
		/* report a linkdown nexthop as dead when ignore_routes_with_linkdown applies */
4594 		if (fib6_ignore_linkdown(rt))
4595 			*flags |= RTNH_F_DEAD;
4599 	if (rt->fib6_flags & RTF_GATEWAY) {
4600 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4601 			goto nla_put_failure;
4604 	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4605 	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4606 		*flags |= RTNH_F_OFFLOAD;
4608 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
4609 	if (!skip_oif && rt->fib6_nh.nh_dev &&
4610 	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4611 		goto nla_put_failure;
4613 	if (rt->fib6_nh.nh_lwtstate &&
4614 	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4615 		goto nla_put_failure;
/* add multipath next hop */
/* Encode one nexthop of a multipath route as a struct rtnexthop followed
 * by its attributes (via rt6_nexthop_info with skip_oif=true), then patch
 * rtnh_len with the total encoded length.  -EMSGSIZE on overflow.
 */
4624 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4626 	const struct net_device *dev = rt->fib6_nh.nh_dev;
4627 	struct rtnexthop *rtnh;
4628 	unsigned int flags = 0;
4630 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4632 		goto nla_put_failure;
	/* weight is stored internally as hops+1; wire format is weight-1 */
4634 	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4635 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4637 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4638 		goto nla_put_failure;
4640 	rtnh->rtnh_flags = flags;
4642 	/* length of rtnetlink header + attributes */
4643 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* Build a complete RTM_NEWROUTE/RTM_DELROUTE netlink message for @rt.
 *
 * @dst/@dest/@src/@iif come from the getroute path: when non-NULL they
 * describe the resolved lookup (dst_entry, destination/source addresses,
 * input interface) and override the FIB entry's own prefix information.
 * Multipath routes are encoded as an RTA_MULTIPATH nest containing one
 * rtnexthop per sibling.  Returns 0 on success, -EMSGSIZE if the skb is
 * too small (message is cancelled in that case).
 */
4651 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4652 			 struct fib6_info *rt, struct dst_entry *dst,
4653 			 struct in6_addr *dest, struct in6_addr *src,
4654 			 int iif, int type, u32 portid, u32 seq,
4658 	struct nlmsghdr *nlh;
4663 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4667 	rtm = nlmsg_data(nlh);
4668 	rtm->rtm_family = AF_INET6;
4669 	rtm->rtm_dst_len = rt->fib6_dst.plen;
4670 	rtm->rtm_src_len = rt->fib6_src.plen;
4673 		table = rt->fib6_table->tb6_id;
4675 		table = RT6_TABLE_UNSPEC;
4676 	rtm->rtm_table = table;
4677 	if (nla_put_u32(skb, RTA_TABLE, table))
4678 		goto nla_put_failure;
4680 	rtm->rtm_type = rt->fib6_type;
4682 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4683 	rtm->rtm_protocol = rt->fib6_protocol;
4685 	if (rt->fib6_flags & RTF_CACHE)
4686 		rtm->rtm_flags |= RTM_F_CLONED;
	/* a caller-supplied destination (getroute) is reported as a /128 */
4689 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4690 			goto nla_put_failure;
4691 		rtm->rtm_dst_len = 128;
4692 	} else if (rtm->rtm_dst_len)
4693 		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4694 			goto nla_put_failure;
4695 #ifdef CONFIG_IPV6_SUBTREES
4697 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4698 			goto nla_put_failure;
4699 		rtm->rtm_src_len = 128;
4700 	} else if (rtm->rtm_src_len &&
4701 		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4702 		goto nla_put_failure;
4705 #ifdef CONFIG_IPV6_MROUTE
	/* multicast destinations are resolved through the mroute engine */
4706 		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4707 			int err = ip6mr_get_route(net, skb, rtm, portid);
4712 					goto nla_put_failure;
4715 			if (nla_put_u32(skb, RTA_IIF, iif))
4716 				goto nla_put_failure;
4718 		struct in6_addr saddr_buf;
4719 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4720 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4721 			goto nla_put_failure;
4724 	if (rt->fib6_prefsrc.plen) {
4725 		struct in6_addr saddr_buf;
4726 		saddr_buf = rt->fib6_prefsrc.addr;
4727 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4728 			goto nla_put_failure;
	/* prefer live dst metrics over the FIB entry's when available */
4731 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4732 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4733 		goto nla_put_failure;
4735 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4736 		goto nla_put_failure;
4738 	/* For multipath routes, walk the siblings list and add
4739 	 * each as a nexthop within RTA_MULTIPATH.
4741 	if (rt->fib6_nsiblings) {
4742 		struct fib6_info *sibling, *next_sibling;
4745 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4747 			goto nla_put_failure;
4749 		if (rt6_add_nexthop(skb, rt) < 0)
4750 			goto nla_put_failure;
4752 		list_for_each_entry_safe(sibling, next_sibling,
4753 					 &rt->fib6_siblings, fib6_siblings) {
4754 			if (rt6_add_nexthop(skb, sibling) < 0)
4755 				goto nla_put_failure;
4758 		nla_nest_end(skb, mp);
4760 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4761 			goto nla_put_failure;
4764 	if (rt->fib6_flags & RTF_EXPIRES) {
4765 		expires = dst ? dst->expires : rt->expires;
4769 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4770 		goto nla_put_failure;
4772 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4773 		goto nla_put_failure;
4776 	nlmsg_end(skb, nlh);
4780 	nlmsg_cancel(skb, nlh);
/* FIB walker callback for RTM_GETROUTE dumps: emit one route via
 * rt6_fill_node().  Skips the null entry and, when the request set
 * RTM_F_PREFIX, skips non-prefix routes (returning success so the
 * dump continues).
 */
4784 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4786 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4787 	struct net *net = arg->net;
4789 	if (rt == net->ipv6.fib6_null_entry)
4792 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4793 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4795 		/* user wants prefix routes only */
4796 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4797 		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4798 			/* success since this is not a prefix route */
4803 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4804 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4805 			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
/* RTM_GETROUTE doit handler: build a flowi6 from the request attributes,
 * perform an input-side lookup (when RTA_IIF is given) or an output
 * lookup, and answer with a single RTM_NEWROUTE message.  With
 * RTM_F_FIB_MATCH set, the matched FIB entry is reported instead of the
 * resolved route.
 */
4808 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4809 			      struct netlink_ext_ack *extack)
4811 	struct net *net = sock_net(in_skb->sk);
4812 	struct nlattr *tb[RTA_MAX+1];
4813 	int err, iif = 0, oif = 0;
4814 	struct fib6_info *from;
4815 	struct dst_entry *dst;
4816 	struct rt6_info *rt;
4817 	struct sk_buff *skb;
4822 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4828 	memset(&fl6, 0, sizeof(fl6));
4829 	rtm = nlmsg_data(nlh);
4830 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4831 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
	/* reject truncated address attributes before copying */
4834 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4837 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4841 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4844 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4848 		iif = nla_get_u32(tb[RTA_IIF]);
4851 		oif = nla_get_u32(tb[RTA_OIF]);
4854 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4857 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4858 					   nla_get_u32(tb[RTA_UID]));
4860 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4863 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4866 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4868 	if (tb[RTA_IP_PROTO]) {
4869 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4870 						  &fl6.flowi6_proto, extack);
	/* input-side lookup: emulate packet reception on the given iif */
4876 		struct net_device *dev;
4881 		dev = dev_get_by_index_rcu(net, iif);
4888 		fl6.flowi6_iif = iif;
4890 		if (!ipv6_addr_any(&fl6.saddr))
4891 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4893 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4897 		fl6.flowi6_oif = oif;
4899 		dst = ip6_route_output(net, NULL, &fl6);
4903 	rt = container_of(dst, struct rt6_info, dst);
4904 	if (rt->dst.error) {
4905 		err = rt->dst.error;
4910 	if (rt == net->ipv6.ip6_null_entry) {
4911 		err = rt->dst.error;
4916 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4923 	skb_dst_set(skb, &rt->dst);
4926 	from = rcu_dereference(rt->from);
	/* fibmatch: report the FIB entry; otherwise the resolved route */
4929 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4930 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4933 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4934 				    &fl6.saddr, iif, RTM_NEWROUTE,
4935 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4944 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Broadcast a route change (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE) to the
 * RTNLGRP_IPV6_ROUTE multicast group.  The skb is sized by rt6_nlmsg_size();
 * an -EMSGSIZE from rt6_fill_node() therefore indicates a sizing bug and
 * triggers the WARN_ON.  On allocation/fill failure the group gets an error
 * via rtnl_set_sk_err() instead.
 */
4949 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4950 		     unsigned int nlm_flags)
4952 	struct sk_buff *skb;
4953 	struct net *net = info->nl_net;
4958 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4960 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4964 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4965 			    event, info->portid, seq, nlm_flags);
4967 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4968 		WARN_ON(err == -EMSGSIZE);
4972 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4973 		    info->nlh, gfp_any());
4977 	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier: only cares about the loopback device.  On REGISTER,
 * point the per-netns special route entries (null / prohibit / blackhole)
 * at loopback; on UNREGISTER, drop the idev references — guarded so the
 * repeated UNREGISTER events from netdev_wait_allrefs() release only once.
 */
4980 static int ip6_route_dev_notify(struct notifier_block *this,
4981 				unsigned long event, void *ptr)
4983 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4984 	struct net *net = dev_net(dev);
4986 	if (!(dev->flags & IFF_LOOPBACK))
4989 	if (event == NETDEV_REGISTER) {
4990 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4991 		net->ipv6.ip6_null_entry->dst.dev = dev;
4992 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4993 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4994 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4995 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4996 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4997 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4999 	} else if (event == NETDEV_UNREGISTER &&
5000 		   dev->reg_state != NETREG_UNREGISTERED) {
5001 		/* NETDEV_UNREGISTER could be fired for multiple times by
5002 		 * netdev_wait_allrefs(). Make sure we only call this once.
5004 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5005 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5006 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5007 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#ifdef CONFIG_PROC_FS
/* /proc/net/rt6_stats: one line of seven hex counters describing the
 * per-netns IPv6 FIB (node/route/cache counts and dst entry usage).
 */
5019 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5021 	struct net *net = (struct net *)seq->private;
5022 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5023 		   net->ipv6.rt6_stats->fib_nodes,
5024 		   net->ipv6.rt6_stats->fib_route_nodes,
5025 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5026 		   net->ipv6.rt6_stats->fib_rt_entries,
5027 		   net->ipv6.rt6_stats->fib_rt_cache,
5028 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5029 		   net->ipv6.rt6_stats->fib_discarded_routes);
5033 #endif	/* CONFIG_PROC_FS */
#ifdef CONFIG_SYSCTL
/* Handler for net.ipv6.route.flush: writing a value triggers an immediate
 * garbage-collection pass over the route cache.  The delay read before
 * proc_dointvec() determines the GC expiry passed to fib6_run_gc().
 */
5038 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5039 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5046 	net = (struct net *)ctl->extra1;
5047 	delay = net->ipv6.sysctl.flush_delay;
5048 	proc_dointvec(ctl, write, buffer, lenp, ppos);
5049 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route.* sysctl table; cloned and
 * repointed at each netns's own fields by ipv6_route_sysctl_init().
 * NOTE: ipv6_route_sysctl_init() patches entries by index — keep the
 * order here in sync with the table[N].data assignments there.
 */
5053 struct ctl_table ipv6_route_table_template[] = {
5055 		.procname	=	"flush",
5056 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5057 		.maxlen		=	sizeof(int),
5059 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5062 		.procname	=	"gc_thresh",
5063 		.data		=	&ip6_dst_ops_template.gc_thresh,
5064 		.maxlen		=	sizeof(int),
5066 		.proc_handler	=	proc_dointvec,
5069 		.procname	=	"max_size",
5070 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5071 		.maxlen		=	sizeof(int),
5073 		.proc_handler	=	proc_dointvec,
5076 		.procname	=	"gc_min_interval",
5077 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5078 		.maxlen		=	sizeof(int),
5080 		.proc_handler	=	proc_dointvec_jiffies,
5083 		.procname	=	"gc_timeout",
5084 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5085 		.maxlen		=	sizeof(int),
5087 		.proc_handler	=	proc_dointvec_jiffies,
5090 		.procname	=	"gc_interval",
5091 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5092 		.maxlen		=	sizeof(int),
5094 		.proc_handler	=	proc_dointvec_jiffies,
5097 		.procname	=	"gc_elasticity",
5098 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5099 		.maxlen		=	sizeof(int),
5101 		.proc_handler	=	proc_dointvec,
5104 		.procname	=	"mtu_expires",
5105 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5106 		.maxlen		=	sizeof(int),
5108 		.proc_handler	=	proc_dointvec_jiffies,
5111 		.procname	=	"min_adv_mss",
5112 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5113 		.maxlen		=	sizeof(int),
5115 		.proc_handler	=	proc_dointvec,
5118 		.procname	=	"gc_min_interval_ms",
5119 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5120 		.maxlen		=	sizeof(int),
5122 		.proc_handler	=	proc_dointvec_ms_jiffies,
/* Clone ipv6_route_table_template for a new netns and repoint each entry's
 * .data at the netns-local field.  Indices must match the template order.
 * For non-init user namespaces, the "flush" entry is disabled by clearing
 * its procname (unprivileged users must not trigger cache flushes).
 */
5127 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5129 	struct ctl_table *table;
5131 	table = kmemdup(ipv6_route_table_template,
5132 			sizeof(ipv6_route_table_template),
5136 		table[0].data = &net->ipv6.sysctl.flush_delay;
5137 		table[0].extra1 = net;
5138 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5139 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5140 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5141 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5142 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5143 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5144 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5145 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5146 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5148 		/* Don't export sysctls to unprivileged users */
5149 		if (net->user_ns != &init_user_ns)
5150 			table[0].procname = NULL;
/* Per-netns setup: copy the dst_ops template, allocate the special route
 * entries (fib6_null, ip6_null, and — with multiple tables — prohibit and
 * blackhole), wire their dst.ops/metrics, and seed the sysctl defaults.
 * Unwinds in reverse order via the labelled cleanup chain on any failure.
 */
5157 static int __net_init ip6_route_net_init(struct net *net)
5161 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5162 	       sizeof(net->ipv6.ip6_dst_ops));
5164 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5165 		goto out_ip6_dst_ops;
5167 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5168 					    sizeof(*net->ipv6.fib6_null_entry),
5170 	if (!net->ipv6.fib6_null_entry)
5171 		goto out_ip6_dst_entries;
5173 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5174 					   sizeof(*net->ipv6.ip6_null_entry),
5176 	if (!net->ipv6.ip6_null_entry)
5177 		goto out_fib6_null_entry;
5178 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5179 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5180 			 ip6_template_metrics, true);
5182 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5183 	net->ipv6.fib6_has_custom_rules = false;
5184 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5185 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5187 	if (!net->ipv6.ip6_prohibit_entry)
5188 		goto out_ip6_null_entry;
5189 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5190 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5191 			 ip6_template_metrics, true);
5193 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5194 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5196 	if (!net->ipv6.ip6_blk_hole_entry)
5197 		goto out_ip6_prohibit_entry;
5198 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5199 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5200 			 ip6_template_metrics, true);
	/* sysctl defaults; tunable later via net.ipv6.route.* */
5203 	net->ipv6.sysctl.flush_delay = 0;
5204 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5205 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5206 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5207 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5208 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5209 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5210 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5212 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5218 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5219 out_ip6_prohibit_entry:
5220 	kfree(net->ipv6.ip6_prohibit_entry);
5222 	kfree(net->ipv6.ip6_null_entry);
5224 out_fib6_null_entry:
5225 	kfree(net->ipv6.fib6_null_entry);
5226 out_ip6_dst_entries:
5227 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and destroy the dst entry counter.
 */
5232 static void __net_exit ip6_route_net_exit(struct net *net)
5234 	kfree(net->ipv6.fib6_null_entry);
5235 	kfree(net->ipv6.ip6_null_entry);
5236 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5237 	kfree(net->ipv6.ip6_prohibit_entry);
5238 	kfree(net->ipv6.ip6_blk_hole_entry);
5240 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries (procfs builds only).
 */
5243 static int __net_init ip6_route_net_init_late(struct net *net)
5245 #ifdef CONFIG_PROC_FS
5246 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5247 			sizeof(struct ipv6_route_iter));
5248 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5249 			rt6_stats_seq_show, NULL);
/* Late per-netns teardown: remove the proc entries created in
 * ip6_route_net_init_late().
 */
5254 static void __net_exit ip6_route_net_exit_late(struct net *net)
5256 #ifdef CONFIG_PROC_FS
5257 	remove_proc_entry("ipv6_route", net->proc_net);
5258 	remove_proc_entry("rt6_stats", net->proc_net);
/* Main per-netns lifecycle hooks for the IPv6 routing core. */
5262 static struct pernet_operations ip6_route_net_ops = {
5263 	.init = ip6_route_net_init,
5264 	.exit = ip6_route_net_exit,
/* Allocate and initialize the per-netns IPv6 inet_peer base. */
5267 static int __net_init ipv6_inetpeer_init(struct net *net)
5269 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5273 	inet_peer_base_init(bp);
5274 	net->ipv6.peers = bp;
/* Tear down the per-netns inet_peer base; NULL the pointer before
 * invalidating the tree.
 */
5278 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5280 	struct inet_peer_base *bp = net->ipv6.peers;
5282 	net->ipv6.peers = NULL;
5283 	inetpeer_invalidate_tree(bp);
/* Per-netns lifecycle hooks for the IPv6 inet_peer base. */
5287 static struct pernet_operations ipv6_inetpeer_ops = {
5288 	.init	=	ipv6_inetpeer_init,
5289 	.exit	=	ipv6_inetpeer_exit,
/* Late per-netns hooks (proc entries); registered after fib6/rules init. */
5292 static struct pernet_operations ip6_route_net_late_ops = {
5293 	.init = ip6_route_net_init_late,
5294 	.exit = ip6_route_net_exit_late,
/* Runs before addrconf's notifier (lower priority value = earlier) so the
 * special route entries point at loopback before addrconf uses them.
 */
5297 static struct notifier_block ip6_route_dev_notifier = {
5298 	.notifier_call = ip6_route_dev_notify,
5299 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* Boot-time fixup for init_net only: the loopback device registers before
 * the ip6_route_dev_notifier exists, so attach the special route entries
 * to init_net's loopback by hand here.
 */
5302 void __init ip6_route_init_special_entries(void)
5304 	/* Registering of the loopback is done before this portion of code,
5305 	 * the loopback reference in rt6_info will not be taken, do it
5306 	 * manually for init_net */
5307 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5308 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5309 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5310 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5311 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5312 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5313 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5314 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* Module/boot init for the IPv6 routing subsystem.  Order matters:
 * dst cache -> inetpeer pernet ops -> route pernet ops -> fib6 rules ->
 * late pernet ops -> rtnetlink handlers -> netdev notifier -> per-cpu
 * uncached-route lists.  Each failure unwinds everything registered so
 * far through the labelled error chain at the bottom.
 */
5318 int __init ip6_route_init(void)
5324 	ip6_dst_ops_template.kmem_cachep =
5325 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5326 				  SLAB_HWCACHE_ALIGN, NULL);
5327 	if (!ip6_dst_ops_template.kmem_cachep)
5330 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5332 		goto out_kmem_cache;
5334 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5336 		goto out_dst_entries;
5338 	ret = register_pernet_subsys(&ip6_route_net_ops);
5340 		goto out_register_inetpeer;
	/* blackhole dsts share the same slab cache as regular rt6_info */
5342 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5346 		goto out_register_subsys;
5352 	ret = fib6_rules_init();
5356 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5358 		goto fib6_rules_init;
5360 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5361 				   inet6_rtm_newroute, NULL, 0);
5363 		goto out_register_late_subsys;
5365 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5366 				   inet6_rtm_delroute, NULL, 0);
5368 		goto out_register_late_subsys;
5370 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5371 				   inet6_rtm_getroute, NULL,
5372 				   RTNL_FLAG_DOIT_UNLOCKED);
5374 		goto out_register_late_subsys;
5376 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5378 		goto out_register_late_subsys;
5380 	for_each_possible_cpu(cpu) {
5381 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5383 		INIT_LIST_HEAD(&ul->head);
5384 		spin_lock_init(&ul->lock);
5390 out_register_late_subsys:
5391 	rtnl_unregister_all(PF_INET6);
5392 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5394 	fib6_rules_cleanup();
5399 out_register_subsys:
5400 	unregister_pernet_subsys(&ip6_route_net_ops);
5401 out_register_inetpeer:
5402 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5404 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5406 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5410 void ip6_route_cleanup(void)
5412 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5413 unregister_pernet_subsys(&ip6_route_net_late_ops);
5414 fib6_rules_cleanup();
5417 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5418 unregister_pernet_subsys(&ip6_route_net_ops);
5419 dst_entries_destroy(&ip6_dst_blackhole_ops);
5420 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);