Merge tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/rtnh.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Result of neighbour reachability evaluation (rt6_check_neigh()).
 * Negative values are failures; find_match() treats FAIL_HARD as
 * "skip this route", FAIL_DO_RR as "usable, but trigger round-robin",
 * and FAIL_PROBE as the lowest-priority usable score.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106                            int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109                          struct fib6_info *rt, struct dst_entry *dst,
110                          struct in6_addr *dest, struct in6_addr *src,
111                          int iif, int type, u32 portid, u32 seq,
112                          unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114                                            const struct in6_addr *daddr,
115                                            const struct in6_addr *saddr);
116
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev,
122                                            unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124                                            const struct in6_addr *prefix, int prefixlen,
125                                            const struct in6_addr *gwaddr,
126                                            struct net_device *dev);
127 #endif
128
/* Per-cpu list of "uncached" rt6_info entries (dsts not linked into the
 * FIB tree), protected by its own spinlock.  Entries are rehomed to the
 * loopback device when their device goes away; see
 * rt6_uncached_list_flush_dev().
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
/* Link @rt onto the current CPU's uncached list so device-down events
 * can find and fix it up later.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	/* remember which per-cpu list we joined; removal may run on a
	 * different CPU (see rt6_uncached_list_del()).
	 */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
146
/* Unlink @rt from the per-cpu uncached list it was added to (if any)
 * and drop the namespace's uncached-route counter.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		/* use the list recorded at add time, not this CPU's */
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
159
/* A device is going away: walk every CPU's uncached list and move any
 * route still referencing @dev over to the loopback device, swapping
 * the idev and netdev references.  Nothing to do when loopback itself
 * is the device being removed.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* replace the inet6_dev reference with loopback's */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* likewise swap the netdev reference itself */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
191
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193                                              struct sk_buff *skb,
194                                              const void *daddr)
195 {
196         if (!ipv6_addr_any(p))
197                 return (const void *) p;
198         else if (skb)
199                 return &ipv6_hdr(skb)->daddr;
200         return daddr;
201 }
202
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204                                    struct net_device *dev,
205                                    struct sk_buff *skb,
206                                    const void *daddr)
207 {
208         struct neighbour *n;
209
210         daddr = choose_neigh_daddr(gw, skb, daddr);
211         n = __ipv6_neigh_lookup(dev, daddr);
212         if (n)
213                 return n;
214
215         n = neigh_create(&nd_tbl, daddr, dev);
216         return IS_ERR(n) ? NULL : n;
217 }
218
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220                                               struct sk_buff *skb,
221                                               const void *daddr)
222 {
223         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224
225         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230         struct net_device *dev = dst->dev;
231         struct rt6_info *rt = (struct rt6_info *)dst;
232
233         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234         if (!daddr)
235                 return;
236         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237                 return;
238         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239                 return;
240         __ipv6_confirm_neigh(dev, daddr);
241 }
242
/* dst_ops callbacks for ordinary IPv6 dst entries; the "_template"
 * name suggests it is copied per netns (ip6_dst_alloc() uses
 * net->ipv6.ip6_dst_ops) -- the copy itself is outside this chunk.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
261
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265
266         return mtu ? : dst->dev->mtu;
267 }
268
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
273
/* Blackhole dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
278
/* dst_ops for blackhole dsts: same lookup/metrics callbacks as regular
 * routes, but PMTU and redirect notifications are no-ops and there is
 * no gc/ifdown handling.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
290
/* Metrics template: only RTAX_HOPLIMIT is listed, and it is 0
 * (i.e. "unset", explicitly spelled out).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
294
/* Template for the per-netns fib6_null_entry: a permanently-referenced
 * RTN_UNREACHABLE reject route with the worst possible metric.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
303
/* Template for the "no route" dst: discards packets and reports
 * -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
315
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317
/* Policy-routing "prohibit" dst: discards packets with -EACCES
 * (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
329
/* Policy-routing "blackhole" dst: silently discards packets via the
 * generic dst_discard handlers (error is -EINVAL for local users).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
341
342 #endif
343
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst_alloc() initialized the embedded dst_entry; zero only the
	 * rt6_info-specific fields that follow it (dst + 1 points just
	 * past the dst_entry inside *rt).
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
351
/* allocate dst with ip6_dst_ops */
/* Allocate a zero-initialized rt6_info on @net's ip6 dst_ops with one
 * initial reference.  Returns NULL on allocation failure.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		/* account the allocation in per-netns fib stats */
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
367
/* dst_ops->destroy: release everything an rt6_info holds -- metrics,
 * uncached-list membership, the inet6_dev reference, and the
 * originating fib6_info.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* atomically steal rt->from so the fib6_info reference is
	 * dropped exactly once even if another path clears it too
	 */
	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}
386
/* dst_ops->ifdown: the route's device is going down; rehome the
 * inet6_dev reference to the namespace's loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		/* keep the old idev if loopback has none to give us */
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
/* Full expiry check: the rt6_info's own RTF_EXPIRES deadline, or --
 * for routes still tied to a fib6_info -- staleness of the dst or
 * expiry of the origin.  Uses rcu_dereference(), so the caller must be
 * in an RCU read-side section.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
427
/* Multipath selection: hash the flow and pick the sibling whose
 * fib_nh_upper_bound covers the hash, falling back to res->f6i when a
 * candidate sibling scores unusable.  Always (re)fills res->f6i/nh.
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	/* single nexthop, or an oif match already chose the route */
	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* the first route covers hashes up to its own upper bound */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* sibling owning this hash range is unusable: keep match */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
465
466 /*
467  *      Route lookup. rcu_read_lock() should be held.
468  */
469
470 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
471                                const struct in6_addr *saddr, int oif, int flags)
472 {
473         const struct net_device *dev;
474
475         if (nh->fib_nh_flags & RTNH_F_DEAD)
476                 return false;
477
478         dev = nh->fib_nh_dev;
479         if (oif) {
480                 if (dev->ifindex == oif)
481                         return true;
482         } else {
483                 if (ipv6_chk_addr(net, saddr, dev,
484                                   flags & RT6_LOOKUP_F_IFACE))
485                         return true;
486         }
487
488         return false;
489 }
490
/* Choose among same-prefix routes the one whose nexthop matches the
 * device/source-address constraints, filling in @res.  Falls back to
 * the null entry when a strict interface match fails or the first
 * route's nexthop is dead.
 */
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	/* no constraints: take the first route unless its nh is dead */
	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	/* strict oif requested but nothing matched it */
	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}
528
529 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation transmit for router reachability
 * probing; carries its own device reference, dropped (with the work
 * item itself freed) in rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
535
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538         struct in6_addr mcaddr;
539         struct __rt6_probe_work *work =
540                 container_of(w, struct __rt6_probe_work, work);
541
542         addrconf_addr_solict_mult(&work->target, &mcaddr);
543         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
544         dev_put(work->dev);
545         kfree(work);
546 }
547
548 static void rt6_probe(struct fib6_nh *fib6_nh)
549 {
550         struct __rt6_probe_work *work = NULL;
551         const struct in6_addr *nh_gw;
552         struct neighbour *neigh;
553         struct net_device *dev;
554         struct inet6_dev *idev;
555
556         /*
557          * Okay, this does not seem to be appropriate
558          * for now, however, we need to check if it
559          * is really so; aka Router Reachability Probing.
560          *
561          * Router Reachability Probe MUST be rate-limited
562          * to no more than one per minute.
563          */
564         if (fib6_nh->fib_nh_gw_family)
565                 return;
566
567         nh_gw = &fib6_nh->fib_nh_gw6;
568         dev = fib6_nh->fib_nh_dev;
569         rcu_read_lock_bh();
570         idev = __in6_dev_get(dev);
571         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
572         if (neigh) {
573                 if (neigh->nud_state & NUD_VALID)
574                         goto out;
575
576                 write_lock(&neigh->lock);
577                 if (!(neigh->nud_state & NUD_VALID) &&
578                     time_after(jiffies,
579                                neigh->updated + idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else if (time_after(jiffies, fib6_nh->last_probe +
586                                        idev->cnf.rtr_probe_interval)) {
587                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588         }
589
590         if (work) {
591                 fib6_nh->last_probe = jiffies;
592                 INIT_WORK(&work->work, rt6_probe_deferred);
593                 work->target = *nh_gw;
594                 dev_hold(dev);
595                 work->dev = dev;
596                 schedule_work(&work->work);
597         }
598
599 out:
600         rcu_read_unlock_bh();
601 }
602 #else
/* CONFIG_IPV6_ROUTER_PREF disabled: router probing compiles away. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
/* Classify the reachability of a nexthop's gateway neighbour.  Without
 * CONFIG_IPV6_ROUTER_PREF only NUD_VALID succeeds and a missing entry
 * requests round-robin; with it, anything not NUD_FAILED counts as
 * usable and failures request a probe.
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
638
/* Score a nexthop for route selection: 2 for matching (or absent) oif,
 * plus the RA-encoded router preference shifted above it.  Returns a
 * negative rt6_nud_state when the route must not be used.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	/* reachability only applies to gatewayed, non-NONEXTHOP routes */
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}
660
/* Score @nh against the current best (*mpri).  Returns true when @nh
 * becomes the new best; *do_rr is set when the score came back as
 * RT6_NUD_FAIL_DO_RR, asking the caller to rotate the round-robin
 * pointer.  Dead or (optionally) link-down nexthops never match.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
696
/* Walk routes from @f6i_start (stopping before @nomatch), scoring each
 * non-expired nexthop and recording the best in @res.  When @cont is
 * non-NULL the walk stops at the first route whose metric differs from
 * @metric, reporting it through *cont for a later continuation pass.
 */
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
726
/* Find the best route at this node: scan the same-metric routes from
 * the round-robin head @rr_head, wrap around from @leaf back to
 * @rr_head, and only if nothing matched fall through to the
 * different-metric continuation recorded in @cont.
 */
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
747
/* Round-robin route selection for node @fn, filling in @res (the null
 * entry when nothing usable is found).  Uses rcu_dereference(), so the
 * caller must be in an RCU read-side section.
 */
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}
804
805 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
806 {
807         return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
808                res->nh->fib_nh_gw_family;
809 }
810
811 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option from a Router Advertisement
 * (RFC 4191): validate it, then add, refresh, or (lifetime 0) delete
 * the corresponding RTF_ROUTEINFO route.  Returns 0 on success or
 * -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need a 2-unit option body */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* a full-length option carries the prefix verbatim; otherwise
	 * extract just prefix_len bits into a local buffer
	 */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* existing route: refresh its advertised preference */
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
885 #endif
886
887 /*
888  *      Misc support functions
889  */
890
/* called with rcu_lock held */
/* Pick the device a dst copied from @res should be bound to.  For
 * local/anycast routes the nexthop device may be replaced by its L3
 * master or by the loopback device (see comment below); all other
 * routes use the nexthop device directly.
 */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
913
/* Per RTN_* route type: the dst error delivered for reject-style routes;
 * 0 for types that deliver or forward packets normally.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
928
/* Map a fib6_type (RTN_*) to the dst error reported to callers */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
933
934 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
935 {
936         unsigned short flags = 0;
937
938         if (rt->dst_nocount)
939                 flags |= DST_NOCOUNT;
940         if (rt->dst_nopolicy)
941                 flags |= DST_NOPOLICY;
942         if (rt->dst_host)
943                 flags |= DST_HOST;
944
945         return flags;
946 }
947
948 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
949 {
950         rt->dst.error = ip6_rt_type_to_error(fib6_type);
951
952         switch (fib6_type) {
953         case RTN_BLACKHOLE:
954                 rt->dst.output = dst_discard_out;
955                 rt->dst.input = dst_discard;
956                 break;
957         case RTN_PROHIBIT:
958                 rt->dst.output = ip6_pkt_prohibit_out;
959                 rt->dst.input = ip6_pkt_prohibit;
960                 break;
961         case RTN_THROW:
962         case RTN_UNREACHABLE:
963         default:
964                 rt->dst.output = ip6_pkt_discard_out;
965                 rt->dst.input = ip6_pkt_discard;
966                 break;
967         }
968 }
969
/* Set up the dst input/output handlers (and lwtunnel state, if any) of
 * @rt based on the fib lookup result @res.  Reject routes are delegated
 * to ip6_rt_init_dst_reject().
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* input handler: local delivery, multicast input, or forwarding */
	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		/* lightweight tunnel state: take a reference and let the
		 * lwtunnel code redirect input/output as needed
		 */
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
997
/* Caller must already hold reference to @from */
/* Bind @rt to the fib entry it was derived from: clear RTF_EXPIRES,
 * publish the back-pointer for RCU readers, and share @from's metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
1005
/* Caller must already hold reference to f6i in result */
/* Initialize a freshly allocated rt6_info @rt from the lookup result
 * @res: dst handlers, destination (and source, with subtrees), flags,
 * gateway and the from-pointer back to the fib entry.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
1027
/* Walk back up the fib tree from @fn, descending into a parent's subtree
 * (looked up by @saddr) when one exists, until a node carrying route info
 * (RTN_RTINFO) is found.  Returns NULL when the table root is reached.
 * NOTE(review): uses rcu_dereference — assumes caller holds rcu_read_lock.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1045
/* Try to take a reference on *prt.  Returns true on success.  If the dst
 * refcount could not be taken (entry being destroyed), *prt is replaced
 * with the held null entry of @net (or NULL when @net is NULL) and false
 * is returned.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
1061
/* called with rcu_lock held */
/* Allocate a rt6_info initialized from @res.  Falls back to the held
 * null entry when the fib entry is going away or allocation fails, so
 * the caller always receives a usable dst.
 */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		/* drop the reference taken above */
		fib6_info_release(f6i);
		goto fallback;
	}

	/* the f6i reference is transferred to nrt via ip6_rt_copy_init */
	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1088
/* Core table lookup for ip6_route_lookup()/rt6_lookup(): find the best
 * matching fib entry for @fl6 in @table, preferring a cached exception
 * route when one exists.  Always returns a held dst (possibly the null
 * entry).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked to ignore the nexthop's output device */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		/* nothing usable at this node: backtrack up the tree */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		/* no cached entry: build a dst from the fib result */
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}
1141
/* Policy-routing aware lookup entry point: dispatch through the fib
 * rules framework using ip6_pol_route_lookup() per table.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1148
1149 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1150                             const struct in6_addr *saddr, int oif,
1151                             const struct sk_buff *skb, int strict)
1152 {
1153         struct flowi6 fl6 = {
1154                 .flowi6_oif = oif,
1155                 .daddr = *daddr,
1156         };
1157         struct dst_entry *dst;
1158         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1159
1160         if (saddr) {
1161                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1162                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1163         }
1164
1165         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1166         if (dst->error == 0)
1167                 return (struct rt6_info *) dst;
1168
1169         dst_release(dst);
1170
1171         return NULL;
1172 }
1173 EXPORT_SYMBOL(rt6_lookup);
1174
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */
1180
1181 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1182                         struct netlink_ext_ack *extack)
1183 {
1184         int err;
1185         struct fib6_table *table;
1186
1187         table = rt->fib6_table;
1188         spin_lock_bh(&table->tb6_lock);
1189         err = fib6_add(&table->tb6_root, rt, info, extack);
1190         spin_unlock_bh(&table->tb6_lock);
1191
1192         return err;
1193 }
1194
1195 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1196 {
1197         struct nl_info info = { .nl_net = net, };
1198
1199         return __ip6_ins_rt(rt, &info, NULL);
1200 }
1201
/* Clone the fib result @res into a host-route (/128) RTF_CACHE entry for
 * @daddr (and @saddr under subtrees).  Returns NULL if the fib entry is
 * going away or allocation fails.
 * NOTE(review): calls ip6_rt_get_dev_rcu() — presumably runs under
 * rcu_read_lock; confirm against callers.
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* narrow the clone to a host route for this destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		/* also pin the source to /128 when the entry has a src plen */
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1244
/* Allocate a per-cpu RTF_PCPU copy of the fib result @res.  Returns NULL
 * if the fib entry is going away or the dst allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	/* rcu protects the device lookup only; the dst alloc may sleep-free */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1267
/* It should be called with rcu_read_lock() acquired */
/* Return this cpu's cached per-cpu route for the fib entry in @res, with
 * a reference taken (ip6_hold_safe may substitute NULL if the dst is
 * being destroyed).  NULL when no per-cpu route is cached yet.
 */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
1281
/* Allocate and publish this cpu's per-cpu route for the fib entry in
 * @res.  Returns the held per-cpu route, or the held null entry on
 * allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	/* the slot must have been empty — only this cpu fills its own slot */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		/* the fib entry started dying while we published: detach the
		 * from-pointer ourselves and drop its reference
		 */
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
1307
1308 /* exception hash table implementation
1309  */
1310 static DEFINE_SPINLOCK(rt6_exception_lock);
1311
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	/* unlink under RCU; the entry memory is freed after a grace period */
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1340
1341 /* Remove oldest rt6_ex in bucket and free the memory
1342  * Caller must hold rt6_exception_lock
1343  */
1344 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1345 {
1346         struct rt6_exception *rt6_ex, *oldest = NULL;
1347
1348         if (!bucket)
1349                 return;
1350
1351         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1352                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1353                         oldest = rt6_ex;
1354         }
1355         rt6_remove_exception(bucket, oldest);
1356 }
1357
/* Hash a (dst, src) address pair to an exception-bucket index.  The src
 * address only contributes under CONFIG_IPV6_SUBTREES.  Seeded once with
 * random data so bucket placement is unpredictable.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1373
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket to the hashed slot — callers rely on this */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1406
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket to the hashed slot — callers rely on this */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1441
/* Effective MTU for the fib result @res: the entry's PMTU if set,
 * otherwise the nexthop device's IPv6 MTU, capped at IP6_MAX_MTU and
 * reduced by any lwtunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
1463
/* Insert the cached route @nrt into the exception table of the fib entry
 * in @res, allocating the bucket array on first use.  An existing entry
 * for the same (daddr, saddr) is replaced.  Returns 0 on success,
 * -EINVAL if the exception table was already flushed or @nrt's mtu is
 * not lower than the fib entry's, -ENOMEM on allocation failure.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() already ran; do not recreate the buckets */
	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* bound the chain length by evicting the least recent entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1544
/* Remove every exception route cached on @rt and mark the table flushed
 * so rt6_insert_exception() cannot repopulate it afterwards.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* each bucket must end up empty */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1571
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the unexpired cached route for (daddr, saddr), or NULL.  No
 * reference is taken; the caller must do so before leaving the RCU
 * section.
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6i_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired entries are ignored; GC reclaims them later */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}
1616
/* Remove the passed in cached rt from the hash table that contains it */
/* Returns 0 on success, -EINVAL if @rt is not a cached route or has no
 * backing fib entry, -ENOENT if no matching exception exists.
 * NOTE(review): rcu_dereference(rt->from) is used here — presumably the
 * caller provides RCU protection; confirm against call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1660
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * Silently does nothing when @rt is not a cached route, has no backing
 * fib entry, or no matching exception is found.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1697
1698 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1699                                          struct rt6_info *rt, int mtu)
1700 {
1701         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1702          * lowest MTU in the path: always allow updating the route PMTU to
1703          * reflect PMTU decreases.
1704          *
1705          * If the new MTU is higher, and the route PMTU is equal to the local
1706          * MTU, this means the old MTU is the lowest in the path, so allow
1707          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1708          * handle this.
1709          */
1710
1711         if (dst_mtu(&rt->dst) >= mtu)
1712                 return true;
1713
1714         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1715                 return true;
1716
1717         return false;
1718 }
1719
/* Propagate a device MTU change to all exception routes cached on @rt,
 * subject to rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock (see rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1748
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached exception route of @rt that is both a cache entry and
 * a gateway route (RTF_CACHE_GATEWAY) and whose gateway equals @gateway.
 * Takes rt6_exception_lock internally; safe-iteration is used because
 * entries are unlinked while walking.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless fast path: no exception table, nothing to clean. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1785
/* Decide the fate of one exception entry during garbage collection:
 * remove it if it aged out (non-RTF_EXPIRES and unused past the GC
 * timeout), if its explicit expiry passed, or if it is a gateway route
 * whose neighbour is no longer flagged as a router; otherwise count it
 * in gc_args->more so the GC timer keeps running.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* Gateway exceptions whose neighbour lost NTF_ROUTER are
		 * purged: the next hop is no longer acting as a router.
		 */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1829
/* Garbage-collect the exception table of @rt: examine each cached entry
 * via rt6_age_examine_exception().  Holds rcu_read_lock_bh() around the
 * spinlock because the examine step does a lockless neighbour lookup
 * (__ipv6_neigh_lookup_noref).
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless fast path: no exception table attached. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: examine may unlink the entry */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1860
/* Core FIB lookup in one table: find the best fib6_info for @fl6 and fill
 * @res.  Backtracks up the tree when a node yields only the null entry,
 * and retries once from the original node with RT6_LOOKUP_F_REACHABLE
 * cleared before giving up.  Always returns 0; the "not found" outcome is
 * res->f6i == net->ipv6.fib6_null_entry.
 * must be called with rcu lock held
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	/* Remember the starting node so the relaxed retry below can restart
	 * from the same point.
	 */
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
1891
/* Policy-route lookup: resolve @fl6 in @table and return a referenced
 * rt6_info.  Three outcomes after the FIB lookup and path selection:
 *   1) a matching entry exists in the exception (rt_cache) table;
 *   2) FLOWI_FLAG_KNOWN_NH without a gateway nexthop: allocate an
 *      uncached RTF_CACHE clone not owned by the fib6 tree;
 *   3) otherwise hand out (or create) the per-cpu route copy.
 * Never returns NULL: failures fall back to a held ip6_null_entry.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* When forwarding is globally off, prefer routers that look
	 * reachable (host behaviour).
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		/* ip6_null_entry is never freed, so taking the ref after
		 * dropping RCU is safe here.
		 */
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/*Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			/* Allocation failed: fall back to the null route. */
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BH must stay disabled while touching the per-cpu cache. */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1968
1969 static struct rt6_info *ip6_pol_route_input(struct net *net,
1970                                             struct fib6_table *table,
1971                                             struct flowi6 *fl6,
1972                                             const struct sk_buff *skb,
1973                                             int flags)
1974 {
1975         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1976 }
1977
1978 struct dst_entry *ip6_route_input_lookup(struct net *net,
1979                                          struct net_device *dev,
1980                                          struct flowi6 *fl6,
1981                                          const struct sk_buff *skb,
1982                                          int flags)
1983 {
1984         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1985                 flags |= RT6_LOOKUP_F_IFACE;
1986
1987         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1988 }
1989 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1990
/* Fill @keys (L3 fields only) for multipath hashing of @skb.  For ICMPv6
 * error messages the hash is computed over the embedded (inner) header so
 * errors follow the same path as the flow they refer to; in that case any
 * pre-dissected @flkeys are ignored because they describe the outer header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 error types embed the offending packet's header. */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* Hash on the inner header; cached outer-header keys are stale. */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
2038
/* Compute the multipath hash for a flow according to the per-netns hash
 * policy: 0 = L3 (addresses + flow label + protocol), 1 = L4 five-tuple.
 * The result is folded to 31 bits (top bit reserved).
 * if skb is set it will be used and fl6 can be NULL
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 policy: hash on addresses, flow label and protocol. */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 policy: hash on the five-tuple. */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	/* Shift right to keep the hash within 31 bits. */
	return mhash >> 1;
}
2095
/* Input-path route resolution: build a flowi6 from the packet headers,
 * optionally attach tunnel metadata and a multipath hash, then set the
 * skb's dst from ip6_route_input_lookup().
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Collect-metadata tunnels key the route on the tunnel id. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors hash on the embedded flow so they follow it. */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2125
2126 static struct rt6_info *ip6_pol_route_output(struct net *net,
2127                                              struct fib6_table *table,
2128                                              struct flowi6 *fl6,
2129                                              const struct sk_buff *skb,
2130                                              int flags)
2131 {
2132         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2133 }
2134
/* Output-path route lookup for locally generated traffic.  Multicast and
 * link-local destinations are first offered to an L3 master device;
 * otherwise lookup flags are derived from the socket binding and source
 * address before going through the policy-rule machinery.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	/* Give an l3mdev a chance to handle link-scope destinations. */
	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	/* Device-bound sockets, strict-scope destinations, and oif-only
	 * flows without a source all require interface matching.
	 */
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2164
/* Build a blackhole clone of @dst_orig: a loopback-backed dst whose input
 * and output handlers discard packets, but which keeps the original's
 * metrics, gateway and keys.  Consumes the reference on @dst_orig and
 * returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* Both directions silently drop traffic. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* The clone is not a per-cpu route even if the original was. */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2197
2198 /*
2199  *      Destination cache support functions
2200  */
2201
2202 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2203 {
2204         u32 rt_cookie = 0;
2205
2206         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2207                 return false;
2208
2209         if (fib6_check_expired(f6i))
2210                 return false;
2211
2212         return true;
2213 }
2214
2215 static struct dst_entry *rt6_check(struct rt6_info *rt,
2216                                    struct fib6_info *from,
2217                                    u32 cookie)
2218 {
2219         u32 rt_cookie = 0;
2220
2221         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2222             rt_cookie != cookie)
2223                 return NULL;
2224
2225         if (rt6_check_expired(rt))
2226                 return NULL;
2227
2228         return &rt->dst;
2229 }
2230
2231 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2232                                             struct fib6_info *from,
2233                                             u32 cookie)
2234 {
2235         if (!__rt6_check_expired(rt) &&
2236             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2237             fib6_check(from, cookie))
2238                 return &rt->dst;
2239         else
2240                 return NULL;
2241 }
2242
/* dst_ops->check handler: revalidate a cached dst.  Per-cpu routes and
 * uncached clones are checked against their origin fib6_info via
 * rt6_dst_from_check(); everything else via rt6_check().  Returns the dst
 * if still valid, NULL to force a fresh lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2270
/* dst_ops->negative_advice handler: called when an upper layer reports the
 * route is performing badly.  Expired cache exceptions are unlinked;
 * non-cache dsts are released outright.  Returns the (possibly NULL) dst
 * the caller should keep.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				/* Exception removal drops the tree's ref;
				 * the caller's ref stays with dst = NULL.
				 */
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2290
/* dst_ops->link_failure handler: report address unreachable to the sender
 * and invalidate the route that failed — cache exceptions are removed,
 * default routes have their node's serial number bumped so cached dsts
 * fail the next cookie check.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				/* -1 invalidates every cookie derived from
				 * this node's sernum.
				 */
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2316
/* Arm (or re-arm) the expiry timer of a cached route.  If the route did
 * not expire before, seed dst.expires from its origin fib6_info first so
 * dst_set_expires() only ever shortens the deadline.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2332
2333 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2334 {
2335         struct net *net = dev_net(rt->dst.dev);
2336
2337         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2338         rt->rt6i_flags |= RTF_MODIFIED;
2339         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2340 }
2341
2342 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2343 {
2344         return !(rt->rt6i_flags & RTF_CACHE) &&
2345                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2346 }
2347
/* Core PMTU update.  Addresses come from @iph, else from @sk, else are
 * unknown.  MTU increases are ignored (only decreases matter for PMTU
 * discovery).  Routes that cannot take a cached exception are updated in
 * place; otherwise an RTF_CACHE clone carrying the new MTU is inserted
 * into the origin's exception table.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Administratively locked MTU metric: never override. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* Clamp to the IPv6 minimum link MTU (1280). */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* Insert failure (e.g. a racing insert won) means we
			 * must drop our clone immediately.
			 */
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2400
2401 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2402                                struct sk_buff *skb, u32 mtu)
2403 {
2404         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2405 }
2406
/* Apply a PMTU update triggered by an ICMPv6 Packet Too Big carried in
 * @skb: rebuild the flow from the embedded IPv6 header, look up the
 * corresponding route and update its MTU.  @mtu is network byte order.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2427
/* Socket-scoped PMTU update: propagate the new MTU for @sk's flow, then,
 * if the socket's cached dst fails revalidation, refresh the datagram
 * socket's route under the socket lock.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* Unbound socket: fall back to the skb device's L3 master. */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if the cached dst is absent or still valid. */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2449
/* Store @dst on @sk, recording the flow's daddr (and, with subtrees, the
 * saddr) only when it matches the socket's own address — otherwise NULL
 * is passed so ip6_dst_store() does not pin the route to that address.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2466
/* Check whether the nexthop of @res can be the source of a redirect from
 * @gw on the flow's interface.  On a match via the exception table the
 * cached route is returned through @ret.
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	/* Dead nexthops, non-gateway routes, and wrong-interface nexthops
	 * cannot have originated the redirect.
	 */
	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
2496
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: __ip6_route_redirect
					 * casts the flowi6 * back to this type
					 */
	struct in6_addr gateway;	/* router the redirect came from */
};
2502
/* Lookup handler used when processing an ICMPv6 redirect: find the route
 * currently used for the destination and verify the redirect came from
 * its nexthop (per RFC 4861).  Returns a referenced rt6_info — a cached
 * exception when one matches the advertising gateway, otherwise a fresh
 * RCU clone of the selected fib entry (or the null/reject entry).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	/* fl6 actually points at an ip6rd_flowi carrying the gateway. */
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		res.nh = &rt->fib6_nh;

		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
			goto out;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Null entry at this node: walk back up and retry the scan. */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = &rt->fib6_nh;
out:
	if (ret) {
		ip6_hold_safe(net, &ret);
	} else {
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
};
2569
2570 static struct dst_entry *ip6_route_redirect(struct net *net,
2571                                             const struct flowi6 *fl6,
2572                                             const struct sk_buff *skb,
2573                                             const struct in6_addr *gateway)
2574 {
2575         int flags = RT6_LOOKUP_F_HAS_SADDR;
2576         struct ip6rd_flowi rdfl;
2577
2578         rdfl.fl6 = *fl6;
2579         rdfl.gateway = *gateway;
2580
2581         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2582                                 flags, __ip6_route_redirect);
2583 }
2584
2585 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2586                   kuid_t uid)
2587 {
2588         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2589         struct dst_entry *dst;
2590         struct flowi6 fl6 = {
2591                 .flowi6_iif = LOOPBACK_IFINDEX,
2592                 .flowi6_oif = oif,
2593                 .flowi6_mark = mark,
2594                 .daddr = iph->daddr,
2595                 .saddr = iph->saddr,
2596                 .flowlabel = ip6_flowinfo(iph),
2597                 .flowi6_uid = uid,
2598         };
2599
2600         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2601         rt6_do_redirect(dst, NULL, skb);
2602         dst_release(dst);
2603 }
2604 EXPORT_SYMBOL_GPL(ip6_redirect);
2605
/* Redirect handling variant used when the ICMPv6 redirect does not carry
 * a redirected-header option: the flow is reconstructed from the redirect
 * message itself rather than from an embedded packet header.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		/* daddr is the redirect destination from the rd_msg payload.
		 * NOTE(review): saddr is set to the *received* header's
		 * daddr (our own address the redirect was sent to) —
		 * presumably intentional for reverse-flow matching; confirm
		 * against callers before changing.
		 */
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2623
2624 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2625 {
2626         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2627                      sk->sk_uid);
2628 }
2629 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2630
2631 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2632 {
2633         struct net_device *dev = dst->dev;
2634         unsigned int mtu = dst_mtu(dst);
2635         struct net *net = dev_net(dev);
2636
2637         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2638
2639         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2640                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2641
2642         /*
2643          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2644          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2645          * IPV6_MAXPLEN is also valid and means: "any MSS,
2646          * rely only on pmtu discovery"
2647          */
2648         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2649                 mtu = IPV6_MAXPLEN;
2650         return mtu;
2651 }
2652
2653 static unsigned int ip6_mtu(const struct dst_entry *dst)
2654 {
2655         struct inet6_dev *idev;
2656         unsigned int mtu;
2657
2658         mtu = dst_metric_raw(dst, RTAX_MTU);
2659         if (mtu)
2660                 goto out;
2661
2662         mtu = IPV6_MIN_MTU;
2663
2664         rcu_read_lock();
2665         idev = __in6_dev_get(dst->dev);
2666         if (idev)
2667                 mtu = idev->cnf.mtu6;
2668         rcu_read_unlock();
2669
2670 out:
2671         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2672
2673         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2674 }
2675
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
		      const struct in6_addr *daddr,
		      const struct in6_addr *saddr)
{
	const struct fib6_nh *nh = res->nh;
	struct fib6_info *f6i = res->f6i;
	struct inet6_dev *idev;
	struct rt6_info *rt;
	u32 mtu = 0;

	/* 1. a locked route metric wins outright; the out label skips
	 * only the IP6_MAX_MTU clamp, lwtunnel headroom still applies.
	 */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* 2. per-destination (PMTU) exception entry, if one exists */
	rt = rt6_find_cached_rt(res, daddr, saddr);
	if (unlikely(rt)) {
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
	} else {
		/* 3. egress device MTU, floored at IPV6_MIN_MTU */
		struct net_device *dev = nh->fib_nh_dev;

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
2716
/* Allocate a standalone (uncached) dst for sending an ICMPv6 packet to
 * fl6->daddr via dev.  Returns the xfrm-resolved dst or an ERR_PTR; the
 * caller owns the returned reference.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* the idev reference taken above is transferred to the route */
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2755
/* dst_ops->gc hook: run fib6 garbage collection when the dst cache has
 * grown past ip6_rt_max_size or the minimum GC interval has elapsed.
 * Returns nonzero when the cache is still over the limit afterwards.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Skip the sweep entirely if we ran recently and are under limit. */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* NOTE(review): ip6_rt_gc_expire is read-modify-written here with no
	 * locking or atomics — confirm callers serialize, otherwise
	 * concurrent GC invocations race on it.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponentially decay the expiry aggressiveness between runs */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2780
/* Look up gw_addr in a specific FIB table (for nexthop validation).
 * Returns a held rt6_info, or NULL when the table does not exist or the
 * lookup only hit the null entry — callers then fall back to a full
 * rt6_lookup().
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	/* nexthop validation must not depend on link state */
	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2812
/* Validate a gateway for a route created with RTNH_F_ONLINK: the gateway
 * must not already be reachable via a non-default route that is
 * local/anycast/reject or that egresses a different device.
 * Returns 0 when the onlink nexthop is acceptable.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2845
/* Resolve/validate the egress device for a gateway nexthop.  If the
 * caller supplied a device it must match the route to the gateway;
 * otherwise the device and idev found by the lookup are adopted (with
 * references taken) into *_dev/*idev.  Returns 0 only when the gateway
 * is directly reachable (its route is not itself RTF_GATEWAY), else
 * -EHOSTUNREACH.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* prefer the table the new route is destined for, if given */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard a match that is itself via a gateway or
			 * that disagrees with the requested device
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt the resolved device/idev; references are taken here
		 * and ownership passes to the caller via *_dev/*idev
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2896
/* Validate the gateway of a new route and, when the caller did not name
 * an egress device, resolve one via ip6_route_check_nh() (which takes
 * references on the device/idev it stores through _dev/idev).
 * Returns 0 on success, -EINVAL/-EHOSTUNREACH otherwise.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* link-local gateways may legitimately be our own address on
	 * another interface, so only then is the device-match skipped
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2969
2970 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2971 {
2972         if ((flags & RTF_REJECT) ||
2973             (dev && (dev->flags & IFF_LOOPBACK) &&
2974              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2975              !(flags & RTF_LOCAL)))
2976                 return true;
2977
2978         return false;
2979 }
2980
/* Initialize a fib6_nh from a route config: resolve and validate the
 * egress device (promoting impossible loopback routes to rejects),
 * validate any gateway, and set up encap state.
 * Reference ownership: on success the nexthop keeps the device reference
 * taken here; the idev reference is always dropped before returning.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		/* reject routes skip gateway/encap setup entirely */
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev with the device resolved for the gw */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;
set_dev:
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	if (idev)
		in6_dev_put(idev);

	if (err) {
		/* undo any lwtunnel state from fib_nh_common_init and drop
		 * the device reference we would otherwise have kept
		 */
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3092
/* Release the resources held by a fib6_nh (counterpart of fib6_nh_init). */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	fib_nh_common_release(&fib6_nh->nh_common);
}
3097
/* Validate a fib6_config and build a fib6_info from it (not yet inserted
 * into any table).  Returns the new fib6_info with one reference held by
 * the caller, or an ERR_PTR on failure.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	err = -ENOBUFS;
	/* without NLM_F_CREATE, prefer an existing table but still create
	 * one (with a warning) for compatibility
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* RTF_GATEWAY is reflected by fib_nh_gw_family after fib6_nh_init */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		/* preferred source must be an address on the egress device */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3226
3227 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3228                   struct netlink_ext_ack *extack)
3229 {
3230         struct fib6_info *rt;
3231         int err;
3232
3233         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3234         if (IS_ERR(rt))
3235                 return PTR_ERR(rt);
3236
3237         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3238         fib6_info_release(rt);
3239
3240         return err;
3241 }
3242
/* Delete a single fib6_info from its table under the table lock.
 * Consumes the caller's reference on rt on all paths; deleting the
 * null entry is refused with -ENOENT.
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
3263
3264 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3265 {
3266         struct nl_info info = { .nl_net = net };
3267
3268         return __ip6_del_rt(rt, &info);
3269 }
3270
/* Delete a multipath route and, when fc_delete_all_nh is set, all of its
 * sibling nexthops in one pass.  A single RTM_DELROUTE notification
 * covering every hop is sent when it can be built (info->skip_notify
 * suppresses the per-hop ones).  Consumes the caller's reference on rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* fall back to per-hop notifications */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3322
3323 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3324 {
3325         int rc = -ESRCH;
3326
3327         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3328                 goto out;
3329
3330         if (cfg->fc_flags & RTF_GATEWAY &&
3331             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3332                 goto out;
3333
3334         rc = rt6_remove_exception_rt(rt);
3335 out:
3336         return rc;
3337 }
3338
/* Delete the route matching cfg.  RTF_CACHE requests delete a cached
 * exception entry instead of a fib entry.  Matching honours device,
 * gateway, metric and protocol filters; with RTF_GATEWAY only the one
 * hop is removed, otherwise siblings may be removed too.
 * Returns -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* for RTF_CACHE the exact fib node need not exist (exceptions hang
	 * off covering routes), hence the inverted exact_match argument
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				struct fib6_result res = {
					.f6i = rt,
				};
				int rc;

				rt_cache = rt6_find_cached_rt(&res,
							      &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					/* -ESRCH: filters didn't match;
					 * keep scanning other entries
					 */
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* take a reference before dropping RCU */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3411
/* Process an ICMPv6 Redirect (RFC 4861, section 8) received on skb->dev
 * for the route attached to @dst.  On success this updates the neighbour
 * cache entry for the redirect target and installs a route exception
 * (cached route) so later lookups for msg->dest use the new first hop.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Length of the ND options following the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast (router)
	 * address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	/* Forwarding nodes ignore redirects; hosts may opt out via the
	 * accept_redirects sysctl.
	 */
	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* Pick up the optional target link-layer address option */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* Find or create the neighbour entry for the new first hop */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* Clone the fib entry this dst was created from (rt->from) into
	 * a cached route pointing at the new gateway.
	 */
	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	res.nh = &res.f6i->fib6_nh;
	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* Notify interested parties (e.g. offload drivers) of the change */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
3535
3536 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find the RFC 4191 Route Information route for @prefix/@prefixlen that
 * was learned from router @gwaddr on @dev.  Returns the entry with a
 * reference held (via fib6_info_hold_safe()) or NULL if not found.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* L3 master devices keep these routes in their own table */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* final 'true' requests an exact-prefix match — see fib6_locate() */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* take a reference unless the entry is being released */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3573
3574 static struct fib6_info *rt6_add_route_info(struct net *net,
3575                                            const struct in6_addr *prefix, int prefixlen,
3576                                            const struct in6_addr *gwaddr,
3577                                            struct net_device *dev,
3578                                            unsigned int pref)
3579 {
3580         struct fib6_config cfg = {
3581                 .fc_metric      = IP6_RT_PRIO_USER,
3582                 .fc_ifindex     = dev->ifindex,
3583                 .fc_dst_len     = prefixlen,
3584                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3585                                   RTF_UP | RTF_PREF(pref),
3586                 .fc_protocol = RTPROT_RA,
3587                 .fc_type = RTN_UNICAST,
3588                 .fc_nlinfo.portid = 0,
3589                 .fc_nlinfo.nlh = NULL,
3590                 .fc_nlinfo.nl_net = net,
3591         };
3592
3593         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3594         cfg.fc_dst = *prefix;
3595         cfg.fc_gateway = *gwaddr;
3596
3597         /* We should treat it as a default route if prefix length is 0. */
3598         if (!prefixlen)
3599                 cfg.fc_flags |= RTF_DEFAULT;
3600
3601         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3602
3603         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3604 }
3605 #endif
3606
/* Find the RA-learned (RTF_ADDRCONF|RTF_DEFAULT) default route whose
 * gateway is @addr on @dev.  Returns the entry with a reference held,
 * or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* L3 master devices own their own table; otherwise use the
	 * default-router table.
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* rt is NULL here when the walk finished without a match */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3633
3634 struct fib6_info *rt6_add_dflt_router(struct net *net,
3635                                      const struct in6_addr *gwaddr,
3636                                      struct net_device *dev,
3637                                      unsigned int pref)
3638 {
3639         struct fib6_config cfg = {
3640                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3641                 .fc_metric      = IP6_RT_PRIO_USER,
3642                 .fc_ifindex     = dev->ifindex,
3643                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3644                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3645                 .fc_protocol = RTPROT_RA,
3646                 .fc_type = RTN_UNICAST,
3647                 .fc_nlinfo.portid = 0,
3648                 .fc_nlinfo.nlh = NULL,
3649                 .fc_nlinfo.nl_net = net,
3650         };
3651
3652         cfg.fc_gateway = *gwaddr;
3653
3654         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3655                 struct fib6_table *table;
3656
3657                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3658                 if (table)
3659                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3660         }
3661
3662         return rt6_get_dflt_router(net, gwaddr, dev);
3663 }
3664
/* Delete the RA-learned default routes from @table.
 *
 * The walk drops the RCU read lock around ip6_del_rt() and restarts
 * from the top after every deletion, since removing an entry
 * invalidates the iteration.  Interfaces with accept_ra == 2 keep
 * their default routes.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		/* hold_safe guards against an entry already being freed */
		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3688
3689 void rt6_purge_dflt_routers(struct net *net)
3690 {
3691         struct fib6_table *table;
3692         struct hlist_head *head;
3693         unsigned int h;
3694
3695         rcu_read_lock();
3696
3697         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3698                 head = &net->ipv6.fib_table_hash[h];
3699                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3700                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3701                                 __rt6_purge_dflt_routers(net, table);
3702                 }
3703         }
3704
3705         rcu_read_unlock();
3706 }
3707
3708 static void rtmsg_to_fib6_config(struct net *net,
3709                                  struct in6_rtmsg *rtmsg,
3710                                  struct fib6_config *cfg)
3711 {
3712         *cfg = (struct fib6_config){
3713                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3714                          : RT6_TABLE_MAIN,
3715                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3716                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3717                 .fc_expires = rtmsg->rtmsg_info,
3718                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3719                 .fc_src_len = rtmsg->rtmsg_src_len,
3720                 .fc_flags = rtmsg->rtmsg_flags,
3721                 .fc_type = rtmsg->rtmsg_type,
3722
3723                 .fc_nlinfo.nl_net = net,
3724
3725                 .fc_dst = rtmsg->rtmsg_dst,
3726                 .fc_src = rtmsg->rtmsg_src,
3727                 .fc_gateway = rtmsg->rtmsg_gateway,
3728         };
3729 }
3730
3731 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3732 {
3733         struct fib6_config cfg;
3734         struct in6_rtmsg rtmsg;
3735         int err;
3736
3737         switch (cmd) {
3738         case SIOCADDRT:         /* Add a route */
3739         case SIOCDELRT:         /* Delete a route */
3740                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3741                         return -EPERM;
3742                 err = copy_from_user(&rtmsg, arg,
3743                                      sizeof(struct in6_rtmsg));
3744                 if (err)
3745                         return -EFAULT;
3746
3747                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3748
3749                 rtnl_lock();
3750                 switch (cmd) {
3751                 case SIOCADDRT:
3752                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3753                         break;
3754                 case SIOCDELRT:
3755                         err = ip6_route_del(&cfg, NULL);
3756                         break;
3757                 default:
3758                         err = -EINVAL;
3759                 }
3760                 rtnl_unlock();
3761
3762                 return err;
3763         }
3764
3765         return -EINVAL;
3766 }
3767
3768 /*
3769  *      Drop the packet on the floor
3770  */
3771
3772 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3773 {
3774         struct dst_entry *dst = skb_dst(skb);
3775         struct net *net = dev_net(dst->dev);
3776         struct inet6_dev *idev;
3777         int type;
3778
3779         if (netif_is_l3_master(skb->dev) &&
3780             dst->dev == net->loopback_dev)
3781                 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3782         else
3783                 idev = ip6_dst_idev(dst);
3784
3785         switch (ipstats_mib_noroutes) {
3786         case IPSTATS_MIB_INNOROUTES:
3787                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3788                 if (type == IPV6_ADDR_ANY) {
3789                         IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3790                         break;
3791                 }
3792                 /* FALLTHROUGH */
3793         case IPSTATS_MIB_OUTNOROUTES:
3794                 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3795                 break;
3796         }
3797
3798         /* Start over by dropping the dst for l3mdev case */
3799         if (netif_is_l3_master(skb->dev))
3800                 skb_dst_drop(skb);
3801
3802         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3803         kfree_skb(skb);
3804         return 0;
3805 }
3806
/* Input-path drop handler: count as input no-route and reply with an
 * ICMPv6 destination-unreachable (no route) error.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3811
/* Output-path variant of ip6_pkt_discard(): point skb->dev at the dst
 * device (used by the stats/ICMP paths) before dropping.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3817
/* Input-path drop handler for prohibited routes: reply with an ICMPv6
 * administratively-prohibited error.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3822
/* Output-path variant of ip6_pkt_prohibit(): set skb->dev from the dst
 * before dropping with the administratively-prohibited code.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3828
3829 /*
3830  *      Allocate a dst for local (unicast / anycast) address.
3831  */
3832
3833 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3834                                      struct inet6_dev *idev,
3835                                      const struct in6_addr *addr,
3836                                      bool anycast, gfp_t gfp_flags)
3837 {
3838         struct fib6_config cfg = {
3839                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3840                 .fc_ifindex = idev->dev->ifindex,
3841                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3842                 .fc_dst = *addr,
3843                 .fc_dst_len = 128,
3844                 .fc_protocol = RTPROT_KERNEL,
3845                 .fc_nlinfo.nl_net = net,
3846                 .fc_ignore_dev_down = true,
3847         };
3848
3849         if (anycast) {
3850                 cfg.fc_type = RTN_ANYCAST;
3851                 cfg.fc_flags |= RTF_ANYCAST;
3852         } else {
3853                 cfg.fc_type = RTN_LOCAL;
3854                 cfg.fc_flags |= RTF_LOCAL;
3855         }
3856
3857         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3858 }
3859
/* Argument bundle for fib6_remove_prefsrc(): clear the preferred
 * source address @addr from routes on @dev (any device if @dev is
 * NULL) in namespace @net.
 */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};
3866
3867 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3868 {
3869         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3870         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3871         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3872
3873         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3874             rt != net->ipv6.fib6_null_entry &&
3875             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3876                 spin_lock_bh(&rt6_exception_lock);
3877                 /* remove prefsrc entry */
3878                 rt->fib6_prefsrc.plen = 0;
3879                 spin_unlock_bh(&rt6_exception_lock);
3880         }
3881         return 0;
3882 }
3883
3884 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3885 {
3886         struct net *net = dev_net(ifp->idev->dev);
3887         struct arg_dev_net_ip adni = {
3888                 .dev = ifp->idev->dev,
3889                 .net = net,
3890                 .addr = &ifp->addr,
3891         };
3892         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3893 }
3894
3895 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
3896
3897 /* Remove routers and update dst entries when gateway turn into host. */
/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* Returning -1 asks the fib6_clean_all() walker to delete this
	 * RA-learned default route whose gateway matches.
	 */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    rt->fib6_nh.fib_nh_gw_family &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3916
/* Remove RA default routes through @gateway (and matching cached
 * routes) after @gateway stopped acting as a router.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3921
/* Argument for fib6_ifup()/fib6_ifdown(): the device plus either the
 * nexthop flags to clear (up path) or the netdev event (down path).
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned char nh_flags;
		unsigned long event;
	};
};
3929
3930 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3931 {
3932         struct fib6_info *iter;
3933         struct fib6_node *fn;
3934
3935         fn = rcu_dereference_protected(rt->fib6_node,
3936                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3937         iter = rcu_dereference_protected(fn->leaf,
3938                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3939         while (iter) {
3940                 if (iter->fib6_metric == rt->fib6_metric &&
3941                     rt6_qualify_for_ecmp(iter))
3942                         return iter;
3943                 iter = rcu_dereference_protected(iter->fib6_next,
3944                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3945         }
3946
3947         return NULL;
3948 }
3949
3950 static bool rt6_is_dead(const struct fib6_info *rt)
3951 {
3952         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3953             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3954              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3955                 return true;
3956
3957         return false;
3958 }
3959
3960 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3961 {
3962         struct fib6_info *iter;
3963         int total = 0;
3964
3965         if (!rt6_is_dead(rt))
3966                 total += rt->fib6_nh.fib_nh_weight;
3967
3968         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3969                 if (!rt6_is_dead(iter))
3970                         total += iter->fib6_nh.fib_nh_weight;
3971         }
3972
3973         return total;
3974 }
3975
/* Assign the hash-threshold upper bound for one multipath nexthop.
 *
 * @weight accumulates the weights of the live nexthops seen so far, so
 * consecutive calls carve the 31-bit hash space into adjacent ranges
 * proportional to each nexthop's weight.  Dead nexthops get -1 so they
 * are never selected.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.fib_nh_weight;
		/* cumulative_weight / total, scaled into [0, 2^31), minus 1 */
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
}
3987
/* Recompute upper bounds for rt and all of its siblings, walking in
 * list order so the cumulative weight builds up correctly.
 */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
3998
/* Recompute the hash-threshold upper bounds for the ECMP group that
 * @rt belongs to, e.g. after a nexthop changed dead/linkdown state.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
4022
4023 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4024 {
4025         const struct arg_netdev_event *arg = p_arg;
4026         struct net *net = dev_net(arg->dev);
4027
4028         if (rt != net->ipv6.fib6_null_entry &&
4029             rt->fib6_nh.fib_nh_dev == arg->dev) {
4030                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4031                 fib6_update_sernum_upto_root(net, rt);
4032                 rt6_multipath_rebalance(rt);
4033         }
4034
4035         return 0;
4036 }
4037
/* Clear @nh_flags on all routes using @dev after it comes (back) up. */
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	/* When reviving a dead nexthop whose carrier is up, clear
	 * LINKDOWN as well.
	 */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
4052
4053 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4054                                    const struct net_device *dev)
4055 {
4056         struct fib6_info *iter;
4057
4058         if (rt->fib6_nh.fib_nh_dev == dev)
4059                 return true;
4060         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4061                 if (iter->fib6_nh.fib_nh_dev == dev)
4062                         return true;
4063
4064         return false;
4065 }
4066
/* Mark every route in rt's ECMP group for flushing; fib6_ifdown()
 * then returns -1 for each so the walker removes the whole group.
 */
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}
4075
/* Count the nexthops in rt's ECMP group that are already flagged dead
 * or that egress through @down_dev (and so are about to be).
 */
static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.fib_nh_dev == down_dev ||
	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.fib_nh_dev == down_dev ||
		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}
4092
/* Set @nh_flags on every nexthop in rt's ECMP group that uses @dev. */
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned char nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.fib_nh_dev == dev)
		rt->fib6_nh.fib_nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.fib_nh_dev == dev)
			iter->fib6_nh.fib_nh_flags |= nh_flags;
}
4105
/* called with write lock held for table with rt */
/* fib6_clean callback for NETDEV_DOWN / NETDEV_UNREGISTER /
 * NETDEV_CHANGE.  Returning 0 keeps the route; a negative value asks
 * the fib walker to remove it.  NOTE(review): -1 and -2 appear to be
 * handled differently by the walker (see fib6_clean_node()) — confirm
 * the exact semantics of -2 before relying on them.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* the null entry is never removed */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away: remove every route using it */
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* non-multipath route: remove if it uses the device */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* If the entire ECMP group is now dead, flush it;
			 * otherwise just mark the affected nexthops.
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: mark the nexthop linkdown, but never
		 * for local/anycast routes
		 */
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4149
/* Apply @event (NETDEV_DOWN/UNREGISTER/CHANGE) to all routes using
 * @dev via fib6_ifdown(), optionally suppressing netlink delete
 * notifications per the skip_notify_on_dev_down sysctl.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}
4165
/* Teardown when IPv6 stops on @dev: sync the FIB for @event, flush
 * uncached dst entries referencing the device, and purge its
 * neighbour-table entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4172
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU value */
};
4177
/* fib6_clean callback: propagate a device MTU change (arg->mtu) to
 * matching routes and to their cached-route exceptions.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* Adopt the new device MTU when the route's current MTU
		 * is at least the new value (device MTU shrank), or when
		 * the route was tracking the old device MTU (mtu6) and
		 * the device MTU grew.
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4212
/* Propagate a device MTU change to the IPv6 FIB. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4222
/* Netlink attribute validation policy for IPv6 RTM_* route requests. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4242
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * struct fib6_config.
 *
 * On success returns 0 with @cfg fully populated; on malformed input
 * returns a negative errno (a human-readable reason may be attached to
 * @extack for the sender).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	/* default error for all hand-rolled validation below */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	/* Start from the fixed rtmsg header fields; attributes below may
	 * override fc_table and add flags.
	 */
	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* Error route types all become reject routes */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* only rtm_dst_len bits are meaningful; the attribute must
		 * carry at least that many whole bytes
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		/* validate any per-nexthop encap types up front */
		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* infinite timeouts mean "no expiry": leave RTF_EXPIRES off */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4373
/* Per-nexthop bookkeeping used while building and installing a multipath
 * route from an RTA_MULTIPATH request.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* config the nexthop came from */
	struct list_head next;		/* link in the caller's rt6_nh_list */
};
4379
4380 static int ip6_route_info_append(struct net *net,
4381                                  struct list_head *rt6_nh_list,
4382                                  struct fib6_info *rt,
4383                                  struct fib6_config *r_cfg)
4384 {
4385         struct rt6_nh *nh;
4386         int err = -EEXIST;
4387
4388         list_for_each_entry(nh, rt6_nh_list, next) {
4389                 /* check if fib6_info already exists */
4390                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4391                         return err;
4392         }
4393
4394         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4395         if (!nh)
4396                 return -ENOMEM;
4397         nh->fib6_info = rt;
4398         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4399         list_add_tail(&nh->next, rt6_nh_list);
4400
4401         return 0;
4402 }
4403
4404 static void ip6_route_mpath_notify(struct fib6_info *rt,
4405                                    struct fib6_info *rt_last,
4406                                    struct nl_info *info,
4407                                    __u16 nlflags)
4408 {
4409         /* if this is an APPEND route, then rt points to the first route
4410          * inserted and rt_last points to last route inserted. Userspace
4411          * wants a consistent dump of the route which starts at the first
4412          * nexthop. Since sibling routes are always added at the end of
4413          * the list, find the first sibling of the last route appended
4414          */
4415         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4416                 rt = list_first_entry(&rt_last->fib6_siblings,
4417                                       struct fib6_info,
4418                                       fib6_siblings);
4419         }
4420
4421         if (rt)
4422                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4423 }
4424
/* Install a multipath route described by an RTA_MULTIPATH attribute.
 *
 * Each rtnexthop in cfg->fc_mp becomes its own fib6_info, collected on a
 * local list, and the entries are then inserted one by one.  If any
 * insertion fails, the nexthops installed so far are deleted again so the
 * operation is all-or-nothing, and a notification covering what was added
 * is sent so the subsequent delete notifications stay coherent.  On
 * success a single RTM_NEWROUTE notification covering all nexthops is
 * sent instead of one per insertion.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config starts as a copy of the route config */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				/* NOTE(review): this nested RTA_GATEWAY is not
				 * covered by rtm_ipv6_policy, and
				 * nla_get_in6_addr() assumes >= 16 bytes of
				 * payload -- confirm the attribute length is
				 * validated before this point.
				 */
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops carries "weight - 1" on the wire */
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any fib6_info not consumed by the insert loop and free
	 * the list nodes themselves
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4572
4573 static int ip6_route_multipath_del(struct fib6_config *cfg,
4574                                    struct netlink_ext_ack *extack)
4575 {
4576         struct fib6_config r_cfg;
4577         struct rtnexthop *rtnh;
4578         int remaining;
4579         int attrlen;
4580         int err = 1, last_err = 0;
4581
4582         remaining = cfg->fc_mp_len;
4583         rtnh = (struct rtnexthop *)cfg->fc_mp;
4584
4585         /* Parse a Multipath Entry */
4586         while (rtnh_ok(rtnh, remaining)) {
4587                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4588                 if (rtnh->rtnh_ifindex)
4589                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4590
4591                 attrlen = rtnh_attrlen(rtnh);
4592                 if (attrlen > 0) {
4593                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4594
4595                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4596                         if (nla) {
4597                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4598                                 r_cfg.fc_flags |= RTF_GATEWAY;
4599                         }
4600                 }
4601                 err = ip6_route_del(&r_cfg, extack);
4602                 if (err)
4603                         last_err = err;
4604
4605                 rtnh = rtnh_next(rtnh, &remaining);
4606         }
4607
4608         return last_err;
4609 }
4610
4611 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4612                               struct netlink_ext_ack *extack)
4613 {
4614         struct fib6_config cfg;
4615         int err;
4616
4617         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4618         if (err < 0)
4619                 return err;
4620
4621         if (cfg.fc_mp)
4622                 return ip6_route_multipath_del(&cfg, extack);
4623         else {
4624                 cfg.fc_delete_all_nh = 1;
4625                 return ip6_route_del(&cfg, extack);
4626         }
4627 }
4628
4629 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4630                               struct netlink_ext_ack *extack)
4631 {
4632         struct fib6_config cfg;
4633         int err;
4634
4635         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4636         if (err < 0)
4637                 return err;
4638
4639         if (cfg.fc_metric == 0)
4640                 cfg.fc_metric = IP6_RT_PRIO_USER;
4641
4642         if (cfg.fc_mp)
4643                 return ip6_route_multipath_add(&cfg, extack);
4644         else
4645                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4646 }
4647
/* Worst-case netlink message size for a notification built by
 * rt6_fill_node() for @rt, including an RTA_MULTIPATH nest sized for
 * each sibling nexthop.
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		/* per-sibling cost inside the RTA_MULTIPATH nest */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
	       + nexthop_len;
}
4677
/* Build one RTM_NEWROUTE/RTM_DELROUTE message for @rt into @skb.
 *
 * When @dst is non-NULL the message describes that cached rt6_info
 * (its keys, flags, metrics and expiry take precedence); otherwise the
 * FIB entry @rt itself is described.  @dest/@src, when given, override
 * the prefix addresses with full /128 values (used for route lookups).
 *
 * Returns 0 on success or -EMSGSIZE if @skb ran out of room, in which
 * case everything added here is cancelled.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* keys and flags come from the cached dst when one was passed in */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* the 8-bit header field can't hold large table ids; the full id
	 * always goes in RTA_TABLE
	 */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit destination overrides the prefix with a /128 */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute table;
		 * ip6mr_get_route() finishes (or cancels) the message itself
		 * when it returns 0
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* metrics come from the cached dst if present, else the FIB entry */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		/* the entry itself is the first nexthop in the nest */
		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		unsigned char nh_flags = 0;

		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4835
4836 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4837                                const struct net_device *dev)
4838 {
4839         if (f6i->fib6_nh.fib_nh_dev == dev)
4840                 return true;
4841
4842         if (f6i->fib6_nsiblings) {
4843                 struct fib6_info *sibling, *next_sibling;
4844
4845                 list_for_each_entry_safe(sibling, next_sibling,
4846                                          &f6i->fib6_siblings, fib6_siblings) {
4847                         if (sibling->fib6_nh.fib_nh_dev == dev)
4848                                 return true;
4849                 }
4850         }
4851
4852         return false;
4853 }
4854
4855 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4856 {
4857         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4858         struct fib_dump_filter *filter = &arg->filter;
4859         unsigned int flags = NLM_F_MULTI;
4860         struct net *net = arg->net;
4861
4862         if (rt == net->ipv6.fib6_null_entry)
4863                 return 0;
4864
4865         if ((filter->flags & RTM_F_PREFIX) &&
4866             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4867                 /* success since this is not a prefix route */
4868                 return 1;
4869         }
4870         if (filter->filter_set) {
4871                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4872                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4873                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4874                         return 1;
4875                 }
4876                 flags |= NLM_F_DUMP_FILTERED;
4877         }
4878
4879         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4880                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4881                              arg->cb->nlh->nlmsg_seq, flags);
4882 }
4883
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 *
 * Sockets that opted out of strict checking get the legacy lenient
 * parse; strict sockets additionally get header-field, flag and
 * attribute-whitelist validation.  Returns 0 with @tb filled on
 * success, negative errno (with an extack message) otherwise.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* legacy sockets: lenient parse, no further checks */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	/* prefix lengths must be absent or full /128 for a lookup */
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* an address attribute requires the matching /128 prefix length */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* only attributes meaningful to a route lookup are accepted */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4950
4951 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4952                               struct netlink_ext_ack *extack)
4953 {
4954         struct net *net = sock_net(in_skb->sk);
4955         struct nlattr *tb[RTA_MAX+1];
4956         int err, iif = 0, oif = 0;
4957         struct fib6_info *from;
4958         struct dst_entry *dst;
4959         struct rt6_info *rt;
4960         struct sk_buff *skb;
4961         struct rtmsg *rtm;
4962         struct flowi6 fl6 = {};
4963         bool fibmatch;
4964
4965         err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4966         if (err < 0)
4967                 goto errout;
4968
4969         err = -EINVAL;
4970         rtm = nlmsg_data(nlh);
4971         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4972         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4973
4974         if (tb[RTA_SRC]) {
4975                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4976                         goto errout;
4977
4978                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4979         }
4980
4981         if (tb[RTA_DST]) {
4982                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4983                         goto errout;
4984
4985                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4986         }
4987
4988         if (tb[RTA_IIF])
4989                 iif = nla_get_u32(tb[RTA_IIF]);
4990
4991         if (tb[RTA_OIF])
4992                 oif = nla_get_u32(tb[RTA_OIF]);
4993
4994         if (tb[RTA_MARK])
4995                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4996
4997         if (tb[RTA_UID])
4998                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4999                                            nla_get_u32(tb[RTA_UID]));
5000         else
5001                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5002
5003         if (tb[RTA_SPORT])
5004                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5005
5006         if (tb[RTA_DPORT])
5007                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5008
5009         if (tb[RTA_IP_PROTO]) {
5010                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5011                                                   &fl6.flowi6_proto, AF_INET6,
5012                                                   extack);
5013                 if (err)
5014                         goto errout;
5015         }
5016
5017         if (iif) {
5018                 struct net_device *dev;
5019                 int flags = 0;
5020
5021                 rcu_read_lock();
5022
5023                 dev = dev_get_by_index_rcu(net, iif);
5024                 if (!dev) {
5025                         rcu_read_unlock();
5026                         err = -ENODEV;
5027                         goto errout;
5028                 }
5029
5030                 fl6.flowi6_iif = iif;
5031
5032                 if (!ipv6_addr_any(&fl6.saddr))
5033                         flags |= RT6_LOOKUP_F_HAS_SADDR;
5034
5035                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5036
5037                 rcu_read_unlock();
5038         } else {
5039                 fl6.flowi6_oif = oif;
5040
5041                 dst = ip6_route_output(net, NULL, &fl6);
5042         }
5043
5044
5045         rt = container_of(dst, struct rt6_info, dst);
5046         if (rt->dst.error) {
5047                 err = rt->dst.error;
5048                 ip6_rt_put(rt);
5049                 goto errout;
5050         }
5051
5052         if (rt == net->ipv6.ip6_null_entry) {
5053                 err = rt->dst.error;
5054                 ip6_rt_put(rt);
5055                 goto errout;
5056         }
5057
5058         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5059         if (!skb) {
5060                 ip6_rt_put(rt);
5061                 err = -ENOBUFS;
5062                 goto errout;
5063         }
5064
5065         skb_dst_set(skb, &rt->dst);
5066
5067         rcu_read_lock();
5068         from = rcu_dereference(rt->from);
5069         if (from) {
5070                 if (fibmatch)
5071                         err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5072                                             iif, RTM_NEWROUTE,
5073                                             NETLINK_CB(in_skb).portid,
5074                                             nlh->nlmsg_seq, 0);
5075                 else
5076                         err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5077                                             &fl6.saddr, iif, RTM_NEWROUTE,
5078                                             NETLINK_CB(in_skb).portid,
5079                                             nlh->nlmsg_seq, 0);
5080         } else {
5081                 err = -ENETUNREACH;
5082         }
5083         rcu_read_unlock();
5084
5085         if (err < 0) {
5086                 kfree_skb(skb);
5087                 goto errout;
5088         }
5089
5090         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5091 errout:
5092         return err;
5093 }
5094
5095 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5096                      unsigned int nlm_flags)
5097 {
5098         struct sk_buff *skb;
5099         struct net *net = info->nl_net;
5100         u32 seq;
5101         int err;
5102
5103         err = -ENOBUFS;
5104         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5105
5106         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5107         if (!skb)
5108                 goto errout;
5109
5110         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5111                             event, info->portid, seq, nlm_flags);
5112         if (err < 0) {
5113                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5114                 WARN_ON(err == -EMSGSIZE);
5115                 kfree_skb(skb);
5116                 goto errout;
5117         }
5118         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5119                     info->nlh, gfp_any());
5120         return;
5121 errout:
5122         if (err < 0)
5123                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5124 }
5125
5126 static int ip6_route_dev_notify(struct notifier_block *this,
5127                                 unsigned long event, void *ptr)
5128 {
5129         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5130         struct net *net = dev_net(dev);
5131
5132         if (!(dev->flags & IFF_LOOPBACK))
5133                 return NOTIFY_OK;
5134
5135         if (event == NETDEV_REGISTER) {
5136                 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5137                 net->ipv6.ip6_null_entry->dst.dev = dev;
5138                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5139 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5140                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5141                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5142                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5143                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5144 #endif
5145          } else if (event == NETDEV_UNREGISTER &&
5146                     dev->reg_state != NETREG_UNREGISTERED) {
5147                 /* NETDEV_UNREGISTER could be fired for multiple times by
5148                  * netdev_wait_allrefs(). Make sure we only call this once.
5149                  */
5150                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5151 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5152                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5153                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5154 #endif
5155         }
5156
5157         return NOTIFY_OK;
5158 }
5159
5160 /*
5161  *      /proc
5162  */
5163
5164 #ifdef CONFIG_PROC_FS
5165 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5166 {
5167         struct net *net = (struct net *)seq->private;
5168         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5169                    net->ipv6.rt6_stats->fib_nodes,
5170                    net->ipv6.rt6_stats->fib_route_nodes,
5171                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5172                    net->ipv6.rt6_stats->fib_rt_entries,
5173                    net->ipv6.rt6_stats->fib_rt_cache,
5174                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5175                    net->ipv6.rt6_stats->fib_discarded_routes);
5176
5177         return 0;
5178 }
5179 #endif  /* CONFIG_PROC_FS */
5180
5181 #ifdef CONFIG_SYSCTL
5182
/* Handler for the write-only (mode 0200) net.ipv6.route.flush sysctl:
 * writing any value triggers a routing-table garbage-collection run.
 * Reads are rejected with -EINVAL.
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;
	if (!write)
		return -EINVAL;

	/* ->extra1 is patched to the owning netns by ipv6_route_sysctl_init() */
	net = (struct net *)ctl->extra1;
	/* NOTE(review): delay is sampled *before* proc_dointvec() stores the
	 * newly written value into flush_delay, so this flush uses the
	 * previous setting; the written value only affects the next flush.
	 * Appears intentional (long-standing behavior) — confirm.
	 */
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	/* delay <= 0 requests an immediate, non-expiring GC pass */
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
5202
/* Range bounds (extra1/extra2) for the boolean skip_notify_on_dev_down
 * sysctl entry below.
 */
static int zero;
static int one = 1;
5205
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net here; ipv6_route_sysctl_init() kmemdup()s
 * this array and rewrites each .data by positional index, so the entry
 * order below must stay in sync with the table[N] assignments there.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, exposed in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }	/* sentinel */
};
5288
/* Build the per-netns net.ipv6.route sysctl table from the template.
 * Returns the kmemdup()ed table, or NULL on allocation failure; the
 * caller is presumably responsible for registering and later freeing
 * it (registration code is not visible in this file).
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		/* Repoint each entry's .data from init_net to this netns.
		 * Indices are positional and must match the entry order in
		 * ipv6_route_table_template.
		 */
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* netns for ipv6_sysctl_rtcache_flush() */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;	/* hides only "flush" */
	}

	return table;
}
5318 #endif
5319
/* Per-netns initialization of IPv6 routing state: dst_ops, the special
 * sentinel route entries (null and, with CONFIG_IPV6_MULTIPLE_TABLES,
 * prohibit/blackhole), and the route GC sysctl defaults.  Returns 0 on
 * success or -ENOMEM; on failure, everything allocated so far is
 * unwound via the goto chain at the bottom (reverse allocation order).
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Per-netns copies of the template sentinel entries; their device
	 * references are attached later by ip6_route_dev_notify() and they
	 * are freed in ip6_route_net_exit().
	 */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the net.ipv6.route.* tunables declared in
	 * ipv6_route_table_template above.
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: reverse order of the allocations above. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5395
5396 static void __net_exit ip6_route_net_exit(struct net *net)
5397 {
5398         kfree(net->ipv6.fib6_null_entry);
5399         kfree(net->ipv6.ip6_null_entry);
5400 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5401         kfree(net->ipv6.ip6_prohibit_entry);
5402         kfree(net->ipv6.ip6_blk_hole_entry);
5403 #endif
5404         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5405 }
5406
5407 static int __net_init ip6_route_net_init_late(struct net *net)
5408 {
5409 #ifdef CONFIG_PROC_FS
5410         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5411                         sizeof(struct ipv6_route_iter));
5412         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5413                         rt6_stats_seq_show, NULL);
5414 #endif
5415         return 0;
5416 }
5417
5418 static void __net_exit ip6_route_net_exit_late(struct net *net)
5419 {
5420 #ifdef CONFIG_PROC_FS
5421         remove_proc_entry("ipv6_route", net->proc_net);
5422         remove_proc_entry("rt6_stats", net->proc_net);
5423 #endif
5424 }
5425
/* Core per-netns route state (dst_ops, sentinel entries, sysctl defaults). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5430
5431 static int __net_init ipv6_inetpeer_init(struct net *net)
5432 {
5433         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5434
5435         if (!bp)
5436                 return -ENOMEM;
5437         inet_peer_base_init(bp);
5438         net->ipv6.peers = bp;
5439         return 0;
5440 }
5441
5442 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5443 {
5444         struct inet_peer_base *bp = net->ipv6.peers;
5445
5446         net->ipv6.peers = NULL;
5447         inetpeer_invalidate_tree(bp);
5448         kfree(bp);
5449 }
5450
/* Per-netns inetpeer storage lifecycle. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5455
/* Late per-netns setup: the /proc/net route and stats entries. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5460
/* Runs after addrconf's notifier (lower priority value runs later),
 * so idev state is set up before we attach references to it.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5465
/* Bind init_net's sentinel route entries to the loopback device.
 * NOTE(review): in6_dev_get() results are used unchecked here; at this
 * early boot stage loopback's idev is presumably always present —
 * confirm.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5481
/* Boot-time initialization of the IPv6 routing subsystem: the rt6_info
 * slab cache, blackhole dst counters, pernet subsystems, FIB core,
 * xfrm and policy rules, the RTM_{NEW,DEL,GET}ROUTE netlink handlers,
 * the loopback device notifier and the per-CPU uncached-route lists.
 * On any failure, previously completed steps are unwound in reverse
 * order via the goto chain at the bottom.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts are carved from the same slab as regular ones */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* All three handler registrations share one error label because
	 * rtnl_unregister_all(PF_INET6) removes whatever was registered.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: reverse order of the setup steps above. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5573
/* Module exit: tear down everything set up by ip6_route_init(), in the
 * reverse order of initialization.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}