Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/rtnh.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Outcome of a nexthop neighbour reachability check (rt6_check_neigh()).
 * Negative values are failures; RT6_NUD_FAIL_DO_RR additionally asks the
 * caller to round-robin to the next candidate route.
 */
80 enum rt6_nud_state {
81         RT6_NUD_FAIL_HARD = -3,
82         RT6_NUD_FAIL_PROBE = -2,
83         RT6_NUD_FAIL_DO_RR = -1,
84         RT6_NUD_SUCCEED = 1
85 };
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106                            int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109                          struct fib6_info *rt, struct dst_entry *dst,
110                          struct in6_addr *dest, struct in6_addr *src,
111                          int iif, int type, u32 portid, u32 seq,
112                          unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114                                            struct in6_addr *daddr,
115                                            struct in6_addr *saddr);
116
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev,
122                                            unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124                                            const struct in6_addr *prefix, int prefixlen,
125                                            const struct in6_addr *gwaddr,
126                                            struct net_device *dev);
127 #endif
128
/* Per-cpu list of rt6_info entries that live outside the FIB tree
 * (see rt6_uncached_list_add()); walked on device teardown so their
 * device references can be dropped.
 */
129 struct uncached_list {
130         spinlock_t              lock;
131         struct list_head        head;
132 };
133
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
/* Link @rt onto this cpu's uncached list and remember which list it went
 * on, so rt6_uncached_list_del() can find it even from another cpu.
 * BH-safe lock: the list is also touched from softirq context.
 */
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139
140         rt->rt6i_uncached_list = ul;
141
142         spin_lock_bh(&ul->lock);
143         list_add_tail(&rt->rt6i_uncached, &ul->head);
144         spin_unlock_bh(&ul->lock);
145 }
146
/* Unlink @rt from the uncached list it was added to (no-op if it was
 * never added) and drop the per-netns uncached-route counter.
 */
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149         if (!list_empty(&rt->rt6i_uncached)) {
150                 struct uncached_list *ul = rt->rt6i_uncached_list;
                 /* dst.dev is still valid here: flush_dev retargets it to
                  * loopback rather than leaving a dangling pointer.
                  */
151                 struct net *net = dev_net(rt->dst.dev);
152
153                 spin_lock_bh(&ul->lock);
154                 list_del(&rt->rt6i_uncached);
155                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156                 spin_unlock_bh(&ul->lock);
157         }
158 }
159
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162         struct net_device *loopback_dev = net->loopback_dev;
163         int cpu;
164
165         if (dev == loopback_dev)
166                 return;
167
168         for_each_possible_cpu(cpu) {
169                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170                 struct rt6_info *rt;
171
172                 spin_lock_bh(&ul->lock);
173                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174                         struct inet6_dev *rt_idev = rt->rt6i_idev;
175                         struct net_device *rt_dev = rt->dst.dev;
176
177                         if (rt_idev->dev == dev) {
178                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
179                                 in6_dev_put(rt_idev);
180                         }
181
182                         if (rt_dev == dev) {
183                                 rt->dst.dev = loopback_dev;
184                                 dev_hold(rt->dst.dev);
185                                 dev_put(rt_dev);
186                         }
187                 }
188                 spin_unlock_bh(&ul->lock);
189         }
190 }
191
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193                                              struct sk_buff *skb,
194                                              const void *daddr)
195 {
196         if (!ipv6_addr_any(p))
197                 return (const void *) p;
198         else if (skb)
199                 return &ipv6_hdr(skb)->daddr;
200         return daddr;
201 }
202
/* Find (or create) the neighbour entry for the nexthop of a route:
 * @gw when set, otherwise the skb's destination, otherwise @daddr.
 * Returns NULL on allocation failure instead of an ERR_PTR.
 */
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204                                    struct net_device *dev,
205                                    struct sk_buff *skb,
206                                    const void *daddr)
207 {
208         struct neighbour *n;
209
210         daddr = choose_neigh_daddr(gw, skb, daddr);
211         n = __ipv6_neigh_lookup(dev, daddr);
212         if (n)
213                 return n;
214
215         n = neigh_create(&nd_tbl, daddr, dev);
216         return IS_ERR(n) ? NULL : n;
217 }
218
/* dst_ops->neigh_lookup hook: delegate to ip6_neigh_lookup() using the
 * gateway stored in the rt6_info that embeds @dst.
 */
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220                                               struct sk_buff *skb,
221                                               const void *daddr)
222 {
223         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224
225         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227
/* dst_ops->confirm_neigh hook: mark the nexthop neighbour as confirmed
 * (fresh reachability evidence).  Skipped for devices that do not do
 * neighbour resolution and for multicast destinations.
 */
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230         struct net_device *dev = dst->dev;
231         struct rt6_info *rt = (struct rt6_info *)dst;
232
233         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234         if (!daddr)
235                 return;
236         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237                 return;
238         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239                 return;
240         __ipv6_confirm_neigh(dev, daddr);
241 }
242
/* dst_ops used for regular IPv6 routes; copied into each netns'
 * ipv6.ip6_dst_ops at init time.
 */
243 static struct dst_ops ip6_dst_ops_template = {
244         .family                 =       AF_INET6,
245         .gc                     =       ip6_dst_gc,
246         .gc_thresh              =       1024,
247         .check                  =       ip6_dst_check,
248         .default_advmss         =       ip6_default_advmss,
249         .mtu                    =       ip6_mtu,
250         .cow_metrics            =       dst_cow_metrics_generic,
251         .destroy                =       ip6_dst_destroy,
252         .ifdown                 =       ip6_dst_ifdown,
253         .negative_advice        =       ip6_negative_advice,
254         .link_failure           =       ip6_link_failure,
255         .update_pmtu            =       ip6_rt_update_pmtu,
256         .redirect               =       rt6_do_redirect,
257         .local_out              =       __ip6_local_out,
258         .neigh_lookup           =       ip6_dst_neigh_lookup,
259         .confirm_neigh          =       ip6_confirm_neigh,
260 };
261
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265
266         return mtu ? : dst->dev->mtu;
267 }
268
/* Blackhole dsts deliberately ignore PMTU updates. */
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270                                          struct sk_buff *skb, u32 mtu)
271 {
272 }
273
/* Blackhole dsts deliberately ignore ICMPv6 redirects. */
274 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
275                                       struct sk_buff *skb)
276 {
277 }
278
/* dst_ops for blackhole copies of routes (e.g. made by xfrm/ip6_blackhole
 * dst conversion): no gc, and pmtu/redirect events are swallowed.
 */
279 static struct dst_ops ip6_dst_blackhole_ops = {
280         .family                 =       AF_INET6,
281         .destroy                =       ip6_dst_destroy,
282         .check                  =       ip6_dst_check,
283         .mtu                    =       ip6_blackhole_mtu,
284         .default_advmss         =       ip6_default_advmss,
285         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
286         .redirect               =       ip6_rt_blackhole_redirect,
287         .cow_metrics            =       dst_cow_metrics_generic,
288         .neigh_lookup           =       ip6_dst_neigh_lookup,
289 };
290
/* Metrics for the template routes below; hoplimit 0 = use the default. */
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292         [RTAX_HOPLIMIT - 1] = 0,
293 };
294
/* Template for the per-netns fib6_null_entry: the catch-all "no route"
 * FIB entry returned when a lookup matches nothing.
 */
295 static const struct fib6_info fib6_null_entry_template = {
296         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
297         .fib6_protocol  = RTPROT_KERNEL,
298         .fib6_metric    = ~(u32)0,
299         .fib6_ref       = REFCOUNT_INIT(1),
300         .fib6_type      = RTN_UNREACHABLE,
301         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
302 };
303
/* Template for the per-netns ip6_null_entry dst: packets routed to it
 * are discarded with ENETUNREACH (no route to host).
 */
304 static const struct rt6_info ip6_null_entry_template = {
305         .dst = {
306                 .__refcnt       = ATOMIC_INIT(1),
307                 .__use          = 1,
308                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
309                 .error          = -ENETUNREACH,
310                 .input          = ip6_pkt_discard,
311                 .output         = ip6_pkt_discard_out,
312         },
313         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
314 };
315
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317
/* "prohibit" policy-routing action: discard with EACCES / adm. prohibited. */
318 static const struct rt6_info ip6_prohibit_entry_template = {
319         .dst = {
320                 .__refcnt       = ATOMIC_INIT(1),
321                 .__use          = 1,
322                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
323                 .error          = -EACCES,
324                 .input          = ip6_pkt_prohibit,
325                 .output         = ip6_pkt_prohibit_out,
326         },
327         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
328 };
329
/* "blackhole" policy-routing action: silently drop, no ICMP error. */
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340 };
341
342 #endif
343
/* Zero the rt6_info-specific tail of a freshly allocated entry.  The
 * embedded dst_entry at the front was already set up by dst_alloc(), so
 * the memset starts right after it (dst + 1).
 */
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346         struct dst_entry *dst = &rt->dst;
347
348         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349         INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351
352 /* allocate dst with ip6_dst_ops */
/* Allocate and initialize an rt6_info for @net/@dev with an initial
 * refcount of 1.  Returns NULL on allocation failure.
 */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354                                int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         1, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt) {
360                 rt6_info_init(rt);
361                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362         }
363
364         return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367
/* dst_ops->destroy hook: release everything an rt6_info holds - metrics,
 * uncached-list membership, the inet6_dev reference, and the fib6_info
 * it was created from.
 */
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370         struct rt6_info *rt = (struct rt6_info *)dst;
371         struct fib6_info *from;
372         struct inet6_dev *idev;
373
374         ip_dst_metrics_put(dst);
375         rt6_uncached_list_del(rt);
376
377         idev = rt->rt6i_idev;
378         if (idev) {
379                 rt->rt6i_idev = NULL;
380                 in6_dev_put(idev);
381         }
382
         /* atomically detach from the fib6_info; xchg guards against a
          * concurrent clearer so the reference is dropped exactly once
          */
383         from = xchg((__force struct fib6_info **)&rt->from, NULL);
384         fib6_info_release(from);
385 }
386
/* dst_ops->ifdown hook: @dev is going down, so move the route's
 * inet6_dev reference over to the netns loopback device.
 */
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
/* Like __rt6_check_expired(), but a route that carries no expiry of its
 * own is also considered expired when the fib6_info it was cloned from
 * is gone stale or has itself expired.
 * NOTE(review): uses rcu_dereference(), so the caller presumably holds
 * rcu_read_lock() - confirm at call sites.
 */
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
/* Multipath selection: given the first route of an ECMP group in
 * res->f6i, pick the sibling whose hash bucket covers the flow hash and
 * store the choice back into @res.  No-op for single-path routes or when
 * the oif already forced a specific match.
 */
428 void fib6_select_path(const struct net *net, struct fib6_result *res,
429                       struct flowi6 *fl6, int oif, bool have_oif_match,
430                       const struct sk_buff *skb, int strict)
431 {
432         struct fib6_info *sibling, *next_sibling;
433         struct fib6_info *match = res->f6i;
434
435         if (!match->fib6_nsiblings || have_oif_match)
436                 goto out;
437
438         /* We might have already computed the hash for ICMPv6 errors. In such
439          * case it will always be non-zero. Otherwise now is the time to do it.
440          */
441         if (!fl6->mp_hash)
442                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
443
         /* first route's upper bound covers the hash -> keep it */
444         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
445                 goto out;
446
447         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
448                                  fib6_siblings) {
449                 const struct fib6_nh *nh = &sibling->fib6_nh;
450                 int nh_upper_bound;
451
452                 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
453                 if (fl6->mp_hash > nh_upper_bound)
454                         continue;
                 /* unusable sibling: fall back to the first route */
455                 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
456                         break;
457                 match = sibling;
458                 break;
459         }
460
461 out:
462         res->f6i = match;
463         res->nh = &match->fib6_nh;
464 }
465
466 /*
467  *      Route lookup. rcu_read_lock() should be held.
468  */
469
/* Does nexthop @nh satisfy the lookup constraints?  With an @oif the
 * nexthop device must match it; without one, @saddr must be an address
 * on the nexthop device.  Dead nexthops never match.
 */
470 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
471                                const struct in6_addr *saddr, int oif, int flags)
472 {
473         const struct net_device *dev;
474
475         if (nh->fib_nh_flags & RTNH_F_DEAD)
476                 return false;
477
478         dev = nh->fib_nh_dev;
479         if (oif) {
480                 if (dev->ifindex == oif)
481                         return true;
482         } else {
483                 if (ipv6_chk_addr(net, saddr, dev,
484                                   flags & RT6_LOOKUP_F_IFACE))
485                         return true;
486         }
487
488         return false;
489 }
490
/* Walk the routes sharing a FIB node (starting at res->f6i) and select
 * the first one whose nexthop matches @oif/@saddr, filling in @res.
 * Falls back to fib6_null_entry when a strict interface match is
 * required but not found, or when the candidate nexthop is dead.
 * Called under rcu_read_lock() (fib6_next is rcu-dereferenced).
 */
491 static void rt6_device_match(struct net *net, struct fib6_result *res,
492                              const struct in6_addr *saddr, int oif, int flags)
493 {
494         struct fib6_info *f6i = res->f6i;
495         struct fib6_info *spf6i;
496         struct fib6_nh *nh;
497
         /* no constraints at all: any live nexthop will do */
498         if (!oif && ipv6_addr_any(saddr)) {
499                 nh = &f6i->fib6_nh;
500                 if (!(nh->fib_nh_flags & RTNH_F_DEAD))
501                         goto out;
502         }
503
504         for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
505                 nh = &spf6i->fib6_nh;
506                 if (__rt6_device_match(net, nh, saddr, oif, flags)) {
507                         res->f6i = spf6i;
508                         goto out;
509                 }
510         }
511
512         if (oif && flags & RT6_LOOKUP_F_IFACE) {
513                 res->f6i = net->ipv6.fib6_null_entry;
514                 nh = &res->f6i->fib6_nh;
515                 goto out;
516         }
517
518         nh = &f6i->fib6_nh;
519         if (nh->fib_nh_flags & RTNH_F_DEAD) {
520                 res->f6i = net->ipv6.fib6_null_entry;
521                 nh = &res->f6i->fib6_nh;
522         }
523 out:
524         res->nh = nh;
525         res->fib6_type = res->f6i->fib6_type;
526         res->fib6_flags = res->f6i->fib6_flags;
527 }
528
529 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a router reachability probe: the gateway to
 * solicit and the (held) device to send on.
 */
530 struct __rt6_probe_work {
531         struct work_struct work;
532         struct in6_addr target;
533         struct net_device *dev;
534 };
535
/* Workqueue body for rt6_probe(): send a neighbour solicitation to the
 * target's solicited-node multicast address, then drop the device
 * reference taken by the scheduler and free the work item.
 */
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538         struct in6_addr mcaddr;
539         struct __rt6_probe_work *work =
540                 container_of(w, struct __rt6_probe_work, work);
541
542         addrconf_addr_solict_mult(&work->target, &mcaddr);
543         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
544         dev_put(work->dev);
545         kfree(work);
546 }
547
548 static void rt6_probe(struct fib6_nh *fib6_nh)
549 {
550         struct __rt6_probe_work *work = NULL;
551         const struct in6_addr *nh_gw;
552         struct neighbour *neigh;
553         struct net_device *dev;
554         struct inet6_dev *idev;
555
556         /*
557          * Okay, this does not seem to be appropriate
558          * for now, however, we need to check if it
559          * is really so; aka Router Reachability Probing.
560          *
561          * Router Reachability Probe MUST be rate-limited
562          * to no more than one per minute.
563          */
564         if (fib6_nh->fib_nh_gw_family)
565                 return;
566
567         nh_gw = &fib6_nh->fib_nh_gw6;
568         dev = fib6_nh->fib_nh_dev;
569         rcu_read_lock_bh();
570         idev = __in6_dev_get(dev);
571         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
572         if (neigh) {
573                 if (neigh->nud_state & NUD_VALID)
574                         goto out;
575
576                 write_lock(&neigh->lock);
577                 if (!(neigh->nud_state & NUD_VALID) &&
578                     time_after(jiffies,
579                                neigh->updated + idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else if (time_after(jiffies, fib6_nh->last_probe +
586                                        idev->cnf.rtr_probe_interval)) {
587                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588         }
589
590         if (work) {
591                 fib6_nh->last_probe = jiffies;
592                 INIT_WORK(&work->work, rt6_probe_deferred);
593                 work->target = *nh_gw;
594                 dev_hold(dev);
595                 work->dev = dev;
596                 schedule_work(&work->work);
597         }
598
599 out:
600         rcu_read_unlock_bh();
601 }
602 #else
/* No-op stub when router-preference support (and thus probing) is off. */
603 static inline void rt6_probe(struct fib6_nh *fib6_nh)
604 {
605 }
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
/* Classify the reachability of @fib6_nh's gateway neighbour for route
 * scoring.  With CONFIG_IPV6_ROUTER_PREF, anything short of NUD_FAILED
 * still counts as usable (it will be probed); without it, a missing
 * neighbour entry requests round-robin instead.
 */
611 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
612 {
613         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
614         struct neighbour *neigh;
615
616         rcu_read_lock_bh();
617         neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
618                                           &fib6_nh->fib_nh_gw6);
619         if (neigh) {
620                 read_lock(&neigh->lock);
621                 if (neigh->nud_state & NUD_VALID)
622                         ret = RT6_NUD_SUCCEED;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624                 else if (!(neigh->nud_state & NUD_FAILED))
625                         ret = RT6_NUD_SUCCEED;
626                 else
627                         ret = RT6_NUD_FAIL_PROBE;
628 #endif
629                 read_unlock(&neigh->lock);
630         } else {
631                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
632                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
633         }
634         rcu_read_unlock_bh();
635
636         return ret;
637 }
638
/* Score a nexthop for default-router selection: +2 for matching (or no)
 * oif, plus the RA router-preference bits shifted above that.  Returns a
 * negative rt6_nud_state when the route must not be used (interface
 * mismatch under strict lookup, or an unreachable gateway when
 * RT6_LOOKUP_F_REACHABLE is requested).
 */
639 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
640                            int strict)
641 {
642         int m = 0;
643
644         if (!oif || nh->fib_nh_dev->ifindex == oif)
645                 m = 2;
646
647         if (!m && (strict & RT6_LOOKUP_F_IFACE))
648                 return RT6_NUD_FAIL_HARD;
649 #ifdef CONFIG_IPV6_ROUTER_PREF
650         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
651 #endif
652         if ((strict & RT6_LOOKUP_F_REACHABLE) &&
653             !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
654                 int n = rt6_check_neigh(nh);
655                 if (n < 0)
656                         return n;
657         }
658         return m;
659 }
660
/* Consider nexthop @nh as the current best match: score it, optionally
 * kick off a reachability probe, and update *mpri/*do_rr.  Returns true
 * when @nh beat the previous best score.
 */
661 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
662                        int oif, int strict, int *mpri, bool *do_rr)
663 {
664         bool match_do_rr = false;
665         bool rc = false;
666         int m;
667
668         if (nh->fib_nh_flags & RTNH_F_DEAD)
669                 goto out;
670
671         if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
672             nh->fib_nh_flags & RTNH_F_LINKDOWN &&
673             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
674                 goto out;
675
676         m = rt6_score_route(nh, fib6_flags, oif, strict);
677         if (m == RT6_NUD_FAIL_DO_RR) {
678                 match_do_rr = true;
679                 m = 0; /* lowest valid score */
680         } else if (m == RT6_NUD_FAIL_HARD) {
681                 goto out;
682         }
683
684         if (strict & RT6_LOOKUP_F_REACHABLE)
685                 rt6_probe(nh);
686
687         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
688         if (m > *mpri) {
689                 *do_rr = match_do_rr;
690                 *mpri = m;
691                 rc = true;
692         }
693 out:
694         return rc;
695 }
696
/* Scan routes from @f6i_start (stopping at @nomatch) looking for the
 * best-scoring, non-expired nexthop at the given @metric; record it in
 * @res.  When @cont is non-NULL and a route with a different metric is
 * hit, stash it there and stop so the caller can resume the scan later.
 */
697 static void __find_rr_leaf(struct fib6_info *f6i_start,
698                            struct fib6_info *nomatch, u32 metric,
699                            struct fib6_result *res, struct fib6_info **cont,
700                            int oif, int strict, bool *do_rr, int *mpri)
701 {
702         struct fib6_info *f6i;
703
704         for (f6i = f6i_start;
705              f6i && f6i != nomatch;
706              f6i = rcu_dereference(f6i->fib6_next)) {
707                 struct fib6_nh *nh;
708
709                 if (cont && f6i->fib6_metric != metric) {
710                         *cont = f6i;
711                         return;
712                 }
713
714                 if (fib6_check_expired(f6i))
715                         continue;
716
717                 nh = &f6i->fib6_nh;
718                 if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
719                         res->f6i = f6i;
720                         res->nh = nh;
721                         res->fib6_flags = f6i->fib6_flags;
722                         res->fib6_type = f6i->fib6_type;
723                 }
724         }
725 }
726
/* Round-robin search over a FIB node's route list: first from the
 * round-robin head to the end, then from the leaf back up to the head
 * (same metric only).  If nothing matched at that metric, retry from
 * the first route with a different metric saved in @cont.
 */
727 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
728                          struct fib6_info *rr_head, int oif, int strict,
729                          bool *do_rr, struct fib6_result *res)
730 {
731         u32 metric = rr_head->fib6_metric;
732         struct fib6_info *cont = NULL;
733         int mpri = -1;
734
735         __find_rr_leaf(rr_head, NULL, metric, res, &cont,
736                        oif, strict, do_rr, &mpri);
737
738         __find_rr_leaf(leaf, rr_head, metric, res, &cont,
739                        oif, strict, do_rr, &mpri);
740
741         if (res->f6i || !cont)
742                 return;
743
744         __find_rr_leaf(cont, NULL, metric, res, NULL,
745                        oif, strict, do_rr, &mpri);
746 }
747
/* Default Router Selection for FIB node @fn: pick the best route via
 * find_rr_leaf() and, when round-robin was requested, advance fn->rr_ptr
 * to the next same-metric route.  Always leaves @res populated, falling
 * back to fib6_null_entry.  Called under rcu_read_lock().
 */
748 static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
749                        struct fib6_result *res, int strict)
750 {
751         struct fib6_info *leaf = rcu_dereference(fn->leaf);
752         struct fib6_info *rt0;
753         bool do_rr = false;
754         int key_plen;
755
756         /* make sure this function or its helpers sets f6i */
757         res->f6i = NULL;
758
759         if (!leaf || leaf == net->ipv6.fib6_null_entry)
760                 goto out;
761
762         rt0 = rcu_dereference(fn->rr_ptr);
763         if (!rt0)
764                 rt0 = leaf;
765
766         /* Double check to make sure fn is not an intermediate node
767          * and fn->leaf does not points to its child's leaf
768          * (This might happen if all routes under fn are deleted from
769          * the tree and fib6_repair_tree() is called on the node.)
770          */
771         key_plen = rt0->fib6_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773         if (rt0->fib6_src.plen)
774                 key_plen = rt0->fib6_src.plen;
775 #endif
776         if (fn->fn_bit != key_plen)
777                 goto out;
778
779         find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
780         if (do_rr) {
781                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
782
783                 /* no entries matched; do round-robin */
784                 if (!next || next->fib6_metric != rt0->fib6_metric)
785                         next = leaf;
786
787                 if (next != rt0) {
788                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
789                         /* make sure next is not being deleted from the tree */
790                         if (next->fib6_node)
791                                 rcu_assign_pointer(fn->rr_ptr, next);
792                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
793                 }
794         }
795
796 out:
797         if (!res->f6i) {
798                 res->f6i = net->ipv6.fib6_null_entry;
799                 res->nh = &res->f6i->fib6_nh;
800                 res->fib6_flags = res->f6i->fib6_flags;
801                 res->fib6_type = res->f6i->fib6_type;
802         }
803 }
804
805 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
806 {
807         return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
808                res->nh->fib_nh_gw_family;
809 }
810
811 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle an RFC 4191 Route Information Option received in a Router
 * Advertisement from @gwaddr on @dev: validate the option, then add,
 * refresh, or (on zero lifetime) delete the corresponding
 * RTF_ROUTEINFO route.  Returns 0 on success or -EINVAL on a malformed
 * option.
 */
812 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
813                   const struct in6_addr *gwaddr)
814 {
815         struct net *net = dev_net(dev);
816         struct route_info *rinfo = (struct route_info *) opt;
817         struct in6_addr prefix_buf, *prefix;
818         unsigned int pref;
819         unsigned long lifetime;
820         struct fib6_info *rt;
821
822         if (len < sizeof(struct route_info)) {
823                 return -EINVAL;
824         }
825
826         /* Sanity check for prefix_len and length */
         /* option length is in units of 8 octets; 1/2/3 carry 0/8/16
          * bytes of prefix, so prefix_len bounds the minimum length
          */
827         if (rinfo->length > 3) {
828                 return -EINVAL;
829         } else if (rinfo->prefix_len > 128) {
830                 return -EINVAL;
831         } else if (rinfo->prefix_len > 64) {
832                 if (rinfo->length < 2) {
833                         return -EINVAL;
834                 }
835         } else if (rinfo->prefix_len > 0) {
836                 if (rinfo->length < 1) {
837                         return -EINVAL;
838                 }
839         }
840
841         pref = rinfo->route_pref;
842         if (pref == ICMPV6_ROUTER_PREF_INVALID)
843                 return -EINVAL;
844
845         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
846
847         if (rinfo->length == 3)
848                 prefix = (struct in6_addr *)rinfo->prefix;
849         else {
850                 /* this function is safe */
851                 ipv6_addr_prefix(&prefix_buf,
852                                  (struct in6_addr *)rinfo->prefix,
853                                  rinfo->prefix_len);
854                 prefix = &prefix_buf;
855         }
856
         /* zero-length prefix means the default route via this router */
857         if (rinfo->prefix_len == 0)
858                 rt = rt6_get_dflt_router(net, gwaddr, dev);
859         else
860                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
861                                         gwaddr, dev);
862
         /* lifetime 0 withdraws an existing route */
863         if (rt && !lifetime) {
864                 ip6_del_rt(net, rt);
865                 rt = NULL;
866         }
867
868         if (!rt && lifetime)
869                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
870                                         dev, pref);
871         else if (rt)
872                 rt->fib6_flags = RTF_ROUTEINFO |
873                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
874
875         if (rt) {
876                 if (!addrconf_finite_timeout(lifetime))
877                         fib6_clean_expires(rt);
878                 else
879                         fib6_set_expires(rt, jiffies + HZ * lifetime);
880
                 /* drop the lookup/add reference */
881                 fib6_info_release(rt);
882         }
883         return 0;
884 }
885 #endif
885 #endif
886
887 /*
888  *      Misc support functions
889  */
890
/* called with rcu_lock held */
/* Choose the net_device for a dst built from @res.  For LOCAL/ANYCAST
 * copies, an enslaved device is remapped to its L3 master (unless
 * strict routing is required for the destination), and a plain device
 * falls back to the loopback device.
 */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
913
/* Map a fib6 route type (RTN_*) to the dst error reported for packets
 * that match it; 0 means the route delivers/forwards normally.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
928
929 static int ip6_rt_type_to_error(u8 fib6_type)
930 {
931         return fib6_prop[fib6_type];
932 }
933
934 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
935 {
936         unsigned short flags = 0;
937
938         if (rt->dst_nocount)
939                 flags |= DST_NOCOUNT;
940         if (rt->dst_nopolicy)
941                 flags |= DST_NOPOLICY;
942         if (rt->dst_host)
943                 flags |= DST_HOST;
944
945         return flags;
946 }
947
948 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
949 {
950         rt->dst.error = ip6_rt_type_to_error(fib6_type);
951
952         switch (fib6_type) {
953         case RTN_BLACKHOLE:
954                 rt->dst.output = dst_discard_out;
955                 rt->dst.input = dst_discard;
956                 break;
957         case RTN_PROHIBIT:
958                 rt->dst.output = ip6_pkt_prohibit_out;
959                 rt->dst.input = ip6_pkt_prohibit;
960                 break;
961         case RTN_THROW:
962         case RTN_UNREACHABLE:
963         default:
964                 rt->dst.output = ip6_pkt_discard_out;
965                 rt->dst.input = ip6_pkt_discard;
966                 break;
967         }
968 }
969
/* Initialize dst handlers for a rt6_info built from @res.  Reject
 * routes get error handlers; otherwise input is chosen by destination
 * type (local/anycast, multicast, or forwarding) and any lwtunnel
 * state from the nexthop is attached.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	/* lightweight tunnel: share its state and let it override output */
	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
997
/* Caller must already hold reference to @from */
/* Publish the backpointer from a cached rt to its originating
 * fib6_info (RCU-visible) and share @from's metrics with the dst.
 * RTF_EXPIRES is cleared on the clone itself.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
1005
/* Caller must already hold reference to f6i in result */
/* Copy routing state from the fib6 lookup result into a freshly
 * allocated rt6_info: dst handlers, destination (and, with subtrees,
 * source) prefix, flags, gateway, idev reference and the 'from' link.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	/* in6_dev_get() elevates the inet6_dev refcount */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
1027
/* Walk back up the fib6 tree from @fn to the next node carrying route
 * info (RTN_RTINFO), descending into a parent's source-address subtree
 * when one exists.  Returns NULL once the tree root is reached.
 * Caller must hold rcu_read_lock() (parent pointers are
 * rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1045
1046 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1047 {
1048         struct rt6_info *rt = *prt;
1049
1050         if (dst_hold_safe(&rt->dst))
1051                 return true;
1052         if (net) {
1053                 rt = net->ipv6.ip6_null_entry;
1054                 dst_hold(&rt->dst);
1055         } else {
1056                 rt = NULL;
1057         }
1058         *prt = rt;
1059         return false;
1060 }
1061
/* called with rcu_lock held */
/* Build a one-off (uncached) rt6_info for @res.  Falls back to the
 * net's ip6_null_entry (with a hold taken) when the fib6_info is on
 * its way out or allocation fails, so the caller always receives a
 * usable dst.
 */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	/* ip6_rt_copy_init() requires the f6i reference taken above */
	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1088
/* Core table lookup: find the best fib6 route for @fl6 in @table under
 * RCU, preferring a cached exception route when one exists, otherwise
 * cloning an uncached rt.  Always returns a held dst (possibly the
 * ip6_null_entry).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked to ignore the nexthop's output interface */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		/* no usable route at this node: retry at a less specific one */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	/* choose the nexthop (multipath-aware) */
	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}
1141
/* Public lookup entry point: dispatch through the policy-routing
 * rules (when configured) down to ip6_pol_route_lookup().
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1148
1149 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1150                             const struct in6_addr *saddr, int oif,
1151                             const struct sk_buff *skb, int strict)
1152 {
1153         struct flowi6 fl6 = {
1154                 .flowi6_oif = oif,
1155                 .daddr = *daddr,
1156         };
1157         struct dst_entry *dst;
1158         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1159
1160         if (saddr) {
1161                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1162                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1163         }
1164
1165         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1166         if (dst->error == 0)
1167                 return (struct rt6_info *) dst;
1168
1169         dst_release(dst);
1170
1171         return NULL;
1172 }
1173 EXPORT_SYMBOL(rt6_lookup);
1174
1175 /* ip6_ins_rt is called with FREE table->tb6_lock.
1176  * It takes new route entry, the addition fails by any reason the
1177  * route is released.
1178  * Caller must hold dst before calling it.
1179  */
1180
1181 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1182                         struct netlink_ext_ack *extack)
1183 {
1184         int err;
1185         struct fib6_table *table;
1186
1187         table = rt->fib6_table;
1188         spin_lock_bh(&table->tb6_lock);
1189         err = fib6_add(&table->tb6_root, rt, info, extack);
1190         spin_unlock_bh(&table->tb6_lock);
1191
1192         return err;
1193 }
1194
1195 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1196 {
1197         struct nl_info info = { .nl_net = net, };
1198
1199         return __ip6_ins_rt(rt, &info, NULL);
1200 }
1201
/* Clone the route in @res into a host-specific (/128) RTF_CACHE entry
 * for (daddr, saddr).  Returns NULL if the fib6_info is being freed or
 * dst allocation fails.  Runs under RCU (ip6_rt_get_dev_rcu()).
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		/* a non-gateway clone of a non-host route whose prefix
		 * address equals daddr is treated as anycast
		 */
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1244
/* Allocate a per-cpu cached copy (RTF_PCPU) of the route in @res.
 * Returns NULL if the fib6_info is being freed or dst allocation
 * fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1267
1268 /* It should be called with rcu_read_lock() acquired */
1269 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1270 {
1271         struct rt6_info *pcpu_rt, **p;
1272
1273         p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1274         pcpu_rt = *p;
1275
1276         if (pcpu_rt)
1277                 ip6_hold_safe(NULL, &pcpu_rt);
1278
1279         return pcpu_rt;
1280 }
1281
1282 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1283                                             const struct fib6_result *res)
1284 {
1285         struct rt6_info *pcpu_rt, *prev, **p;
1286
1287         pcpu_rt = ip6_rt_pcpu_alloc(res);
1288         if (!pcpu_rt) {
1289                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1290                 return net->ipv6.ip6_null_entry;
1291         }
1292
1293         dst_hold(&pcpu_rt->dst);
1294         p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1295         prev = cmpxchg(p, NULL, pcpu_rt);
1296         BUG_ON(prev);
1297
1298         return pcpu_rt;
1299 }
1300
/* exception hash table implementation
 * rt6_exception_lock serializes all writers of the per-route
 * exception buckets; readers use RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1304
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* RCU readers may still be walking the chain: defer the free */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1333
1334 /* Remove oldest rt6_ex in bucket and free the memory
1335  * Caller must hold rt6_exception_lock
1336  */
1337 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1338 {
1339         struct rt6_exception *rt6_ex, *oldest = NULL;
1340
1341         if (!bucket)
1342                 return;
1343
1344         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1345                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1346                         oldest = rt6_ex;
1347         }
1348         rt6_remove_exception(bucket, oldest);
1349 }
1350
1351 static u32 rt6_exception_hash(const struct in6_addr *dst,
1352                               const struct in6_addr *src)
1353 {
1354         static u32 seed __read_mostly;
1355         u32 val;
1356
1357         net_get_random_once(&seed, sizeof(seed));
1358         val = jhash(dst, sizeof(*dst), seed);
1359
1360 #ifdef CONFIG_IPV6_SUBTREES
1361         if (src)
1362                 val = jhash(src, sizeof(*src), val);
1363 #endif
1364         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1365 }
1366
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 * Returns the matching rt6_exception or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance *bucket to the hashed slot so callers can insert/remove there */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1399
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 * RCU counterpart of __rt6_find_exception_spinlock(); returns the
 * matching rt6_exception or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance *bucket to the hashed slot for this (daddr, saddr) */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1434
1435 static unsigned int fib6_mtu(const struct fib6_result *res)
1436 {
1437         const struct fib6_nh *nh = res->nh;
1438         unsigned int mtu;
1439
1440         if (res->f6i->fib6_pmtu) {
1441                 mtu = res->f6i->fib6_pmtu;
1442         } else {
1443                 struct net_device *dev = nh->fib_nh_dev;
1444                 struct inet6_dev *idev;
1445
1446                 rcu_read_lock();
1447                 idev = __in6_dev_get(dev);
1448                 mtu = idev->cnf.mtu6;
1449                 rcu_read_unlock();
1450         }
1451
1452         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1453
1454         return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1455 }
1456
/* Insert cached route @nrt as an exception for the fib6 route in
 * @res.  Replaces any existing exception with the same (dst, src)
 * key, enforces the per-bucket depth limit, and on success bumps the
 * table sernum so stale cached dsts get revalidated.
 * Returns 0 on success or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* refuse to insert once rt6_flush_exceptions() has run on f6i */
	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	/* drop any existing exception for the same key before inserting */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1537
/* Remove and free every exception route attached to @rt, and mark the
 * bucket flushed so rt6_insert_exception() cannot recreate it.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1564
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Expired entries are skipped; returns the cached rt6_info or NULL.
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6i_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

	return ret;
}
1596
/* Remove the passed in cached rt from the hash table that contains it
 * Returns 0 on success, -EINVAL if @rt is not a cached route (no
 * 'from' or no RTF_CACHE), or -ENOENT when no matching exception is
 * found.
 * NOTE(review): rt->from is rcu_dereference'd here — assumes the
 * caller provides an RCU read-side context; confirm at call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1640
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 * (a fresh stamp protects the entry from being chosen by
 * rt6_exception_remove_oldest())
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1677
1678 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1679                                          struct rt6_info *rt, int mtu)
1680 {
1681         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1682          * lowest MTU in the path: always allow updating the route PMTU to
1683          * reflect PMTU decreases.
1684          *
1685          * If the new MTU is higher, and the route PMTU is equal to the local
1686          * MTU, this means the old MTU is the lowest in the path, so allow
1687          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1688          * handle this.
1689          */
1690
1691         if (dst_mtu(&rt->dst) >= mtu)
1692                 return true;
1693
1694         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1695                 return true;
1696
1697         return false;
1698 }
1699
/* Propagate a device MTU change (@mtu) onto all exception routes of
 * @rt that carry their own RTAX_MTU metric, subject to the policy in
 * rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock (bucket access is checked via
 * lockdep_is_held).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1728
1729 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1730
/* Remove all RTF_CACHE|RTF_GATEWAY exceptions of @rt whose gateway
 * address equals @gateway.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the writer lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1765
/* Decide whether one cached exception should be reclaimed during GC:
 * remove aged-out non-EXPIRES clones, expired EXPIRES entries, and
 * gateway routes whose neighbour no longer advertises NTF_ROUTER.
 * Surviving entries bump gc_args->more so the GC timer keeps running.
 *
 * Called from rt6_age_exceptions() with rt6_exception_lock held and
 * BH-disabled RCU read side (needed for the noref neighbour lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* missing neighbour counts as "not a router" here */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1809
/* Garbage-collect the exception table of fib6 entry @rt: walk every
 * bucket and let rt6_age_examine_exception() prune stale entries.
 *
 * Locking: rcu_read_lock_bh() is taken before the spinlock because the
 * per-exception examination performs a noref neighbour lookup; the
 * lockless rcu_access_pointer() check avoids all of it when no bucket
 * was ever allocated.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1840
/* Core table lookup: resolve @fl6 in @table into @res.
 *
 * On a miss (res->f6i left as fib6_null_entry by rt6_select()) the fib
 * tree is backtracked toward less-specific prefixes; if that exhausts
 * and RT6_LOOKUP_F_REACHABLE was requested, the lookup is retried once
 * from the original node with the reachability restriction dropped.
 * Always returns 0.
 *
 * must be called with rcu lock held
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
1871
/* Generic policy-routing lookup: resolve @fl6 in @table and return an
 * rt6_info holding a dst reference the caller must release.
 *
 * Three kinds of result are possible after the fib entry is found:
 *  - a cached exception route (PMTU/redirect clone) found via
 *    rt6_find_cached_rt(),
 *  - a freshly allocated RTF_CACHE clone, tracked only on the uncached
 *    list, for FLOWI_FLAG_KNOWN_NH lookups on gatewayless nexthops,
 *  - otherwise a per-cpu copy of the fib entry.
 * On total miss, ip6_null_entry is returned (with a hold).
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* only prefer reachable routers when not forwarding */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/*Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BHs disabled: the percpu route cache is per-cpu state */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1948
1949 static struct rt6_info *ip6_pol_route_input(struct net *net,
1950                                             struct fib6_table *table,
1951                                             struct flowi6 *fl6,
1952                                             const struct sk_buff *skb,
1953                                             int flags)
1954 {
1955         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1956 }
1957
1958 struct dst_entry *ip6_route_input_lookup(struct net *net,
1959                                          struct net_device *dev,
1960                                          struct flowi6 *fl6,
1961                                          const struct sk_buff *skb,
1962                                          int flags)
1963 {
1964         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1965                 flags |= RT6_LOOKUP_F_IFACE;
1966
1967         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1968 }
1969 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1970
/* Fill @keys with the L3 multipath hash inputs for @skb.
 *
 * Normally the pre-dissected @flkeys (if provided) are used.  For
 * ICMPv6 error messages the keys are instead taken from the embedded
 * (offending) inner header, so errors hash onto the same path as the
 * flow that triggered them.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* only ICMPv6 errors embed the offending packet */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* hash on the inner header; discard the outer dissected keys */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
2018
/* Compute the multipath hash for a flow, per the per-netns hash policy:
 * policy 0 hashes L3 fields (addresses, flow label, protocol), policy 1
 * also hashes L4 ports.  The result is shifted right by one so the
 * value fits the callers' expectations (top bit reserved).
 *
 * if skb is set it will be used and fl6 can be NULL
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 only */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L3 + L4 ports */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2075
/* Route an incoming IPv6 packet: build a flowi6 from the packet
 * headers, compute a multipath hash for ICMPv6 (so errors follow the
 * offending flow), and attach the resulting dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* carry the tunnel id into the lookup for RX-side metadata dsts */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2105
2106 static struct rt6_info *ip6_pol_route_output(struct net *net,
2107                                              struct fib6_table *table,
2108                                              struct flowi6 *fl6,
2109                                              const struct sk_buff *skb,
2110                                              int flags)
2111 {
2112         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2113 }
2114
/* Output-path route lookup entry point.
 *
 * Multicast/link-local destinations are first offered to an l3mdev
 * (VRF) link-scope lookup.  Otherwise the flow's iif is pinned to
 * loopback and lookup flags are derived from the socket binding and
 * source-address state before dispatching through the fib rules.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* honour the socket's source-address selection preferences */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2144
/* Replace @dst_orig with a "blackhole" copy: a loopback-bound dst that
 * discards everything sent through it while preserving the original's
 * metrics, gateway and keys.  Consumes the caller's reference on
 * @dst_orig; returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* all traffic through this dst is dropped */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2177
2178 /*
2179  *      Destination cache support functions
2180  */
2181
2182 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2183 {
2184         u32 rt_cookie = 0;
2185
2186         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2187                 return false;
2188
2189         if (fib6_check_expired(f6i))
2190                 return false;
2191
2192         return true;
2193 }
2194
2195 static struct dst_entry *rt6_check(struct rt6_info *rt,
2196                                    struct fib6_info *from,
2197                                    u32 cookie)
2198 {
2199         u32 rt_cookie = 0;
2200
2201         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2202             rt_cookie != cookie)
2203                 return NULL;
2204
2205         if (rt6_check_expired(rt))
2206                 return NULL;
2207
2208         return &rt->dst;
2209 }
2210
2211 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2212                                             struct fib6_info *from,
2213                                             u32 cookie)
2214 {
2215         if (!__rt6_check_expired(rt) &&
2216             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2217             fib6_check(from, cookie))
2218                 return &rt->dst;
2219         else
2220                 return NULL;
2221 }
2222
/* dst_ops->check implementation: revalidate a cached dst against the
 * fib cookie, choosing the pcpu/uncached variant when the route is a
 * per-cpu copy or sits on the uncached list.  Returns the dst if still
 * valid, NULL if the caller must re-do the route lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2250
/* dst_ops->negative_advice implementation: the caller hints that this
 * dst performed badly.  Expired cache exceptions are unlinked; other
 * routes simply drop the caller's reference.  Returns the (possibly
 * NULL) dst the caller should keep using.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				/* counterpart of dst_release below; the
				 * exception table held no dst ref */
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2270
/* dst_ops->link_failure implementation: report an unreachable address
 * back to the sender and invalidate the route that failed.  A cached
 * exception is removed outright; for a default route the fib node's
 * sernum is poisoned so cached dsts fail their next cookie check and
 * force a fresh lookup.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					/* -1 never matches a dst cookie */
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2296
/* Arm (or re-arm) the expiry of cached route @rt0 to @timeout jiffies
 * from now and flag it RTF_EXPIRES.  A route that had no expiry of its
 * own first inherits its 'from' fib entry's expiry so dst_set_expires()
 * can only move the deadline earlier, never extend it past the parent's.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2312
2313 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2314 {
2315         struct net *net = dev_net(rt->dst.dev);
2316
2317         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2318         rt->rt6i_flags |= RTF_MODIFIED;
2319         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2320 }
2321
2322 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2323 {
2324         return !(rt->rt6i_flags & RTF_CACHE) &&
2325                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2326 }
2327
/* Core path-MTU update (RFC 8201): confirm the neighbour that reported
 * the smaller MTU, clamp to IPV6_MIN_MTU, and record the new MTU either
 * directly on @dst (non-cacheable routes) or in a freshly created
 * cached exception clone of the fib origin.
 *
 * The destination/source used for the exception come from @iph if set,
 * else from @sk; with neither, only the direct-update path is possible.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* administratively locked MTU is never overridden */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	/* only ever shrink the path MTU */
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* insert failure means a duplicate won the race;
			 * drop our clone's reference */
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2380
2381 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2382                                struct sk_buff *skb, u32 mtu)
2383 {
2384         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2385 }
2386
/* Apply a path-MTU update for the flow described by @skb's embedded
 * IPv6 header (e.g. from a received Packet Too Big): look up the
 * matching output route and update its PMTU.  @mtu is in network byte
 * order; a zero @mark falls back to the netns reply-mark policy.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2407
/* Socket-scoped PMTU update: propagate the new MTU for @sk's flow and,
 * if the socket's cached dst no longer validates afterwards, refresh
 * the datagram socket's route (skipping v4-mapped destinations, which
 * are handled by the IPv4 path).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2429
/* Cache @dst on socket @sk for the flow @fl6.  The destination (and,
 * with subtrees enabled, the source) address is passed to
 * ip6_dst_store() only when it matches the socket's own address, i.e.
 * when the cached route is keyed exactly on the socket's flow.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2446
/* Test whether nexthop @res->nh can have originated the redirect from
 * gateway @gw for flow @fl6.  A live gatewayed nexthop on the right
 * interface matches directly; otherwise a cached exception with the
 * advertised gateway is accepted and returned through @ret.
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
2476
/* Handle redirects.
 *
 * fib6_rule_lookup() only passes a flowi6, so the advertising router's
 * address rides piggy-back behind it; __ip6_route_redirect() casts the
 * flowi6 pointer back to this wrapper to recover the gateway.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2482
2483 static struct rt6_info *__ip6_route_redirect(struct net *net,
2484                                              struct fib6_table *table,
2485                                              struct flowi6 *fl6,
2486                                              const struct sk_buff *skb,
2487                                              int flags)
2488 {
2489         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2490         struct rt6_info *ret = NULL;
2491         struct fib6_result res = {};
2492         struct fib6_info *rt;
2493         struct fib6_node *fn;
2494
2495         /* Get the "current" route for this destination and
2496          * check if the redirect has come from appropriate router.
2497          *
2498          * RFC 4861 specifies that redirects should only be
2499          * accepted if they come from the nexthop to the target.
2500          * Due to the way the routes are chosen, this notion
2501          * is a bit fuzzy and one might need to check all possible
2502          * routes.
2503          */
2504
2505         rcu_read_lock();
2506         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2507 restart:
2508         for_each_fib6_node_rt_rcu(fn) {
2509                 res.f6i = rt;
2510                 res.nh = &rt->fib6_nh;
2511
2512                 if (fib6_check_expired(rt))
2513                         continue;
2514                 if (rt->fib6_flags & RTF_REJECT)
2515                         break;
2516                 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2517                         goto out;
2518         }
2519
2520         if (!rt)
2521                 rt = net->ipv6.fib6_null_entry;
2522         else if (rt->fib6_flags & RTF_REJECT) {
2523                 ret = net->ipv6.ip6_null_entry;
2524                 goto out;
2525         }
2526
2527         if (rt == net->ipv6.fib6_null_entry) {
2528                 fn = fib6_backtrack(fn, &fl6->saddr);
2529                 if (fn)
2530                         goto restart;
2531         }
2532
2533         res.f6i = rt;
2534         res.nh = &rt->fib6_nh;
2535 out:
2536         if (ret) {
2537                 ip6_hold_safe(net, &ret);
2538         } else {
2539                 res.fib6_flags = res.f6i->fib6_flags;
2540                 res.fib6_type = res.f6i->fib6_type;
2541                 ret = ip6_create_rt_rcu(&res);
2542         }
2543
2544         rcu_read_unlock();
2545
2546         trace_fib6_table_lookup(net, &res, table, fl6);
2547         return ret;
2548 };
2549
2550 static struct dst_entry *ip6_route_redirect(struct net *net,
2551                                             const struct flowi6 *fl6,
2552                                             const struct sk_buff *skb,
2553                                             const struct in6_addr *gateway)
2554 {
2555         int flags = RT6_LOOKUP_F_HAS_SADDR;
2556         struct ip6rd_flowi rdfl;
2557
2558         rdfl.fl6 = *fl6;
2559         rdfl.gateway = *gateway;
2560
2561         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2562                                 flags, __ip6_route_redirect);
2563 }
2564
/* Process a redirect for the flow described by the IPv6 header at
 * skb->data, then apply it via rt6_do_redirect().
 *
 * Note the two different header accessors below: the flow addresses are
 * read from the header at skb->data, while the redirecting router's
 * address is taken from ipv6_hdr(skb)->saddr.  NOTE(review): these are
 * presumably the embedded (offending) header and the outer header of
 * the received packet respectively, so they can legitimately differ —
 * confirm against the callers before "unifying" them.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2585
/* Process a redirect when no embedded flow header is available: the
 * lookup destination comes from the redirect message itself
 * (msg->dest) and the received packet's destination address is used as
 * the flow source.  NOTE(review): the saddr = iph->daddr pairing looks
 * inverted but mirrors the addresses of the received redirect — do not
 * "fix" without checking the ndisc call sites.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2603
2604 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2605 {
2606         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2607                      sk->sk_uid);
2608 }
2609 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2610
2611 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2612 {
2613         struct net_device *dev = dst->dev;
2614         unsigned int mtu = dst_mtu(dst);
2615         struct net *net = dev_net(dev);
2616
2617         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2618
2619         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2620                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2621
2622         /*
2623          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2624          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2625          * IPV6_MAXPLEN is also valid and means: "any MSS,
2626          * rely only on pmtu discovery"
2627          */
2628         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2629                 mtu = IPV6_MAXPLEN;
2630         return mtu;
2631 }
2632
2633 static unsigned int ip6_mtu(const struct dst_entry *dst)
2634 {
2635         struct inet6_dev *idev;
2636         unsigned int mtu;
2637
2638         mtu = dst_metric_raw(dst, RTAX_MTU);
2639         if (mtu)
2640                 goto out;
2641
2642         mtu = IPV6_MIN_MTU;
2643
2644         rcu_read_lock();
2645         idev = __in6_dev_get(dst->dev);
2646         if (idev)
2647                 mtu = idev->cnf.mtu6;
2648         rcu_read_unlock();
2649
2650 out:
2651         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2652
2653         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2654 }
2655
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
		      const struct in6_addr *daddr,
		      const struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	const struct fib6_nh *nh = res->nh;
	struct fib6_info *f6i = res->f6i;
	const struct in6_addr *src_key;
	struct rt6_exception *rt6_ex;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* 1. an administratively locked, non-zero route MTU wins outright */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* The source address keys the exception lookup only when the
	 * route carries a source prefix (subtree routing).
	 */
	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* 2. a cached, unexpired PMTU exception for this destination */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* 3. fall back to the egress device mtu6, at least IPV6_MIN_MTU */
	if (likely(!mtu)) {
		struct net_device *dev = nh->fib_nh_dev;

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* leave room for any lightweight-tunnel encapsulation headers */
	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
2706
/* Allocate a standalone (uncached) dst for an outgoing ICMPv6 packet
 * described by @fl6: a /128 host route toward fl6->daddr on @dev, with
 * hop limit metric forced to 0.  Returns the dst, possibly transformed
 * by xfrm_lookup(), or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the idev reference taken above before bailing out */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* the in6_dev_get() reference is transferred to rt->rt6i_idev */
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2745
/* dst_ops garbage-collection callback for the IPv6 routing cache.
 *
 * GC is skipped while within ip6_rt_gc_min_interval of the last run,
 * unless the entry count already exceeds ip6_rt_max_size.  Each run
 * increments ip6_rt_gc_expire so fib6_run_gc() becomes progressively
 * more aggressive; if a run brings the count under gc_thresh the
 * counter is reset to half the GC timeout.  On every call the counter
 * decays by 1/2^rt_elasticity.  Returns non-zero while the cache is
 * still over rt_max_size (signals the dst layer that allocations
 * should fail).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2770
2771 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2772                                             struct fib6_config *cfg,
2773                                             const struct in6_addr *gw_addr,
2774                                             u32 tbid, int flags)
2775 {
2776         struct flowi6 fl6 = {
2777                 .flowi6_oif = cfg->fc_ifindex,
2778                 .daddr = *gw_addr,
2779                 .saddr = cfg->fc_prefsrc,
2780         };
2781         struct fib6_table *table;
2782         struct rt6_info *rt;
2783
2784         table = fib6_get_table(net, tbid);
2785         if (!table)
2786                 return NULL;
2787
2788         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2789                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2790
2791         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2792         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2793
2794         /* if table lookup failed, fall back to full lookup */
2795         if (rt == net->ipv6.ip6_null_entry) {
2796                 ip6_rt_put(rt);
2797                 rt = NULL;
2798         }
2799
2800         return rt;
2801 }
2802
/* Validate an RTNH_F_ONLINK nexthop.
 *
 * Looks the gateway up in the device's L3 master table (or the main
 * table) and rejects the nexthop if a non-default matching route is
 * local/anycast/reject or resolves via a different device than the one
 * configured.  Returns 0 when acceptable, -EINVAL with an extack
 * message otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		/* grt->from is RCU-protected, hence the read-side lock */
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2835
/* Check that a gateway is reachable and resolve the egress device.
 *
 * If the config names a table, that table is tried first; routes that
 * would themselves go via a gateway, or via a different device than the
 * one requested, are discarded.  Otherwise rt6_lookup() is used.  When
 * no device was given, *_dev and *idev are filled from the matching
 * route (with references held for the caller).  Returns 0 only when
 * the gateway is directly reachable (matched route has no RTF_GATEWAY),
 * -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* the table route must be direct and on the
			 * requested device (if one was given)
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		/* caller fixed the device: the route must use it */
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device and idev from the matched route; take
		 * references the caller is responsible for dropping
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2886
/* Validate the gateway of a new route and resolve its egress device.
 *
 * Rejects gateways that are local addresses, enforces the rule that
 * non-link-local nexthops must be unicast or IPv4-mapped, and defers
 * reachability checking to ip6_route_check_nh{,_onlink}().  When no
 * device was supplied, the nexthop check may fill in *_dev / *idev, in
 * which case the local-address check is repeated against the resolved
 * device.  Returns 0 on success, -EINVAL or the nexthop-check error
 * otherwise, with extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* link-local gateways are only checked against the given device */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2959
2960 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2961 {
2962         if ((flags & RTF_REJECT) ||
2963             (dev && (dev->flags & IFF_LOOPBACK) &&
2964              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2965              !(flags & RTF_LOCAL)))
2966                 return true;
2967
2968         return false;
2969 }
2970
/* Initialize a fib6_nh from a route config.
 *
 * Resolves the egress device (and its inet6_dev), validates the
 * gateway when RTF_GATEWAY is set, pins reject routes to the loopback
 * device, and initializes the common nexthop fields (encap, flags,
 * weight).  On success the fib6_nh keeps a reference on the device;
 * the idev reference taken here is always dropped before returning.
 * Returns 0 or a negative errno, with extack set on validation errors.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		/* onlink requires an explicit, up device */
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		/* reject routes skip gateway/carrier/encap handling */
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may resolve dev/idev when none was given */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	/* a device is mandatory from this point on */
	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;
set_dev:
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	/* the idev reference is only needed during setup */
	if (idev)
		in6_dev_put(idev);

	/* on failure, undo the lwtstate and device references */
	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3082
/* Tear down a fib6_nh via the generic nexthop release path (counterpart
 * of the fib_nh_common_init() call in fib6_nh_init()).
 */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	fib_nh_common_release(&fib6_nh->nh_common);
}
3087
/* Allocate and initialize a fib6_info from a route config.
 *
 * Validates the config (flags, type, prefix lengths), selects or
 * creates the target table, then fills in metrics, expiry, protocol,
 * destination/source prefixes, the nexthop and the preferred source
 * address.  Returns the new fib6_info — not yet inserted into any
 * table — or an ERR_PTR on failure, with extack set where applicable.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	/* without NLM_F_CREATE, only look the table up; creating it
	 * anyway (with a warning) preserves historical behavior
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* RTF_GATEWAY is represented by the nexthop, not fib6_flags */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		/* the preferred source must be an address on the nexthop dev */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3216
3217 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3218                   struct netlink_ext_ack *extack)
3219 {
3220         struct fib6_info *rt;
3221         int err;
3222
3223         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3224         if (IS_ERR(rt))
3225                 return PTR_ERR(rt);
3226
3227         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3228         fib6_info_release(rt);
3229
3230         return err;
3231 }
3232
3233 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3234 {
3235         struct net *net = info->nl_net;
3236         struct fib6_table *table;
3237         int err;
3238
3239         if (rt == net->ipv6.fib6_null_entry) {
3240                 err = -ENOENT;
3241                 goto out;
3242         }
3243
3244         table = rt->fib6_table;
3245         spin_lock_bh(&table->tb6_lock);
3246         err = fib6_del(rt, info);
3247         spin_unlock_bh(&table->tb6_lock);
3248
3249 out:
3250         fib6_info_release(rt);
3251         return err;
3252 }
3253
/* Kernel-internal route deletion: delete @rt with a minimal nl_info
 * (no originating netlink request).  Consumes the caller's reference.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
3260
/* Delete a route and, when fc_delete_all_nh is set, all of its sibling
 * nexthops under a single table lock.
 *
 * A single RTM_DELROUTE notification covering every hop is preferred:
 * if it can be built, per-hop notifications are suppressed via
 * skip_notify and the combined message is sent after the lock is
 * dropped.  Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* fall back to per-hop notifications */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3312
3313 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3314 {
3315         int rc = -ESRCH;
3316
3317         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3318                 goto out;
3319
3320         if (cfg->fc_flags & RTF_GATEWAY &&
3321             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3322                 goto out;
3323
3324         rc = rt6_remove_exception_rt(rt);
3325 out:
3326         return rc;
3327 }
3328
/* Delete a route matching @cfg from its table.
 *
 * With RTF_CACHE set, only a cached exception route matching the
 * dst/src addresses is removed.  Otherwise the first fib6_info in the
 * located node that matches device, gateway, metric and protocol (each
 * checked only when specified in @cfg) is deleted: with RTF_GATEWAY
 * just that one hop, otherwise the route together with its siblings.
 * Returns -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* exact-match locate; for RTF_CACHE also allow intermediate nodes */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				struct fib6_result res = {
					.f6i = rt,
				};
				int rc;

				rt_cache = rt6_find_cached_rt(&res,
							      &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					/* -ESRCH means "didn't match the
					 * constraints" — keep searching
					 */
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* presumably fails when the entry is already being
			 * freed — skip it in that case
			 */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3401
/* ICMPv6 Redirect handler (RFC 4861 sect. 8): validate the redirect message,
 * update the neighbour cache for the new first hop, and install a cached
 * route (exception entry) that steers msg->dest via msg->target.
 * Invalid or unacceptable redirects are silently dropped (rate-limited
 * debug message only).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Length of the ND options that follow the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		/* target == dest: the destination itself is on-link */
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Forwarding nodes and hosts configured to ignore redirects drop them */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* Create the neighbour entry for the new first hop if needed */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	res.nh = &res.f6i->fib6_nh;
	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	/* Clone a cache entry for the redirected destination */
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
3525
3526 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RTF_ROUTEINFO (RA route-information option) route for
 * prefix/prefixlen via gateway @gwaddr on @dev.
 * Returns the matching fib6_info with a reference taken (via
 * fib6_info_hold_safe()), or NULL if none exists; caller must release it.
 * Note: for_each_fib6_node_rt_rcu() iterates with the implicit cursor 'rt'.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* Skip entries already being destroyed */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3563
3564 static struct fib6_info *rt6_add_route_info(struct net *net,
3565                                            const struct in6_addr *prefix, int prefixlen,
3566                                            const struct in6_addr *gwaddr,
3567                                            struct net_device *dev,
3568                                            unsigned int pref)
3569 {
3570         struct fib6_config cfg = {
3571                 .fc_metric      = IP6_RT_PRIO_USER,
3572                 .fc_ifindex     = dev->ifindex,
3573                 .fc_dst_len     = prefixlen,
3574                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3575                                   RTF_UP | RTF_PREF(pref),
3576                 .fc_protocol = RTPROT_RA,
3577                 .fc_type = RTN_UNICAST,
3578                 .fc_nlinfo.portid = 0,
3579                 .fc_nlinfo.nlh = NULL,
3580                 .fc_nlinfo.nl_net = net,
3581         };
3582
3583         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3584         cfg.fc_dst = *prefix;
3585         cfg.fc_gateway = *gwaddr;
3586
3587         /* We should treat it as a default route if prefix length is 0. */
3588         if (!prefixlen)
3589                 cfg.fc_flags |= RTF_DEFAULT;
3590
3591         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3592
3593         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3594 }
3595 #endif
3596
/* Find the RA-learned (RTF_ADDRCONF|RTF_DEFAULT) default route via @addr
 * on @dev.  Returns the entry with a reference taken, or NULL.
 * Note: for_each_fib6_node_rt_rcu() iterates with the implicit cursor 'rt';
 * it is NULL after the loop if no entry matched.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* Entry may be concurrently dying; treat that as "not found" */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3623
3624 struct fib6_info *rt6_add_dflt_router(struct net *net,
3625                                      const struct in6_addr *gwaddr,
3626                                      struct net_device *dev,
3627                                      unsigned int pref)
3628 {
3629         struct fib6_config cfg = {
3630                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3631                 .fc_metric      = IP6_RT_PRIO_USER,
3632                 .fc_ifindex     = dev->ifindex,
3633                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3634                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3635                 .fc_protocol = RTPROT_RA,
3636                 .fc_type = RTN_UNICAST,
3637                 .fc_nlinfo.portid = 0,
3638                 .fc_nlinfo.nlh = NULL,
3639                 .fc_nlinfo.nl_net = net,
3640         };
3641
3642         cfg.fc_gateway = *gwaddr;
3643
3644         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3645                 struct fib6_table *table;
3646
3647                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3648                 if (table)
3649                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3650         }
3651
3652         return rt6_get_dflt_router(net, gwaddr, dev);
3653 }
3654
/* Delete all RA-learned (RTF_DEFAULT or RTF_ADDRCONF) routes from @table,
 * except on devices with accept_ra == 2 (accept RAs even when forwarding).
 * ip6_del_rt() cannot run under RCU, so each deletion drops the read lock
 * and the walk restarts from the table root.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    /* hold a ref so rt survives dropping the RCU lock */
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3678
3679 void rt6_purge_dflt_routers(struct net *net)
3680 {
3681         struct fib6_table *table;
3682         struct hlist_head *head;
3683         unsigned int h;
3684
3685         rcu_read_lock();
3686
3687         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3688                 head = &net->ipv6.fib_table_hash[h];
3689                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3690                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3691                                 __rt6_purge_dflt_routers(net, table);
3692                 }
3693         }
3694
3695         rcu_read_unlock();
3696 }
3697
3698 static void rtmsg_to_fib6_config(struct net *net,
3699                                  struct in6_rtmsg *rtmsg,
3700                                  struct fib6_config *cfg)
3701 {
3702         *cfg = (struct fib6_config){
3703                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3704                          : RT6_TABLE_MAIN,
3705                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3706                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3707                 .fc_expires = rtmsg->rtmsg_info,
3708                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3709                 .fc_src_len = rtmsg->rtmsg_src_len,
3710                 .fc_flags = rtmsg->rtmsg_flags,
3711                 .fc_type = rtmsg->rtmsg_type,
3712
3713                 .fc_nlinfo.nl_net = net,
3714
3715                 .fc_dst = rtmsg->rtmsg_dst,
3716                 .fc_src = rtmsg->rtmsg_src,
3717                 .fc_gateway = rtmsg->rtmsg_gateway,
3718         };
3719 }
3720
3721 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3722 {
3723         struct fib6_config cfg;
3724         struct in6_rtmsg rtmsg;
3725         int err;
3726
3727         switch (cmd) {
3728         case SIOCADDRT:         /* Add a route */
3729         case SIOCDELRT:         /* Delete a route */
3730                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3731                         return -EPERM;
3732                 err = copy_from_user(&rtmsg, arg,
3733                                      sizeof(struct in6_rtmsg));
3734                 if (err)
3735                         return -EFAULT;
3736
3737                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3738
3739                 rtnl_lock();
3740                 switch (cmd) {
3741                 case SIOCADDRT:
3742                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3743                         break;
3744                 case SIOCDELRT:
3745                         err = ip6_route_del(&cfg, NULL);
3746                         break;
3747                 default:
3748                         err = -EINVAL;
3749                 }
3750                 rtnl_unlock();
3751
3752                 return err;
3753         }
3754
3755         return -EINVAL;
3756 }
3757
3758 /*
3759  *      Drop the packet on the floor
3760  */
3761
/* Common drop path for blackhole/prohibit routes: bump the relevant SNMP
 * counter, send an ICMPv6 destination-unreachable with @code, free the skb.
 * Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	int type;

	/* For VRF/l3mdev with the loopback dst, account the stats against
	 * the original input interface instead.
	 */
	if (netif_is_l3_master(skb->dev) &&
	    dst->dev == net->loopback_dev)
		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	else
		idev = ip6_dst_idev(dst);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		/* unspecified destination counts as an address error */
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
		break;
	}

	/* Start over by dropping the dst for l3mdev case */
	if (netif_is_l3_master(skb->dev))
		skb_dst_drop(skb);

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3796
/* dst input handler for blackhole routes: account and drop with "no route" */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3801
3802 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3803 {
3804         skb->dev = skb_dst(skb)->dev;
3805         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3806 }
3807
/* dst input handler for prohibit routes: drop with "administratively
 * prohibited".
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3812
3813 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3814 {
3815         skb->dev = skb_dst(skb)->dev;
3816         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3817 }
3818
3819 /*
3820  *      Allocate a dst for local (unicast / anycast) address.
3821  */
3822
3823 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3824                                      struct inet6_dev *idev,
3825                                      const struct in6_addr *addr,
3826                                      bool anycast, gfp_t gfp_flags)
3827 {
3828         struct fib6_config cfg = {
3829                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3830                 .fc_ifindex = idev->dev->ifindex,
3831                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3832                 .fc_dst = *addr,
3833                 .fc_dst_len = 128,
3834                 .fc_protocol = RTPROT_KERNEL,
3835                 .fc_nlinfo.nl_net = net,
3836                 .fc_ignore_dev_down = true,
3837         };
3838
3839         if (anycast) {
3840                 cfg.fc_type = RTN_ANYCAST;
3841                 cfg.fc_flags |= RTF_ANYCAST;
3842         } else {
3843                 cfg.fc_type = RTN_LOCAL;
3844                 cfg.fc_flags |= RTF_LOCAL;
3845         }
3846
3847         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3848 }
3849
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* netns being cleaned */
	struct in6_addr *addr;	/* preferred source address being removed */
};
3856
3857 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3858 {
3859         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3860         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3861         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3862
3863         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3864             rt != net->ipv6.fib6_null_entry &&
3865             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3866                 spin_lock_bh(&rt6_exception_lock);
3867                 /* remove prefsrc entry */
3868                 rt->fib6_prefsrc.plen = 0;
3869                 spin_unlock_bh(&rt6_exception_lock);
3870         }
3871         return 0;
3872 }
3873
3874 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3875 {
3876         struct net *net = dev_net(ifp->idev->dev);
3877         struct arg_dev_net_ip adni = {
3878                 .dev = ifp->idev->dev,
3879                 .net = net,
3880                 .addr = &ifp->addr,
3881         };
3882         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3883 }
3884
3885 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
3886
3887 /* Remove routers and update dst entries when gateway turn into host. */
3888 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3889 {
3890         struct in6_addr *gateway = (struct in6_addr *)arg;
3891
3892         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3893             rt->fib6_nh.fib_nh_gw_family &&
3894             ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3895                 return -1;
3896         }
3897
3898         /* Further clean up cached routes in exception table.
3899          * This is needed because cached route may have a different
3900          * gateway than its 'parent' in the case of an ip redirect.
3901          */
3902         rt6_exceptions_clean_tohost(rt, gateway);
3903
3904         return 0;
3905 }
3906
/* Drop RA-router routes and cached entries via @gateway across all tables
 * of @net (the node stopped being a router).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3911
/* Argument for the fib6_ifup/fib6_ifdown cleaner callbacks: the device the
 * event is about, plus either the nexthop flags to clear (sync-up) or the
 * netdev event code (sync-down) — the two uses never overlap.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned char nh_flags;
		unsigned long event;
	};
};
3919
/* Walk the fib6 node's leaf list to find the first route that belongs to
 * @rt's ECMP group (same metric, qualifies for ECMP).  Caller must hold the
 * table lock (enforced via lockdep on the rcu_dereference_protected calls).
 * Returns NULL if no qualifying sibling is found.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3939
3940 static bool rt6_is_dead(const struct fib6_info *rt)
3941 {
3942         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3943             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3944              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3945                 return true;
3946
3947         return false;
3948 }
3949
3950 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3951 {
3952         struct fib6_info *iter;
3953         int total = 0;
3954
3955         if (!rt6_is_dead(rt))
3956                 total += rt->fib6_nh.fib_nh_weight;
3957
3958         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3959                 if (!rt6_is_dead(iter))
3960                         total += iter->fib6_nh.fib_nh_weight;
3961         }
3962
3963         return total;
3964 }
3965
/* Assign this nexthop's hash-range upper bound from the running weight sum.
 * @weight accumulates across siblings, so bounds are monotonically
 * increasing; the value is a 31-bit fixed-point fraction of @total.
 * A dead nexthop gets -1, which excludes it from selection.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.fib_nh_weight;
		/* bound = round(weight/total * 2^31) - 1 */
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
}
3977
3978 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3979 {
3980         struct fib6_info *iter;
3981         int weight = 0;
3982
3983         rt6_upper_bound_set(rt, &weight, total);
3984
3985         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3986                 rt6_upper_bound_set(iter, &weight, total);
3987 }
3988
3989 void rt6_multipath_rebalance(struct fib6_info *rt)
3990 {
3991         struct fib6_info *first;
3992         int total;
3993
3994         /* In case the entire multipath route was marked for flushing,
3995          * then there is no need to rebalance upon the removal of every
3996          * sibling route.
3997          */
3998         if (!rt->fib6_nsiblings || rt->should_flush)
3999                 return;
4000
4001         /* During lookup routes are evaluated in order, so we need to
4002          * make sure upper bounds are assigned from the first sibling
4003          * onwards.
4004          */
4005         first = rt6_multipath_first_sibling(rt);
4006         if (WARN_ON_ONCE(!first))
4007                 return;
4008
4009         total = rt6_multipath_total_weight(first);
4010         rt6_multipath_upper_bound_set(first, total);
4011 }
4012
4013 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4014 {
4015         const struct arg_netdev_event *arg = p_arg;
4016         struct net *net = dev_net(arg->dev);
4017
4018         if (rt != net->ipv6.fib6_null_entry &&
4019             rt->fib6_nh.fib_nh_dev == arg->dev) {
4020                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4021                 fib6_update_sernum_upto_root(net, rt);
4022                 rt6_multipath_rebalance(rt);
4023         }
4024
4025         return 0;
4026 }
4027
4028 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4029 {
4030         struct arg_netdev_event arg = {
4031                 .dev = dev,
4032                 {
4033                         .nh_flags = nh_flags,
4034                 },
4035         };
4036
4037         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4038                 arg.nh_flags |= RTNH_F_LINKDOWN;
4039
4040         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4041 }
4042
4043 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4044                                    const struct net_device *dev)
4045 {
4046         struct fib6_info *iter;
4047
4048         if (rt->fib6_nh.fib_nh_dev == dev)
4049                 return true;
4050         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4051                 if (iter->fib6_nh.fib_nh_dev == dev)
4052                         return true;
4053
4054         return false;
4055 }
4056
4057 static void rt6_multipath_flush(struct fib6_info *rt)
4058 {
4059         struct fib6_info *iter;
4060
4061         rt->should_flush = 1;
4062         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4063                 iter->should_flush = 1;
4064 }
4065
4066 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4067                                              const struct net_device *down_dev)
4068 {
4069         struct fib6_info *iter;
4070         unsigned int dead = 0;
4071
4072         if (rt->fib6_nh.fib_nh_dev == down_dev ||
4073             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4074                 dead++;
4075         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4076                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4077                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4078                         dead++;
4079
4080         return dead;
4081 }
4082
4083 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4084                                        const struct net_device *dev,
4085                                        unsigned char nh_flags)
4086 {
4087         struct fib6_info *iter;
4088
4089         if (rt->fib6_nh.fib_nh_dev == dev)
4090                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4091         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4092                 if (iter->fib6_nh.fib_nh_dev == dev)
4093                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4094 }
4095
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device-down events.
 * Return values drive the walker: 0 keeps the route, -1 asks the cleaner
 * to delete it.
 * NOTE(review): -2 appears to tell the walker to skip the remaining
 * siblings of this multipath route (they were handled here as a group) —
 * confirm against fib6_clean_node().
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away: drop every route pointing at it */
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* if every nexthop in the group is now dead, flush
			 * the whole multipath route; otherwise mark the
			 * affected nexthops and rebalance the hash ranges
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: only mark LINKDOWN, never delete */
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4139
4140 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4141 {
4142         struct arg_netdev_event arg = {
4143                 .dev = dev,
4144                 {
4145                         .event = event,
4146                 },
4147         };
4148         struct net *net = dev_net(dev);
4149
4150         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4151                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4152         else
4153                 fib6_clean_all(net, fib6_ifdown, &arg);
4154 }
4155
4156 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4157 {
4158         rt6_sync_down_dev(dev, event);
4159         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4160         neigh_ifdown(&nd_tbl, dev);
4161 }
4162
/* Argument for the MTU-change FIB walk */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new MTU value */
};
4167
/* fib6_clean_all() callback: refresh the RTAX_MTU metric of routes on the
 * changed device (unless the metric is locked) and propagate the new MTU
 * to cached exception routes.  Always returns 0 (keep the route).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink always; grow only if the route tracked the old
		 * device MTU (i.e. had no smaller discovered PMTU)
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4202
4203 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4204 {
4205         struct rt6_mtu_change_arg arg = {
4206                 .dev = dev,
4207                 .mtu = mtu,
4208         };
4209
4210         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4211 }
4212
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE requests;
 * attributes not listed here are accepted without type/length checking by
 * the deprecated-parse path.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4232
/* rtm_to_fib6_config - translate an RTM_{NEW,DEL}ROUTE netlink request
 * into a fib6_config.
 * @skb: request skb; source of the sender portid and the namespace
 * @nlh: netlink message header to parse
 * @cfg: output; fully re-initialized here before any attribute is read
 * @extack: extended ack used to report attribute errors
 *
 * Returns 0 on success, a negative errno on parse/validation failure.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	/* default error for the unlabelled "goto errout" paths below */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	/* Compound-literal assignment zeroes every field not named here. */
	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* Error route types all map onto RTF_REJECT; fc_type keeps the
	 * specific flavor (unreachable/blackhole/prohibit/throw).
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	/* RTA_VIA expresses a cross-family nexthop; not valid for IPv6. */
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* Only prefix-length worth of bytes is required/copied. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	/* fc_mx/fc_mp point into the request skb; consumers must finish
	 * with them before the message is released.
	 */
	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the (8-bit) rtm_table header field. */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* Out-of-range preferences are coerced to MEDIUM, not
		 * rejected.
		 */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* Infinite timeouts simply leave RTF_EXPIRES unset. */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4363
/* Per-nexthop bookkeeping used while building a multipath route:
 * keeps the not-yet-inserted fib6_info together with the config
 * needed to delete it again if a later nexthop fails.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route for this nexthop */
	struct fib6_config r_cfg;	/* config to re-create/delete it */
	struct list_head next;		/* link in caller's rt6_nh_list */
};
4369
4370 static int ip6_route_info_append(struct net *net,
4371                                  struct list_head *rt6_nh_list,
4372                                  struct fib6_info *rt,
4373                                  struct fib6_config *r_cfg)
4374 {
4375         struct rt6_nh *nh;
4376         int err = -EEXIST;
4377
4378         list_for_each_entry(nh, rt6_nh_list, next) {
4379                 /* check if fib6_info already exists */
4380                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4381                         return err;
4382         }
4383
4384         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4385         if (!nh)
4386                 return -ENOMEM;
4387         nh->fib6_info = rt;
4388         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4389         list_add_tail(&nh->next, rt6_nh_list);
4390
4391         return 0;
4392 }
4393
4394 static void ip6_route_mpath_notify(struct fib6_info *rt,
4395                                    struct fib6_info *rt_last,
4396                                    struct nl_info *info,
4397                                    __u16 nlflags)
4398 {
4399         /* if this is an APPEND route, then rt points to the first route
4400          * inserted and rt_last points to last route inserted. Userspace
4401          * wants a consistent dump of the route which starts at the first
4402          * nexthop. Since sibling routes are always added at the end of
4403          * the list, find the first sibling of the last route appended
4404          */
4405         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4406                 rt = list_first_entry(&rt_last->fib6_siblings,
4407                                       struct fib6_info,
4408                                       fib6_siblings);
4409         }
4410
4411         if (rt)
4412                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4413 }
4414
/* ip6_route_multipath_add - install a route with multiple nexthops.
 *
 * Two phases: (1) parse each rtnexthop in cfg->fc_mp and build a list
 * of fib6_info structs; (2) insert them one by one, suppressing the
 * per-route notification and sending a single one at the end.  On a
 * mid-insert failure, already-inserted nexthops are notified and then
 * deleted so userspace sees a coherent add/delete sequence.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the route-level config, then override with
		 * per-nexthop attributes
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops is a zero-based weight on the wire */
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		/* drops only the list's reference; on success the FIB
		 * tree still holds one, so rt_last/rt_notif below remain
		 * valid — NOTE(review): relies on __ip6_ins_rt taking its
		 * own reference, confirm against fib6_add()
		 */
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): unconditional nlh dereference — all current
		 * callers arrive via netlink so nlh is set, but verify if a
		 * new caller is added
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4562
4563 static int ip6_route_multipath_del(struct fib6_config *cfg,
4564                                    struct netlink_ext_ack *extack)
4565 {
4566         struct fib6_config r_cfg;
4567         struct rtnexthop *rtnh;
4568         int remaining;
4569         int attrlen;
4570         int err = 1, last_err = 0;
4571
4572         remaining = cfg->fc_mp_len;
4573         rtnh = (struct rtnexthop *)cfg->fc_mp;
4574
4575         /* Parse a Multipath Entry */
4576         while (rtnh_ok(rtnh, remaining)) {
4577                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4578                 if (rtnh->rtnh_ifindex)
4579                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4580
4581                 attrlen = rtnh_attrlen(rtnh);
4582                 if (attrlen > 0) {
4583                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4584
4585                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4586                         if (nla) {
4587                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4588                                 r_cfg.fc_flags |= RTF_GATEWAY;
4589                         }
4590                 }
4591                 err = ip6_route_del(&r_cfg, extack);
4592                 if (err)
4593                         last_err = err;
4594
4595                 rtnh = rtnh_next(rtnh, &remaining);
4596         }
4597
4598         return last_err;
4599 }
4600
4601 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4602                               struct netlink_ext_ack *extack)
4603 {
4604         struct fib6_config cfg;
4605         int err;
4606
4607         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4608         if (err < 0)
4609                 return err;
4610
4611         if (cfg.fc_mp)
4612                 return ip6_route_multipath_del(&cfg, extack);
4613         else {
4614                 cfg.fc_delete_all_nh = 1;
4615                 return ip6_route_del(&cfg, extack);
4616         }
4617 }
4618
4619 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4620                               struct netlink_ext_ack *extack)
4621 {
4622         struct fib6_config cfg;
4623         int err;
4624
4625         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4626         if (err < 0)
4627                 return err;
4628
4629         if (cfg.fc_metric == 0)
4630                 cfg.fc_metric = IP6_RT_PRIO_USER;
4631
4632         if (cfg.fc_mp)
4633                 return ip6_route_multipath_add(&cfg, extack);
4634         else
4635                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4636 }
4637
4638 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4639 {
4640         int nexthop_len = 0;
4641
4642         if (rt->fib6_nsiblings) {
4643                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4644                             + NLA_ALIGN(sizeof(struct rtnexthop))
4645                             + nla_total_size(16) /* RTA_GATEWAY */
4646                             + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4647
4648                 nexthop_len *= rt->fib6_nsiblings;
4649         }
4650
4651         return NLMSG_ALIGN(sizeof(struct rtmsg))
4652                + nla_total_size(16) /* RTA_SRC */
4653                + nla_total_size(16) /* RTA_DST */
4654                + nla_total_size(16) /* RTA_GATEWAY */
4655                + nla_total_size(16) /* RTA_PREFSRC */
4656                + nla_total_size(4) /* RTA_TABLE */
4657                + nla_total_size(4) /* RTA_IIF */
4658                + nla_total_size(4) /* RTA_OIF */
4659                + nla_total_size(4) /* RTA_PRIORITY */
4660                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4661                + nla_total_size(sizeof(struct rta_cacheinfo))
4662                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4663                + nla_total_size(1) /* RTA_PREF */
4664                + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4665                + nexthop_len;
4666 }
4667
/* rt6_fill_node - serialize a route into an RTM message on @skb.
 *
 * @rt is always the FIB entry; @dst, when non-NULL, is the cached
 * rt6_info clone whose keys/flags/metrics take precedence (used by
 * getroute without RTM_F_FIB_MATCH).  @dest/@src, when given, force
 * host-length (/128) addresses into the message.  @iif selects input-
 * interface reporting (and the multicast ip6mr path).
 *
 * Returns 0 on success, -EMSGSIZE if the skb ran out of room (the
 * partial message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the clone's keys/flags when one was supplied */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* the 8-bit header field can't hold ids >= 256 */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		/* explicit destination is always a host route */
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute
		 * cache; ip6mr_get_route fills the message itself
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		unsigned char nh_flags = 0;

		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		/* report remaining lifetime relative to now */
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4825
4826 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4827                                const struct net_device *dev)
4828 {
4829         if (f6i->fib6_nh.fib_nh_dev == dev)
4830                 return true;
4831
4832         if (f6i->fib6_nsiblings) {
4833                 struct fib6_info *sibling, *next_sibling;
4834
4835                 list_for_each_entry_safe(sibling, next_sibling,
4836                                          &f6i->fib6_siblings, fib6_siblings) {
4837                         if (sibling->fib6_nh.fib_nh_dev == dev)
4838                                 return true;
4839                 }
4840         }
4841
4842         return false;
4843 }
4844
4845 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4846 {
4847         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4848         struct fib_dump_filter *filter = &arg->filter;
4849         unsigned int flags = NLM_F_MULTI;
4850         struct net *net = arg->net;
4851
4852         if (rt == net->ipv6.fib6_null_entry)
4853                 return 0;
4854
4855         if ((filter->flags & RTM_F_PREFIX) &&
4856             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4857                 /* success since this is not a prefix route */
4858                 return 1;
4859         }
4860         if (filter->filter_set) {
4861                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4862                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4863                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4864                         return 1;
4865                 }
4866                 flags |= NLM_F_DUMP_FILTERED;
4867         }
4868
4869         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4870                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4871                              arg->cb->nlh->nlmsg_seq, flags);
4872 }
4873
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 *
 * Sockets without strict checking get the legacy lenient parse; strict
 * sockets must leave unused header fields zero, use only RTM_F_FIB_MATCH,
 * send /128 (or zero) prefix lengths, and include only the attributes
 * getroute understands.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* legacy sockets: accept whatever the lenient parser allows */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* an address attribute implies the matching /128 prefix length */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* whitelist the attributes getroute actually consumes */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4940
/* RTM_GETROUTE handler: build a flow from the request attributes,
 * resolve it (input path when RTA_IIF is given, output path otherwise),
 * and unicast the matching route back to the requester.  With
 * RTM_F_FIB_MATCH the FIB entry is reported instead of the dst clone.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	/* without an explicit UID: forwarded lookups (iif set) use
	 * INVALID_UID, locally-originated ones the caller's uid
	 */
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		/* RCU protects the device lookup and the input-path
		 * route resolution
		 */
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the dst reference; kfree_skb() releases it */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5084
5085 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5086                      unsigned int nlm_flags)
5087 {
5088         struct sk_buff *skb;
5089         struct net *net = info->nl_net;
5090         u32 seq;
5091         int err;
5092
5093         err = -ENOBUFS;
5094         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5095
5096         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5097         if (!skb)
5098                 goto errout;
5099
5100         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5101                             event, info->portid, seq, nlm_flags);
5102         if (err < 0) {
5103                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5104                 WARN_ON(err == -EMSGSIZE);
5105                 kfree_skb(skb);
5106                 goto errout;
5107         }
5108         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5109                     info->nlh, gfp_any());
5110         return;
5111 errout:
5112         if (err < 0)
5113                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5114 }
5115
5116 static int ip6_route_dev_notify(struct notifier_block *this,
5117                                 unsigned long event, void *ptr)
5118 {
5119         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5120         struct net *net = dev_net(dev);
5121
5122         if (!(dev->flags & IFF_LOOPBACK))
5123                 return NOTIFY_OK;
5124
5125         if (event == NETDEV_REGISTER) {
5126                 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5127                 net->ipv6.ip6_null_entry->dst.dev = dev;
5128                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5129 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5130                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5131                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5132                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5133                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5134 #endif
5135          } else if (event == NETDEV_UNREGISTER &&
5136                     dev->reg_state != NETREG_UNREGISTERED) {
5137                 /* NETDEV_UNREGISTER could be fired for multiple times by
5138                  * netdev_wait_allrefs(). Make sure we only call this once.
5139                  */
5140                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5142                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5143                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5144 #endif
5145         }
5146
5147         return NOTIFY_OK;
5148 }
5149
5150 /*
5151  *      /proc
5152  */
5153
5154 #ifdef CONFIG_PROC_FS
/* /proc/net/rt6_stats: dump per-netns FIB6 statistics as seven
 * space-separated hex fields: fib nodes, route nodes, rt allocs,
 * route entries, cached routes, live dst entries, discarded routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
5169 #endif  /* CONFIG_PROC_FS */
5170
5171 #ifdef CONFIG_SYSCTL
5172
5173 static
5174 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5175                               void __user *buffer, size_t *lenp, loff_t *ppos)
5176 {
5177         struct net *net;
5178         int delay;
5179         int ret;
5180         if (!write)
5181                 return -EINVAL;
5182
5183         net = (struct net *)ctl->extra1;
5184         delay = net->ipv6.sysctl.flush_delay;
5185         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5186         if (ret)
5187                 return ret;
5188
5189         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5190         return 0;
5191 }
5192
/* Range bounds (extra1/extra2) for the skip_notify_on_dev_down sysctl
 * entry below, restricting it to 0 or 1.
 */
static int zero;
static int one = 1;
5195
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers here reference init_net; ipv6_route_sysctl_init() kmemdup()s
 * this template and rewrites each entry's .data by index to point at
 * the new namespace, so the entry order below must stay in sync with
 * the table[N] assignments there.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};
5278
5279 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5280 {
5281         struct ctl_table *table;
5282
5283         table = kmemdup(ipv6_route_table_template,
5284                         sizeof(ipv6_route_table_template),
5285                         GFP_KERNEL);
5286
5287         if (table) {
5288                 table[0].data = &net->ipv6.sysctl.flush_delay;
5289                 table[0].extra1 = net;
5290                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5291                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5292                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5293                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5294                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5295                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5296                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5297                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5298                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5299                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5300
5301                 /* Don't export sysctls to unprivileged users */
5302                 if (net->user_ns != &init_user_ns)
5303                         table[0].procname = NULL;
5304         }
5305
5306         return table;
5307 }
5308 #endif
5309
/* Per-netns routing init: copy the dst_ops template, duplicate the
 * special null (and, with multiple tables, prohibit/blackhole) route
 * entries for this namespace, and seed the sysctl defaults.  On
 * failure, unwinds the allocations in reverse order via gotos and
 * returns -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* FIB-level "no route" sentinel */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	/* dst-level "no route" sentinel, tied to this netns' dst_ops */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default GC/limit tunables; adjustable via net.ipv6.route.* */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5385
5386 static void __net_exit ip6_route_net_exit(struct net *net)
5387 {
5388         kfree(net->ipv6.fib6_null_entry);
5389         kfree(net->ipv6.ip6_null_entry);
5390 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5391         kfree(net->ipv6.ip6_prohibit_entry);
5392         kfree(net->ipv6.ip6_blk_hole_entry);
5393 #endif
5394         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5395 }
5396
5397 static int __net_init ip6_route_net_init_late(struct net *net)
5398 {
5399 #ifdef CONFIG_PROC_FS
5400         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5401                         sizeof(struct ipv6_route_iter));
5402         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5403                         rt6_stats_seq_show, NULL);
5404 #endif
5405         return 0;
5406 }
5407
/* Late per-netns teardown: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5415
/* Per-netns ops for the core routing state (dst_ops, special route
 * entries, sysctl defaults).
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5420
5421 static int __net_init ipv6_inetpeer_init(struct net *net)
5422 {
5423         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5424
5425         if (!bp)
5426                 return -ENOMEM;
5427         inet_peer_base_init(bp);
5428         net->ipv6.peers = bp;
5429         return 0;
5430 }
5431
5432 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5433 {
5434         struct inet_peer_base *bp = net->ipv6.peers;
5435
5436         net->ipv6.peers = NULL;
5437         inetpeer_invalidate_tree(bp);
5438         kfree(bp);
5439 }
5440
/* Per-netns ops for the inetpeer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

/* Per-netns ops for the late (proc entry) init/teardown. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

/* Priority is set below addrconf's, so (notifier chains call higher
 * priorities first) ip6_route_dev_notify() runs after addrconf's
 * handler for the same event.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5455
5456 void __init ip6_route_init_special_entries(void)
5457 {
5458         /* Registering of the loopback is done before this portion of code,
5459          * the loopback reference in rt6_info will not be taken, do it
5460          * manually for init_net */
5461         init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5462         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5463         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5464   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5465         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5466         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5467         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5468         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5469   #endif
5470 }
5471
/* Module init for the IPv6 routing subsystem: create the rt6_info slab
 * cache and blackhole dst counters, register the pernet subsystems,
 * FIB, xfrm and policy-rule layers, the rtnetlink route handlers, the
 * netdev notifier, and initialize the per-CPU uncached-route lists.
 * On any failure, everything registered so far is unwound in reverse
 * order via the goto chain below.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts share the regular rt6_info slab cache */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers for route add/delete/query */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5563
/* Module exit: tear down everything ip6_route_init() set up, in the
 * reverse of registration order.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}