ipv6: Always allocate pcpu memory in a fib6_nh
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/rtnh.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/*
 * Result of checking a nexthop against the neighbour cache.  Negative
 * values are failures; rt6_score_route() returns them to its caller in
 * place of a score.
 */
enum rt6_nud_state {
        /* find_match() drops the nexthop */
        RT6_NUD_FAIL_HARD       = -3,
        /* neighbour entry is in NUD_FAILED (CONFIG_IPV6_ROUTER_PREF only) */
        RT6_NUD_FAIL_PROBE      = -2,
        /* find_match() scores the nexthop 0 and asks for round-robin */
        RT6_NUD_FAIL_DO_RR      = -1,
        RT6_NUD_SUCCEED         = 1,
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106                            int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109                          struct fib6_info *rt, struct dst_entry *dst,
110                          struct in6_addr *dest, struct in6_addr *src,
111                          int iif, int type, u32 portid, u32 seq,
112                          unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114                                            const struct in6_addr *daddr,
115                                            const struct in6_addr *saddr);
116
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev,
122                                            unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124                                            const struct in6_addr *prefix, int prefixlen,
125                                            const struct in6_addr *gwaddr,
126                                            struct net_device *dev);
127 #endif
128
/*
 * A lock-protected list of rt6_info entries, linked through their
 * rt6i_uncached member.  One instance exists per CPU (rt6_uncached_list).
 */
struct uncached_list {
        /* taken with spin_lock_bh() around every access to head */
        spinlock_t              lock;
        /* rt6_info entries, chained via rt6_info::rt6i_uncached */
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139
140         rt->rt6i_uncached_list = ul;
141
142         spin_lock_bh(&ul->lock);
143         list_add_tail(&rt->rt6i_uncached, &ul->head);
144         spin_unlock_bh(&ul->lock);
145 }
146
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149         if (!list_empty(&rt->rt6i_uncached)) {
150                 struct uncached_list *ul = rt->rt6i_uncached_list;
151                 struct net *net = dev_net(rt->dst.dev);
152
153                 spin_lock_bh(&ul->lock);
154                 list_del(&rt->rt6i_uncached);
155                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156                 spin_unlock_bh(&ul->lock);
157         }
158 }
159
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162         struct net_device *loopback_dev = net->loopback_dev;
163         int cpu;
164
165         if (dev == loopback_dev)
166                 return;
167
168         for_each_possible_cpu(cpu) {
169                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170                 struct rt6_info *rt;
171
172                 spin_lock_bh(&ul->lock);
173                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174                         struct inet6_dev *rt_idev = rt->rt6i_idev;
175                         struct net_device *rt_dev = rt->dst.dev;
176
177                         if (rt_idev->dev == dev) {
178                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
179                                 in6_dev_put(rt_idev);
180                         }
181
182                         if (rt_dev == dev) {
183                                 rt->dst.dev = loopback_dev;
184                                 dev_hold(rt->dst.dev);
185                                 dev_put(rt_dev);
186                         }
187                 }
188                 spin_unlock_bh(&ul->lock);
189         }
190 }
191
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193                                              struct sk_buff *skb,
194                                              const void *daddr)
195 {
196         if (!ipv6_addr_any(p))
197                 return (const void *) p;
198         else if (skb)
199                 return &ipv6_hdr(skb)->daddr;
200         return daddr;
201 }
202
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204                                    struct net_device *dev,
205                                    struct sk_buff *skb,
206                                    const void *daddr)
207 {
208         struct neighbour *n;
209
210         daddr = choose_neigh_daddr(gw, skb, daddr);
211         n = __ipv6_neigh_lookup(dev, daddr);
212         if (n)
213                 return n;
214
215         n = neigh_create(&nd_tbl, daddr, dev);
216         return IS_ERR(n) ? NULL : n;
217 }
218
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220                                               struct sk_buff *skb,
221                                               const void *daddr)
222 {
223         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224
225         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230         struct net_device *dev = dst->dev;
231         struct rt6_info *rt = (struct rt6_info *)dst;
232
233         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234         if (!daddr)
235                 return;
236         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237                 return;
238         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239                 return;
240         __ipv6_confirm_neigh(dev, daddr);
241 }
242
243 static struct dst_ops ip6_dst_ops_template = {
244         .family                 =       AF_INET6,
245         .gc                     =       ip6_dst_gc,
246         .gc_thresh              =       1024,
247         .check                  =       ip6_dst_check,
248         .default_advmss         =       ip6_default_advmss,
249         .mtu                    =       ip6_mtu,
250         .cow_metrics            =       dst_cow_metrics_generic,
251         .destroy                =       ip6_dst_destroy,
252         .ifdown                 =       ip6_dst_ifdown,
253         .negative_advice        =       ip6_negative_advice,
254         .link_failure           =       ip6_link_failure,
255         .update_pmtu            =       ip6_rt_update_pmtu,
256         .redirect               =       rt6_do_redirect,
257         .local_out              =       __ip6_local_out,
258         .neigh_lookup           =       ip6_dst_neigh_lookup,
259         .confirm_neigh          =       ip6_confirm_neigh,
260 };
261
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265
266         return mtu ? : dst->dev->mtu;
267 }
268
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270                                          struct sk_buff *skb, u32 mtu)
271 {
272 }
273
/* dst_ops->redirect for blackhole dsts: a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                      struct sk_buff *skb)
{
        /* nothing to do */
}
278
279 static struct dst_ops ip6_dst_blackhole_ops = {
280         .family                 =       AF_INET6,
281         .destroy                =       ip6_dst_destroy,
282         .check                  =       ip6_dst_check,
283         .mtu                    =       ip6_blackhole_mtu,
284         .default_advmss         =       ip6_default_advmss,
285         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
286         .redirect               =       ip6_rt_blackhole_redirect,
287         .cow_metrics            =       dst_cow_metrics_generic,
288         .neigh_lookup           =       ip6_dst_neigh_lookup,
289 };
290
/*
 * Read-only metrics array of RTAX_MAX entries.  Only the hop-limit slot is
 * named, and it is set to 0, so every entry ends up zero.
 * NOTE(review): no user is visible in this part of the file; presumably
 * it backs the template route entries -- confirm against the init code.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};
294
295 static const struct fib6_info fib6_null_entry_template = {
296         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
297         .fib6_protocol  = RTPROT_KERNEL,
298         .fib6_metric    = ~(u32)0,
299         .fib6_ref       = REFCOUNT_INIT(1),
300         .fib6_type      = RTN_UNREACHABLE,
301         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
302 };
303
304 static const struct rt6_info ip6_null_entry_template = {
305         .dst = {
306                 .__refcnt       = ATOMIC_INIT(1),
307                 .__use          = 1,
308                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
309                 .error          = -ENETUNREACH,
310                 .input          = ip6_pkt_discard,
311                 .output         = ip6_pkt_discard_out,
312         },
313         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
314 };
315
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317
318 static const struct rt6_info ip6_prohibit_entry_template = {
319         .dst = {
320                 .__refcnt       = ATOMIC_INIT(1),
321                 .__use          = 1,
322                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
323                 .error          = -EACCES,
324                 .input          = ip6_pkt_prohibit,
325                 .output         = ip6_pkt_prohibit_out,
326         },
327         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
328 };
329
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340 };
341
342 #endif
343
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346         struct dst_entry *dst = &rt->dst;
347
348         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349         INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354                                int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         1, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt) {
360                 rt6_info_init(rt);
361                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362         }
363
364         return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370         struct rt6_info *rt = (struct rt6_info *)dst;
371         struct fib6_info *from;
372         struct inet6_dev *idev;
373
374         ip_dst_metrics_put(dst);
375         rt6_uncached_list_del(rt);
376
377         idev = rt->rt6i_idev;
378         if (idev) {
379                 rt->rt6i_idev = NULL;
380                 in6_dev_put(idev);
381         }
382
383         from = xchg((__force struct fib6_info **)&rt->from, NULL);
384         fib6_info_release(from);
385 }
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
428 void fib6_select_path(const struct net *net, struct fib6_result *res,
429                       struct flowi6 *fl6, int oif, bool have_oif_match,
430                       const struct sk_buff *skb, int strict)
431 {
432         struct fib6_info *sibling, *next_sibling;
433         struct fib6_info *match = res->f6i;
434
435         if (!match->fib6_nsiblings || have_oif_match)
436                 goto out;
437
438         /* We might have already computed the hash for ICMPv6 errors. In such
439          * case it will always be non-zero. Otherwise now is the time to do it.
440          */
441         if (!fl6->mp_hash)
442                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
443
444         if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
445                 goto out;
446
447         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
448                                  fib6_siblings) {
449                 const struct fib6_nh *nh = sibling->fib6_nh;
450                 int nh_upper_bound;
451
452                 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
453                 if (fl6->mp_hash > nh_upper_bound)
454                         continue;
455                 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
456                         break;
457                 match = sibling;
458                 break;
459         }
460
461 out:
462         res->f6i = match;
463         res->nh = match->fib6_nh;
464 }
465
466 /*
467  *      Route lookup. rcu_read_lock() should be held.
468  */
469
470 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
471                                const struct in6_addr *saddr, int oif, int flags)
472 {
473         const struct net_device *dev;
474
475         if (nh->fib_nh_flags & RTNH_F_DEAD)
476                 return false;
477
478         dev = nh->fib_nh_dev;
479         if (oif) {
480                 if (dev->ifindex == oif)
481                         return true;
482         } else {
483                 if (ipv6_chk_addr(net, saddr, dev,
484                                   flags & RT6_LOOKUP_F_IFACE))
485                         return true;
486         }
487
488         return false;
489 }
490
491 static void rt6_device_match(struct net *net, struct fib6_result *res,
492                              const struct in6_addr *saddr, int oif, int flags)
493 {
494         struct fib6_info *f6i = res->f6i;
495         struct fib6_info *spf6i;
496         struct fib6_nh *nh;
497
498         if (!oif && ipv6_addr_any(saddr)) {
499                 nh = f6i->fib6_nh;
500                 if (!(nh->fib_nh_flags & RTNH_F_DEAD))
501                         goto out;
502         }
503
504         for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
505                 nh = spf6i->fib6_nh;
506                 if (__rt6_device_match(net, nh, saddr, oif, flags)) {
507                         res->f6i = spf6i;
508                         goto out;
509                 }
510         }
511
512         if (oif && flags & RT6_LOOKUP_F_IFACE) {
513                 res->f6i = net->ipv6.fib6_null_entry;
514                 nh = res->f6i->fib6_nh;
515                 goto out;
516         }
517
518         nh = f6i->fib6_nh;
519         if (nh->fib_nh_flags & RTNH_F_DEAD) {
520                 res->f6i = net->ipv6.fib6_null_entry;
521                 nh = res->f6i->fib6_nh;
522         }
523 out:
524         res->nh = nh;
525         res->fib6_type = res->f6i->fib6_type;
526         res->fib6_flags = res->f6i->fib6_flags;
527 }
528
529 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router probe request.  Allocated by rt6_probe() and freed by
 * rt6_probe_deferred() once the solicitation has been sent.
 */
struct __rt6_probe_work {
        /* work item; its handler is rt6_probe_deferred() */
        struct work_struct work;
        /* address to solicit (copy of the nexthop gateway) */
        struct in6_addr target;
        /* device to send on; held by rt6_probe(), put by the handler */
        struct net_device *dev;
};
535
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538         struct in6_addr mcaddr;
539         struct __rt6_probe_work *work =
540                 container_of(w, struct __rt6_probe_work, work);
541
542         addrconf_addr_solict_mult(&work->target, &mcaddr);
543         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
544         dev_put(work->dev);
545         kfree(work);
546 }
547
548 static void rt6_probe(struct fib6_nh *fib6_nh)
549 {
550         struct __rt6_probe_work *work = NULL;
551         const struct in6_addr *nh_gw;
552         struct neighbour *neigh;
553         struct net_device *dev;
554         struct inet6_dev *idev;
555
556         /*
557          * Okay, this does not seem to be appropriate
558          * for now, however, we need to check if it
559          * is really so; aka Router Reachability Probing.
560          *
561          * Router Reachability Probe MUST be rate-limited
562          * to no more than one per minute.
563          */
564         if (fib6_nh->fib_nh_gw_family)
565                 return;
566
567         nh_gw = &fib6_nh->fib_nh_gw6;
568         dev = fib6_nh->fib_nh_dev;
569         rcu_read_lock_bh();
570         idev = __in6_dev_get(dev);
571         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
572         if (neigh) {
573                 if (neigh->nud_state & NUD_VALID)
574                         goto out;
575
576                 write_lock(&neigh->lock);
577                 if (!(neigh->nud_state & NUD_VALID) &&
578                     time_after(jiffies,
579                                neigh->updated + idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else if (time_after(jiffies, fib6_nh->last_probe +
586                                        idev->cnf.rtr_probe_interval)) {
587                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588         }
589
590         if (work) {
591                 fib6_nh->last_probe = jiffies;
592                 INIT_WORK(&work->work, rt6_probe_deferred);
593                 work->target = *nh_gw;
594                 dev_hold(dev);
595                 work->dev = dev;
596                 schedule_work(&work->work);
597         }
598
599 out:
600         rcu_read_unlock_bh();
601 }
602 #else
/* Without CONFIG_IPV6_ROUTER_PREF, router probing is a no-op. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
        /* nothing to do */
}
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
611 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
612 {
613         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
614         struct neighbour *neigh;
615
616         rcu_read_lock_bh();
617         neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
618                                           &fib6_nh->fib_nh_gw6);
619         if (neigh) {
620                 read_lock(&neigh->lock);
621                 if (neigh->nud_state & NUD_VALID)
622                         ret = RT6_NUD_SUCCEED;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624                 else if (!(neigh->nud_state & NUD_FAILED))
625                         ret = RT6_NUD_SUCCEED;
626                 else
627                         ret = RT6_NUD_FAIL_PROBE;
628 #endif
629                 read_unlock(&neigh->lock);
630         } else {
631                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
632                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
633         }
634         rcu_read_unlock_bh();
635
636         return ret;
637 }
638
639 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
640                            int strict)
641 {
642         int m = 0;
643
644         if (!oif || nh->fib_nh_dev->ifindex == oif)
645                 m = 2;
646
647         if (!m && (strict & RT6_LOOKUP_F_IFACE))
648                 return RT6_NUD_FAIL_HARD;
649 #ifdef CONFIG_IPV6_ROUTER_PREF
650         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
651 #endif
652         if ((strict & RT6_LOOKUP_F_REACHABLE) &&
653             !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
654                 int n = rt6_check_neigh(nh);
655                 if (n < 0)
656                         return n;
657         }
658         return m;
659 }
660
661 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
662                        int oif, int strict, int *mpri, bool *do_rr)
663 {
664         bool match_do_rr = false;
665         bool rc = false;
666         int m;
667
668         if (nh->fib_nh_flags & RTNH_F_DEAD)
669                 goto out;
670
671         if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
672             nh->fib_nh_flags & RTNH_F_LINKDOWN &&
673             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
674                 goto out;
675
676         m = rt6_score_route(nh, fib6_flags, oif, strict);
677         if (m == RT6_NUD_FAIL_DO_RR) {
678                 match_do_rr = true;
679                 m = 0; /* lowest valid score */
680         } else if (m == RT6_NUD_FAIL_HARD) {
681                 goto out;
682         }
683
684         if (strict & RT6_LOOKUP_F_REACHABLE)
685                 rt6_probe(nh);
686
687         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
688         if (m > *mpri) {
689                 *do_rr = match_do_rr;
690                 *mpri = m;
691                 rc = true;
692         }
693 out:
694         return rc;
695 }
696
697 static void __find_rr_leaf(struct fib6_info *f6i_start,
698                            struct fib6_info *nomatch, u32 metric,
699                            struct fib6_result *res, struct fib6_info **cont,
700                            int oif, int strict, bool *do_rr, int *mpri)
701 {
702         struct fib6_info *f6i;
703
704         for (f6i = f6i_start;
705              f6i && f6i != nomatch;
706              f6i = rcu_dereference(f6i->fib6_next)) {
707                 struct fib6_nh *nh;
708
709                 if (cont && f6i->fib6_metric != metric) {
710                         *cont = f6i;
711                         return;
712                 }
713
714                 if (fib6_check_expired(f6i))
715                         continue;
716
717                 nh = f6i->fib6_nh;
718                 if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
719                         res->f6i = f6i;
720                         res->nh = nh;
721                         res->fib6_flags = f6i->fib6_flags;
722                         res->fib6_type = f6i->fib6_type;
723                 }
724         }
725 }
726
727 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
728                          struct fib6_info *rr_head, int oif, int strict,
729                          bool *do_rr, struct fib6_result *res)
730 {
731         u32 metric = rr_head->fib6_metric;
732         struct fib6_info *cont = NULL;
733         int mpri = -1;
734
735         __find_rr_leaf(rr_head, NULL, metric, res, &cont,
736                        oif, strict, do_rr, &mpri);
737
738         __find_rr_leaf(leaf, rr_head, metric, res, &cont,
739                        oif, strict, do_rr, &mpri);
740
741         if (res->f6i || !cont)
742                 return;
743
744         __find_rr_leaf(cont, NULL, metric, res, NULL,
745                        oif, strict, do_rr, &mpri);
746 }
747
748 static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
749                        struct fib6_result *res, int strict)
750 {
751         struct fib6_info *leaf = rcu_dereference(fn->leaf);
752         struct fib6_info *rt0;
753         bool do_rr = false;
754         int key_plen;
755
756         /* make sure this function or its helpers sets f6i */
757         res->f6i = NULL;
758
759         if (!leaf || leaf == net->ipv6.fib6_null_entry)
760                 goto out;
761
762         rt0 = rcu_dereference(fn->rr_ptr);
763         if (!rt0)
764                 rt0 = leaf;
765
766         /* Double check to make sure fn is not an intermediate node
767          * and fn->leaf does not points to its child's leaf
768          * (This might happen if all routes under fn are deleted from
769          * the tree and fib6_repair_tree() is called on the node.)
770          */
771         key_plen = rt0->fib6_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773         if (rt0->fib6_src.plen)
774                 key_plen = rt0->fib6_src.plen;
775 #endif
776         if (fn->fn_bit != key_plen)
777                 goto out;
778
779         find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
780         if (do_rr) {
781                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
782
783                 /* no entries matched; do round-robin */
784                 if (!next || next->fib6_metric != rt0->fib6_metric)
785                         next = leaf;
786
787                 if (next != rt0) {
788                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
789                         /* make sure next is not being deleted from the tree */
790                         if (next->fib6_node)
791                                 rcu_assign_pointer(fn->rr_ptr, next);
792                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
793                 }
794         }
795
796 out:
797         if (!res->f6i) {
798                 res->f6i = net->ipv6.fib6_null_entry;
799                 res->nh = res->f6i->fib6_nh;
800                 res->fib6_flags = res->f6i->fib6_flags;
801                 res->fib6_type = res->f6i->fib6_type;
802         }
803 }
804
805 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
806 {
807         return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
808                res->nh->fib_nh_gw_family;
809 }
810
811 #ifdef CONFIG_IPV6_ROUTE_INFO
812 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
813                   const struct in6_addr *gwaddr)
814 {
815         struct net *net = dev_net(dev);
816         struct route_info *rinfo = (struct route_info *) opt;
817         struct in6_addr prefix_buf, *prefix;
818         unsigned int pref;
819         unsigned long lifetime;
820         struct fib6_info *rt;
821
822         if (len < sizeof(struct route_info)) {
823                 return -EINVAL;
824         }
825
826         /* Sanity check for prefix_len and length */
827         if (rinfo->length > 3) {
828                 return -EINVAL;
829         } else if (rinfo->prefix_len > 128) {
830                 return -EINVAL;
831         } else if (rinfo->prefix_len > 64) {
832                 if (rinfo->length < 2) {
833                         return -EINVAL;
834                 }
835         } else if (rinfo->prefix_len > 0) {
836                 if (rinfo->length < 1) {
837                         return -EINVAL;
838                 }
839         }
840
841         pref = rinfo->route_pref;
842         if (pref == ICMPV6_ROUTER_PREF_INVALID)
843                 return -EINVAL;
844
845         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
846
847         if (rinfo->length == 3)
848                 prefix = (struct in6_addr *)rinfo->prefix;
849         else {
850                 /* this function is safe */
851                 ipv6_addr_prefix(&prefix_buf,
852                                  (struct in6_addr *)rinfo->prefix,
853                                  rinfo->prefix_len);
854                 prefix = &prefix_buf;
855         }
856
857         if (rinfo->prefix_len == 0)
858                 rt = rt6_get_dflt_router(net, gwaddr, dev);
859         else
860                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
861                                         gwaddr, dev);
862
863         if (rt && !lifetime) {
864                 ip6_del_rt(net, rt);
865                 rt = NULL;
866         }
867
868         if (!rt && lifetime)
869                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
870                                         dev, pref);
871         else if (rt)
872                 rt->fib6_flags = RTF_ROUTEINFO |
873                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
874
875         if (rt) {
876                 if (!addrconf_finite_timeout(lifetime))
877                         fib6_clean_expires(rt);
878                 else
879                         fib6_set_expires(rt, jiffies + HZ * lifetime);
880
881                 fib6_info_release(rt);
882         }
883         return 0;
884 }
885 #endif
886
887 /*
888  *      Misc support functions
889  */
890
891 /* called with rcu_lock held */
892 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
893 {
894         struct net_device *dev = res->nh->fib_nh_dev;
895
896         if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
897                 /* for copies of local routes, dst->dev needs to be the
898                  * device if it is a master device, the master device if
899                  * device is enslaved, and the loopback as the default
900                  */
901                 if (netif_is_l3_slave(dev) &&
902                     !rt6_need_strict(&res->f6i->fib6_dst.addr))
903                         dev = l3mdev_master_dev_rcu(dev);
904                 else if (!netif_is_l3_master(dev))
905                         dev = dev_net(dev)->loopback_dev;
906                 /* last case is netif_is_l3_master(dev) is true in which
907                  * case we want dev returned to be dev
908                  */
909         }
910
911         return dev;
912 }
913
/* Error code associated with each route type (RTN_*), indexed by type.
 * ip6_rt_type_to_error() reads this table; 0 means the type carries no
 * error.
 */
static const int fib6_prop[RTN_MAX + 1] = {
        [RTN_UNSPEC]    = 0,
        [RTN_UNICAST]   = 0,
        [RTN_LOCAL]     = 0,
        [RTN_BROADCAST] = 0,
        [RTN_ANYCAST]   = 0,
        [RTN_MULTICAST] = 0,
        [RTN_BLACKHOLE] = -EINVAL,
        [RTN_UNREACHABLE] = -EHOSTUNREACH,
        [RTN_PROHIBIT]  = -EACCES,
        [RTN_THROW]     = -EAGAIN,
        [RTN_NAT]       = -EINVAL,
        [RTN_XRESOLVE]  = -EINVAL,
};
928
929 static int ip6_rt_type_to_error(u8 fib6_type)
930 {
931         return fib6_prop[fib6_type];
932 }
933
934 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
935 {
936         unsigned short flags = 0;
937
938         if (rt->dst_nocount)
939                 flags |= DST_NOCOUNT;
940         if (rt->dst_nopolicy)
941                 flags |= DST_NOPOLICY;
942         if (rt->dst_host)
943                 flags |= DST_HOST;
944
945         return flags;
946 }
947
948 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
949 {
950         rt->dst.error = ip6_rt_type_to_error(fib6_type);
951
952         switch (fib6_type) {
953         case RTN_BLACKHOLE:
954                 rt->dst.output = dst_discard_out;
955                 rt->dst.input = dst_discard;
956                 break;
957         case RTN_PROHIBIT:
958                 rt->dst.output = ip6_pkt_prohibit_out;
959                 rt->dst.input = ip6_pkt_prohibit;
960                 break;
961         case RTN_THROW:
962         case RTN_UNREACHABLE:
963         default:
964                 rt->dst.output = ip6_pkt_discard_out;
965                 rt->dst.input = ip6_pkt_discard;
966                 break;
967         }
968 }
969
970 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
971 {
972         struct fib6_info *f6i = res->f6i;
973
974         if (res->fib6_flags & RTF_REJECT) {
975                 ip6_rt_init_dst_reject(rt, res->fib6_type);
976                 return;
977         }
978
979         rt->dst.error = 0;
980         rt->dst.output = ip6_output;
981
982         if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
983                 rt->dst.input = ip6_input;
984         } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
985                 rt->dst.input = ip6_mc_input;
986         } else {
987                 rt->dst.input = ip6_forward;
988         }
989
990         if (res->nh->fib_nh_lws) {
991                 rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
992                 lwtunnel_set_redirect(&rt->dst);
993         }
994
995         rt->dst.lastuse = jiffies;
996 }
997
998 /* Caller must already hold reference to @from */
999 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1000 {
1001         rt->rt6i_flags &= ~RTF_EXPIRES;
1002         rcu_assign_pointer(rt->from, from);
1003         ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1004 }
1005
1006 /* Caller must already hold reference to f6i in result */
1007 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1008 {
1009         const struct fib6_nh *nh = res->nh;
1010         const struct net_device *dev = nh->fib_nh_dev;
1011         struct fib6_info *f6i = res->f6i;
1012
1013         ip6_rt_init_dst(rt, res);
1014
1015         rt->rt6i_dst = f6i->fib6_dst;
1016         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1017         rt->rt6i_flags = res->fib6_flags;
1018         if (nh->fib_nh_gw_family) {
1019                 rt->rt6i_gateway = nh->fib_nh_gw6;
1020                 rt->rt6i_flags |= RTF_GATEWAY;
1021         }
1022         rt6_set_from(rt, f6i);
1023 #ifdef CONFIG_IPV6_SUBTREES
1024         rt->rt6i_src = f6i->fib6_src;
1025 #endif
1026 }
1027
1028 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1029                                         struct in6_addr *saddr)
1030 {
1031         struct fib6_node *pn, *sn;
1032         while (1) {
1033                 if (fn->fn_flags & RTN_TL_ROOT)
1034                         return NULL;
1035                 pn = rcu_dereference(fn->parent);
1036                 sn = FIB6_SUBTREE(pn);
1037                 if (sn && sn != fn)
1038                         fn = fib6_node_lookup(sn, NULL, saddr);
1039                 else
1040                         fn = pn;
1041                 if (fn->fn_flags & RTN_RTINFO)
1042                         return fn;
1043         }
1044 }
1045
1046 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1047 {
1048         struct rt6_info *rt = *prt;
1049
1050         if (dst_hold_safe(&rt->dst))
1051                 return true;
1052         if (net) {
1053                 rt = net->ipv6.ip6_null_entry;
1054                 dst_hold(&rt->dst);
1055         } else {
1056                 rt = NULL;
1057         }
1058         *prt = rt;
1059         return false;
1060 }
1061
1062 /* called with rcu_lock held */
1063 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1064 {
1065         struct net_device *dev = res->nh->fib_nh_dev;
1066         struct fib6_info *f6i = res->f6i;
1067         unsigned short flags;
1068         struct rt6_info *nrt;
1069
1070         if (!fib6_info_hold_safe(f6i))
1071                 goto fallback;
1072
1073         flags = fib6_info_dst_flags(f6i);
1074         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1075         if (!nrt) {
1076                 fib6_info_release(f6i);
1077                 goto fallback;
1078         }
1079
1080         ip6_rt_copy_init(nrt, res);
1081         return nrt;
1082
1083 fallback:
1084         nrt = dev_net(dev)->ipv6.ip6_null_entry;
1085         dst_hold(&nrt->dst);
1086         return nrt;
1087 }
1088
/* Look up @fl6 in @table and return the resulting rt6_info.
 *
 * The search starts at the node matching the flow's daddr/saddr and
 * backtracks (fib6_backtrack()) for as long as the current node only
 * yields the netns' fib6_null_entry.  If backtracking runs out, the
 * netns' ip6_null_entry is returned with a hold taken.  Otherwise a path
 * is selected and the nexthop's exception table is consulted first; if no
 * cached entry exists, a new rt6_info is built from the result.
 *
 * The whole lookup runs inside rcu_read_lock()/rcu_read_unlock().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct fib6_result res = {};
        struct fib6_node *fn;
        struct rt6_info *rt;

        /* The flow asked to skip the nexthop oif check: drop the interface
         * match requirement.
         */
        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                flags &= ~RT6_LOOKUP_F_IFACE;

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        res.f6i = rcu_dereference(fn->leaf);
        if (!res.f6i)
                res.f6i = net->ipv6.fib6_null_entry;
        else
                rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
                                 flags);

        /* Nothing usable at this node: retry from the node that
         * backtracking finds, or give up with the null entry.
         */
        if (res.f6i == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;

                rt = net->ipv6.ip6_null_entry;
                dst_hold(&rt->dst);
                goto out;
        }

        fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
                         fl6->flowi6_oif != 0, skb, flags);

        /* Search through exception table */
        rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
        if (rt) {
                /* On failure ip6_hold_safe() swaps rt for the held null
                 * entry, so rt is still safe to return.
                 */
                if (ip6_hold_safe(net, &rt))
                        dst_use_noref(&rt->dst, jiffies);
        } else {
                rt = ip6_create_rt_rcu(&res);
        }

out:
        trace_fib6_table_lookup(net, &res, table, fl6);

        rcu_read_unlock();

        return rt;
}
1141
/* Exported lookup entry point: runs fib6_rule_lookup() for @fl6 with
 * ip6_pol_route_lookup() as the per-table lookup function and returns its
 * result unchanged.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb, int flags)
{
        return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1148
1149 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1150                             const struct in6_addr *saddr, int oif,
1151                             const struct sk_buff *skb, int strict)
1152 {
1153         struct flowi6 fl6 = {
1154                 .flowi6_oif = oif,
1155                 .daddr = *daddr,
1156         };
1157         struct dst_entry *dst;
1158         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1159
1160         if (saddr) {
1161                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1162                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1163         }
1164
1165         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1166         if (dst->error == 0)
1167                 return (struct rt6_info *) dst;
1168
1169         dst_release(dst);
1170
1171         return NULL;
1172 }
1173 EXPORT_SYMBOL(rt6_lookup);
1174
/* ip6_ins_rt is called with table->tb6_lock free (not held).
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * The caller must hold the dst before calling it.
 */
1180
1181 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1182                         struct netlink_ext_ack *extack)
1183 {
1184         int err;
1185         struct fib6_table *table;
1186
1187         table = rt->fib6_table;
1188         spin_lock_bh(&table->tb6_lock);
1189         err = fib6_add(&table->tb6_root, rt, info, extack);
1190         spin_unlock_bh(&table->tb6_lock);
1191
1192         return err;
1193 }
1194
/* Convenience wrapper around __ip6_ins_rt(): builds an nl_info that only
 * carries @net and passes a NULL extack.  Returns __ip6_ins_rt()'s result.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
        struct nl_info info = { .nl_net = net, };

        return __ip6_ins_rt(rt, &info, NULL);
}
1201
1202 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1203                                            const struct in6_addr *daddr,
1204                                            const struct in6_addr *saddr)
1205 {
1206         struct fib6_info *f6i = res->f6i;
1207         struct net_device *dev;
1208         struct rt6_info *rt;
1209
1210         /*
1211          *      Clone the route.
1212          */
1213
1214         if (!fib6_info_hold_safe(f6i))
1215                 return NULL;
1216
1217         dev = ip6_rt_get_dev_rcu(res);
1218         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1219         if (!rt) {
1220                 fib6_info_release(f6i);
1221                 return NULL;
1222         }
1223
1224         ip6_rt_copy_init(rt, res);
1225         rt->rt6i_flags |= RTF_CACHE;
1226         rt->dst.flags |= DST_HOST;
1227         rt->rt6i_dst.addr = *daddr;
1228         rt->rt6i_dst.plen = 128;
1229
1230         if (!rt6_is_gw_or_nonexthop(res)) {
1231                 if (f6i->fib6_dst.plen != 128 &&
1232                     ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1233                         rt->rt6i_flags |= RTF_ANYCAST;
1234 #ifdef CONFIG_IPV6_SUBTREES
1235                 if (rt->rt6i_src.plen && saddr) {
1236                         rt->rt6i_src.addr = *saddr;
1237                         rt->rt6i_src.plen = 128;
1238                 }
1239 #endif
1240         }
1241
1242         return rt;
1243 }
1244
1245 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1246 {
1247         struct fib6_info *f6i = res->f6i;
1248         unsigned short flags = fib6_info_dst_flags(f6i);
1249         struct net_device *dev;
1250         struct rt6_info *pcpu_rt;
1251
1252         if (!fib6_info_hold_safe(f6i))
1253                 return NULL;
1254
1255         rcu_read_lock();
1256         dev = ip6_rt_get_dev_rcu(res);
1257         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1258         rcu_read_unlock();
1259         if (!pcpu_rt) {
1260                 fib6_info_release(f6i);
1261                 return NULL;
1262         }
1263         ip6_rt_copy_init(pcpu_rt, res);
1264         pcpu_rt->rt6i_flags |= RTF_PCPU;
1265         return pcpu_rt;
1266 }
1267
1268 /* It should be called with rcu_read_lock() acquired */
1269 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1270 {
1271         struct rt6_info *pcpu_rt;
1272
1273         pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
1274
1275         if (pcpu_rt)
1276                 ip6_hold_safe(NULL, &pcpu_rt);
1277
1278         return pcpu_rt;
1279 }
1280
/* Allocate a per-cpu route for @res and install it in this cpu's slot of
 * the nexthop.
 *
 * If the allocation fails, @net's ip6_null_entry is returned with a hold
 * taken.  Otherwise an extra hold is taken on the new route before it is
 * stored, and the function expects the slot to be empty (BUG otherwise).
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
                                            const struct fib6_result *res)
{
        struct rt6_info *pcpu_rt, *prev, **p;

        pcpu_rt = ip6_rt_pcpu_alloc(res);
        if (!pcpu_rt) {
                dst_hold(&net->ipv6.ip6_null_entry->dst);
                return net->ipv6.ip6_null_entry;
        }

        dst_hold(&pcpu_rt->dst);
        p = this_cpu_ptr(res->nh->rt6i_pcpu);
        /* Only store pcpu_rt if the slot is still NULL. */
        prev = cmpxchg(p, NULL, pcpu_rt);
        BUG_ON(prev);

        /* The fib6_info is being destroyed: detach the new route from it
         * and drop the reference the 'from' pointer carried.
         */
        if (res->f6i->fib6_destroying) {
                struct fib6_info *from;

                from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
                fib6_info_release(from);
        }

        return pcpu_rt;
}
1306
/* Exception hash table implementation.
 *
 * rt6_exception_lock is taken (with BHs disabled) by the code below that
 * inserts, removes or flushes entries of a nexthop's exception buckets.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1310
1311 /* Remove rt6_ex from hash table and free the memory
1312  * Caller must hold rt6_exception_lock
1313  */
1314 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1315                                  struct rt6_exception *rt6_ex)
1316 {
1317         struct fib6_info *from;
1318         struct net *net;
1319
1320         if (!bucket || !rt6_ex)
1321                 return;
1322
1323         net = dev_net(rt6_ex->rt6i->dst.dev);
1324         net->ipv6.rt6_stats->fib_rt_cache--;
1325
1326         /* purge completely the exception to allow releasing the held resources:
1327          * some [sk] cache may keep the dst around for unlimited time
1328          */
1329         from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1330         fib6_info_release(from);
1331         dst_dev_put(&rt6_ex->rt6i->dst);
1332
1333         hlist_del_rcu(&rt6_ex->hlist);
1334         dst_release(&rt6_ex->rt6i->dst);
1335         kfree_rcu(rt6_ex, rcu);
1336         WARN_ON_ONCE(!bucket->depth);
1337         bucket->depth--;
1338 }
1339
1340 /* Remove oldest rt6_ex in bucket and free the memory
1341  * Caller must hold rt6_exception_lock
1342  */
1343 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1344 {
1345         struct rt6_exception *rt6_ex, *oldest = NULL;
1346
1347         if (!bucket)
1348                 return;
1349
1350         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1351                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1352                         oldest = rt6_ex;
1353         }
1354         rt6_remove_exception(bucket, oldest);
1355 }
1356
1357 static u32 rt6_exception_hash(const struct in6_addr *dst,
1358                               const struct in6_addr *src)
1359 {
1360         static u32 seed __read_mostly;
1361         u32 val;
1362
1363         net_get_random_once(&seed, sizeof(seed));
1364         val = jhash(dst, sizeof(*dst), seed);
1365
1366 #ifdef CONFIG_IPV6_SUBTREES
1367         if (src)
1368                 val = jhash(src, sizeof(*src), val);
1369 #endif
1370         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1371 }
1372
1373 /* Helper function to find the cached rt in the hash table
1374  * and update bucket pointer to point to the bucket for this
1375  * (daddr, saddr) pair
1376  * Caller must hold rt6_exception_lock
1377  */
1378 static struct rt6_exception *
1379 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1380                               const struct in6_addr *daddr,
1381                               const struct in6_addr *saddr)
1382 {
1383         struct rt6_exception *rt6_ex;
1384         u32 hval;
1385
1386         if (!(*bucket) || !daddr)
1387                 return NULL;
1388
1389         hval = rt6_exception_hash(daddr, saddr);
1390         *bucket += hval;
1391
1392         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1393                 struct rt6_info *rt6 = rt6_ex->rt6i;
1394                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1395
1396 #ifdef CONFIG_IPV6_SUBTREES
1397                 if (matched && saddr)
1398                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1399 #endif
1400                 if (matched)
1401                         return rt6_ex;
1402         }
1403         return NULL;
1404 }
1405
1406 /* Helper function to find the cached rt in the hash table
1407  * and update bucket pointer to point to the bucket for this
1408  * (daddr, saddr) pair
1409  * Caller must hold rcu_read_lock()
1410  */
1411 static struct rt6_exception *
1412 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1413                          const struct in6_addr *daddr,
1414                          const struct in6_addr *saddr)
1415 {
1416         struct rt6_exception *rt6_ex;
1417         u32 hval;
1418
1419         WARN_ON_ONCE(!rcu_read_lock_held());
1420
1421         if (!(*bucket) || !daddr)
1422                 return NULL;
1423
1424         hval = rt6_exception_hash(daddr, saddr);
1425         *bucket += hval;
1426
1427         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1428                 struct rt6_info *rt6 = rt6_ex->rt6i;
1429                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1430
1431 #ifdef CONFIG_IPV6_SUBTREES
1432                 if (matched && saddr)
1433                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1434 #endif
1435                 if (matched)
1436                         return rt6_ex;
1437         }
1438         return NULL;
1439 }
1440
1441 static unsigned int fib6_mtu(const struct fib6_result *res)
1442 {
1443         const struct fib6_nh *nh = res->nh;
1444         unsigned int mtu;
1445
1446         if (res->f6i->fib6_pmtu) {
1447                 mtu = res->f6i->fib6_pmtu;
1448         } else {
1449                 struct net_device *dev = nh->fib_nh_dev;
1450                 struct inet6_dev *idev;
1451
1452                 rcu_read_lock();
1453                 idev = __in6_dev_get(dev);
1454                 mtu = idev->cnf.mtu6;
1455                 rcu_read_unlock();
1456         }
1457
1458         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1459
1460         return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1461 }
1462
/* Flag kept in the low bit of a nexthop's rt6i_exception_bucket pointer;
 * the helpers below set, test and mask it.
 */
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL
1464
1465 /* used when the flushed bit is not relevant, only access to the bucket
1466  * (ie., all bucket users except rt6_insert_exception);
1467  *
1468  * called under rcu lock; sometimes called with rt6_exception_lock held
1469  */
1470 static
1471 struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
1472                                                        spinlock_t *lock)
1473 {
1474         struct rt6_exception_bucket *bucket;
1475
1476         if (lock)
1477                 bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1478                                                    lockdep_is_held(lock));
1479         else
1480                 bucket = rcu_dereference(nh->rt6i_exception_bucket);
1481
1482         /* remove bucket flushed bit if set */
1483         if (bucket) {
1484                 unsigned long p = (unsigned long)bucket;
1485
1486                 p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
1487                 bucket = (struct rt6_exception_bucket *)p;
1488         }
1489
1490         return bucket;
1491 }
1492
1493 static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
1494 {
1495         unsigned long p = (unsigned long)bucket;
1496
1497         return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
1498 }
1499
1500 /* called with rt6_exception_lock held */
1501 static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
1502                                               spinlock_t *lock)
1503 {
1504         struct rt6_exception_bucket *bucket;
1505         unsigned long p;
1506
1507         bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1508                                            lockdep_is_held(lock));
1509
1510         p = (unsigned long)bucket;
1511         p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
1512         bucket = (struct rt6_exception_bucket *)p;
1513         rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1514 }
1515
/* Insert @nrt as an exception (cached) route of the nexthop in @res.
 *
 * The bucket array is allocated on first use.  An existing entry for the
 * same key is removed before the new one is added, and the oldest entry
 * of the bucket is evicted once its depth exceeds FIB6_MAX_DEPTH.
 *
 * Returns 0 on success, -ENOMEM when the bucket array or the entry cannot
 * be allocated, and -EINVAL when the bucket list has been flushed or when
 * @nrt's raw MTU metric is not below fib6_mtu(res).  On success the
 * fib6_info's sernum is updated under the table lock and gc is started.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
                                const struct fib6_result *res)
{
        struct net *net = dev_net(nrt->dst.dev);
        struct rt6_exception_bucket *bucket;
        struct fib6_info *f6i = res->f6i;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct fib6_nh *nh = res->nh;
        int err = 0;

        spin_lock_bh(&rt6_exception_lock);

        /* Raw pointer on purpose: the flushed bit must stay visible here. */
        bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
                                          lockdep_is_held(&rt6_exception_lock));
        if (!bucket) {
                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
                                 GFP_ATOMIC);
                if (!bucket) {
                        err = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
        } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
                err = -EINVAL;
                goto out;
        }

#ifdef CONFIG_IPV6_SUBTREES
        /* fib6_src.plen != 0 indicates f6i is in subtree
         * and exception table is indexed by a hash of
         * both fib6_dst and fib6_src.
         * Otherwise, the exception table is indexed by
         * a hash of only fib6_dst.
         */
        if (f6i->fib6_src.plen)
                src_key = &nrt->rt6i_src.addr;
#endif
        /* rt6_mtu_change() might lower mtu on f6i.
         * Only insert this exception route if its mtu
         * is less than f6i's mtu value.
         */
        if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
                err = -EINVAL;
                goto out;
        }

        /* From here on, bucket points at the hashed slot for this key. */
        rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
                                               src_key);
        if (rt6_ex)
                rt6_remove_exception(bucket, rt6_ex);

        rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
        if (!rt6_ex) {
                err = -ENOMEM;
                goto out;
        }
        rt6_ex->rt6i = nrt;
        rt6_ex->stamp = jiffies;
        hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
        bucket->depth++;
        net->ipv6.rt6_stats->fib_rt_cache++;

        if (bucket->depth > FIB6_MAX_DEPTH)
                rt6_exception_remove_oldest(bucket);

out:
        spin_unlock_bh(&rt6_exception_lock);

        /* Update fn->fn_sernum to invalidate all cached dst */
        if (!err) {
                spin_lock_bh(&f6i->fib6_table->tb6_lock);
                fib6_update_sernum(net, f6i);
                spin_unlock_bh(&f6i->fib6_table->tb6_lock);
                fib6_force_start_gc(net);
        }

        return err;
}
1595
1596 static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
1597 {
1598         struct rt6_exception_bucket *bucket;
1599         struct rt6_exception *rt6_ex;
1600         struct hlist_node *tmp;
1601         int i;
1602
1603         spin_lock_bh(&rt6_exception_lock);
1604
1605         bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1606         if (!bucket)
1607                 goto out;
1608
1609         /* Prevent rt6_insert_exception() to recreate the bucket list */
1610         if (!from)
1611                 fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
1612
1613         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1614                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
1615                         if (!from ||
1616                             rcu_access_pointer(rt6_ex->rt6i->from) == from)
1617                                 rt6_remove_exception(bucket, rt6_ex);
1618                 }
1619                 WARN_ON_ONCE(!from && bucket->depth);
1620                 bucket++;
1621         }
1622 out:
1623         spin_unlock_bh(&rt6_exception_lock);
1624 }
1625
/* Remove the exceptions on @f6i's nexthop (f6i->fib6_nh) whose 'from' is
 * @f6i.
 */
void rt6_flush_exceptions(struct fib6_info *f6i)
{
        fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}
1630
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached rt6_info for (daddr, saddr) on the nexthop in @res,
 * or NULL when there is none or the one found has expired.  No reference
 * is taken here.
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        const struct in6_addr *src_key = NULL;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
        /* fib6_src.plen != 0 indicates f6i is in subtree
         * and exception table is indexed by a hash of
         * both fib6_dst and fib6_src.
         * However, the src addr used to create the hash
         * might not be exactly the passed in saddr which
         * is a /128 addr from the flow.
         * So we need to use f6i->fib6_src to redo lookup
         * if the passed in saddr does not find anything.
         * (See the logic in ip6_rt_cache_alloc() on how
         * rt->rt6i_src is updated.)
         */
        if (res->f6i->fib6_src.plen)
                src_key = saddr;
find_ex:
#endif
        /* The bucket base is re-read on every pass because the lookup
         * helper advances the pointer to the hashed slot.
         */
        bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
        rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

        if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
                ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
        /* Use fib6_src as src_key and redo lookup */
        if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
                src_key = &res->f6i->fib6_src.addr;
                goto find_ex;
        }
#endif

        return ret;
}
1675
1676 /* Remove the passed in cached rt from the hash table that contains it */
1677 static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
1678                                     const struct rt6_info *rt)
1679 {
1680         const struct in6_addr *src_key = NULL;
1681         struct rt6_exception_bucket *bucket;
1682         struct rt6_exception *rt6_ex;
1683         int err;
1684
1685         if (!rcu_access_pointer(nh->rt6i_exception_bucket))
1686                 return -ENOENT;
1687
1688         spin_lock_bh(&rt6_exception_lock);
1689         bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1690
1691 #ifdef CONFIG_IPV6_SUBTREES
1692         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1693          * and exception table is indexed by a hash of
1694          * both rt6i_dst and rt6i_src.
1695          * Otherwise, the exception table is indexed by
1696          * a hash of only rt6i_dst.
1697          */
1698         if (plen)
1699                 src_key = &rt->rt6i_src.addr;
1700 #endif
1701         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1702                                                &rt->rt6i_dst.addr,
1703                                                src_key);
1704         if (rt6_ex) {
1705                 rt6_remove_exception(bucket, rt6_ex);
1706                 err = 0;
1707         } else {
1708                 err = -ENOENT;
1709         }
1710
1711         spin_unlock_bh(&rt6_exception_lock);
1712         return err;
1713 }
1714
1715 static int rt6_remove_exception_rt(struct rt6_info *rt)
1716 {
1717         struct fib6_info *from;
1718
1719         from = rcu_dereference(rt->from);
1720         if (!from || !(rt->rt6i_flags & RTF_CACHE))
1721                 return -EINVAL;
1722
1723         return fib6_nh_remove_exception(from->fib6_nh,
1724                                         from->fib6_src.plen, rt);
1725 }
1726
1727 /* Find rt6_ex which contains the passed in rt cache and
1728  * refresh its stamp
1729  */
1730 static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
1731                                      const struct rt6_info *rt)
1732 {
1733         const struct in6_addr *src_key = NULL;
1734         struct rt6_exception_bucket *bucket;
1735         struct rt6_exception *rt6_ex;
1736
1737         bucket = fib6_nh_get_excptn_bucket(nh, NULL);
1738 #ifdef CONFIG_IPV6_SUBTREES
1739         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1740          * and exception table is indexed by a hash of
1741          * both rt6i_dst and rt6i_src.
1742          * Otherwise, the exception table is indexed by
1743          * a hash of only rt6i_dst.
1744          */
1745         if (plen)
1746                 src_key = &rt->rt6i_src.addr;
1747 #endif
1748         rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
1749         if (rt6_ex)
1750                 rt6_ex->stamp = jiffies;
1751 }
1752
1753 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1754 {
1755         struct fib6_info *from;
1756
1757         rcu_read_lock();
1758
1759         from = rcu_dereference(rt->from);
1760         if (!from || !(rt->rt6i_flags & RTF_CACHE))
1761                 goto unlock;
1762
1763         fib6_nh_update_exception(from->fib6_nh, from->fib6_src.plen, rt);
1764 unlock:
1765         rcu_read_unlock();
1766 }
1767
1768 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1769                                          struct rt6_info *rt, int mtu)
1770 {
1771         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1772          * lowest MTU in the path: always allow updating the route PMTU to
1773          * reflect PMTU decreases.
1774          *
1775          * If the new MTU is higher, and the route PMTU is equal to the local
1776          * MTU, this means the old MTU is the lowest in the path, so allow
1777          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1778          * handle this.
1779          */
1780
1781         if (dst_mtu(&rt->dst) >= mtu)
1782                 return true;
1783
1784         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1785                 return true;
1786
1787         return false;
1788 }
1789
1790 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1791                                        const struct fib6_nh *nh, int mtu)
1792 {
1793         struct rt6_exception_bucket *bucket;
1794         struct rt6_exception *rt6_ex;
1795         int i;
1796
1797         bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1798         if (!bucket)
1799                 return;
1800
1801         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1802                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1803                         struct rt6_info *entry = rt6_ex->rt6i;
1804
1805                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1806                          * route), the metrics of its rt->from have already
1807                          * been updated.
1808                          */
1809                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1810                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1811                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1812                 }
1813                 bucket++;
1814         }
1815 }
1816
1817 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1818
1819 static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
1820                                             const struct in6_addr *gateway)
1821 {
1822         struct rt6_exception_bucket *bucket;
1823         struct rt6_exception *rt6_ex;
1824         struct hlist_node *tmp;
1825         int i;
1826
1827         if (!rcu_access_pointer(nh->rt6i_exception_bucket))
1828                 return;
1829
1830         spin_lock_bh(&rt6_exception_lock);
1831         bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1832         if (bucket) {
1833                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1834                         hlist_for_each_entry_safe(rt6_ex, tmp,
1835                                                   &bucket->chain, hlist) {
1836                                 struct rt6_info *entry = rt6_ex->rt6i;
1837
1838                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1839                                     RTF_CACHE_GATEWAY &&
1840                                     ipv6_addr_equal(gateway,
1841                                                     &entry->rt6i_gateway)) {
1842                                         rt6_remove_exception(bucket, rt6_ex);
1843                                 }
1844                         }
1845                         bucket++;
1846                 }
1847         }
1848
1849         spin_unlock_bh(&rt6_exception_lock);
1850 }
1851
1852 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1853                                       struct rt6_exception *rt6_ex,
1854                                       struct fib6_gc_args *gc_args,
1855                                       unsigned long now)
1856 {
1857         struct rt6_info *rt = rt6_ex->rt6i;
1858
1859         /* we are pruning and obsoleting aged-out and non gateway exceptions
1860          * even if others have still references to them, so that on next
1861          * dst_check() such references can be dropped.
1862          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1863          * expired, independently from their aging, as per RFC 8201 section 4
1864          */
1865         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1866                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1867                         RT6_TRACE("aging clone %p\n", rt);
1868                         rt6_remove_exception(bucket, rt6_ex);
1869                         return;
1870                 }
1871         } else if (time_after(jiffies, rt->dst.expires)) {
1872                 RT6_TRACE("purging expired route %p\n", rt);
1873                 rt6_remove_exception(bucket, rt6_ex);
1874                 return;
1875         }
1876
1877         if (rt->rt6i_flags & RTF_GATEWAY) {
1878                 struct neighbour *neigh;
1879                 __u8 neigh_flags = 0;
1880
1881                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1882                 if (neigh)
1883                         neigh_flags = neigh->flags;
1884
1885                 if (!(neigh_flags & NTF_ROUTER)) {
1886                         RT6_TRACE("purging route %p via non-router but gateway\n",
1887                                   rt);
1888                         rt6_remove_exception(bucket, rt6_ex);
1889                         return;
1890                 }
1891         }
1892
1893         gc_args->more++;
1894 }
1895
1896 static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
1897                                    struct fib6_gc_args *gc_args,
1898                                    unsigned long now)
1899 {
1900         struct rt6_exception_bucket *bucket;
1901         struct rt6_exception *rt6_ex;
1902         struct hlist_node *tmp;
1903         int i;
1904
1905         if (!rcu_access_pointer(nh->rt6i_exception_bucket))
1906                 return;
1907
1908         rcu_read_lock_bh();
1909         spin_lock(&rt6_exception_lock);
1910         bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1911         if (bucket) {
1912                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1913                         hlist_for_each_entry_safe(rt6_ex, tmp,
1914                                                   &bucket->chain, hlist) {
1915                                 rt6_age_examine_exception(bucket, rt6_ex,
1916                                                           gc_args, now);
1917                         }
1918                         bucket++;
1919                 }
1920         }
1921         spin_unlock(&rt6_exception_lock);
1922         rcu_read_unlock_bh();
1923 }
1924
1925 void rt6_age_exceptions(struct fib6_info *f6i,
1926                         struct fib6_gc_args *gc_args,
1927                         unsigned long now)
1928 {
1929         fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
1930 }
1931
1932 /* must be called with rcu lock held */
1933 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
1934                       struct flowi6 *fl6, struct fib6_result *res, int strict)
1935 {
1936         struct fib6_node *fn, *saved_fn;
1937
1938         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1939         saved_fn = fn;
1940
1941         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1942                 oif = 0;
1943
1944 redo_rt6_select:
1945         rt6_select(net, fn, oif, res, strict);
1946         if (res->f6i == net->ipv6.fib6_null_entry) {
1947                 fn = fib6_backtrack(fn, &fl6->saddr);
1948                 if (fn)
1949                         goto redo_rt6_select;
1950                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1951                         /* also consider unreachable route */
1952                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1953                         fn = saved_fn;
1954                         goto redo_rt6_select;
1955                 }
1956         }
1957
1958         trace_fib6_table_lookup(net, res, table, fl6);
1959
1960         return 0;
1961 }
1962
1963 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1964                                int oif, struct flowi6 *fl6,
1965                                const struct sk_buff *skb, int flags)
1966 {
1967         struct fib6_result res = {};
1968         struct rt6_info *rt;
1969         int strict = 0;
1970
1971         strict |= flags & RT6_LOOKUP_F_IFACE;
1972         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1973         if (net->ipv6.devconf_all->forwarding == 0)
1974                 strict |= RT6_LOOKUP_F_REACHABLE;
1975
1976         rcu_read_lock();
1977
1978         fib6_table_lookup(net, table, oif, fl6, &res, strict);
1979         if (res.f6i == net->ipv6.fib6_null_entry) {
1980                 rt = net->ipv6.ip6_null_entry;
1981                 rcu_read_unlock();
1982                 dst_hold(&rt->dst);
1983                 return rt;
1984         }
1985
1986         fib6_select_path(net, &res, fl6, oif, false, skb, strict);
1987
1988         /*Search through exception table */
1989         rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1990         if (rt) {
1991                 if (ip6_hold_safe(net, &rt))
1992                         dst_use_noref(&rt->dst, jiffies);
1993
1994                 rcu_read_unlock();
1995                 return rt;
1996         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1997                             !res.nh->fib_nh_gw_family)) {
1998                 /* Create a RTF_CACHE clone which will not be
1999                  * owned by the fib6 tree.  It is for the special case where
2000                  * the daddr in the skb during the neighbor look-up is different
2001                  * from the fl6->daddr used to look-up route here.
2002                  */
2003                 struct rt6_info *uncached_rt;
2004
2005                 uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
2006
2007                 rcu_read_unlock();
2008
2009                 if (uncached_rt) {
2010                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
2011                          * No need for another dst_hold()
2012                          */
2013                         rt6_uncached_list_add(uncached_rt);
2014                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2015                 } else {
2016                         uncached_rt = net->ipv6.ip6_null_entry;
2017                         dst_hold(&uncached_rt->dst);
2018                 }
2019
2020                 return uncached_rt;
2021         } else {
2022                 /* Get a percpu copy */
2023
2024                 struct rt6_info *pcpu_rt;
2025
2026                 local_bh_disable();
2027                 pcpu_rt = rt6_get_pcpu_route(&res);
2028
2029                 if (!pcpu_rt)
2030                         pcpu_rt = rt6_make_pcpu_route(net, &res);
2031
2032                 local_bh_enable();
2033                 rcu_read_unlock();
2034
2035                 return pcpu_rt;
2036         }
2037 }
2038 EXPORT_SYMBOL_GPL(ip6_pol_route);
2039
2040 static struct rt6_info *ip6_pol_route_input(struct net *net,
2041                                             struct fib6_table *table,
2042                                             struct flowi6 *fl6,
2043                                             const struct sk_buff *skb,
2044                                             int flags)
2045 {
2046         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
2047 }
2048
2049 struct dst_entry *ip6_route_input_lookup(struct net *net,
2050                                          struct net_device *dev,
2051                                          struct flowi6 *fl6,
2052                                          const struct sk_buff *skb,
2053                                          int flags)
2054 {
2055         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
2056                 flags |= RT6_LOOKUP_F_IFACE;
2057
2058         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
2059 }
2060 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
2061
2062 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
2063                                   struct flow_keys *keys,
2064                                   struct flow_keys *flkeys)
2065 {
2066         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
2067         const struct ipv6hdr *key_iph = outer_iph;
2068         struct flow_keys *_flkeys = flkeys;
2069         const struct ipv6hdr *inner_iph;
2070         const struct icmp6hdr *icmph;
2071         struct ipv6hdr _inner_iph;
2072         struct icmp6hdr _icmph;
2073
2074         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2075                 goto out;
2076
2077         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2078                                    sizeof(_icmph), &_icmph);
2079         if (!icmph)
2080                 goto out;
2081
2082         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2083             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2084             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2085             icmph->icmp6_type != ICMPV6_PARAMPROB)
2086                 goto out;
2087
2088         inner_iph = skb_header_pointer(skb,
2089                                        skb_transport_offset(skb) + sizeof(*icmph),
2090                                        sizeof(_inner_iph), &_inner_iph);
2091         if (!inner_iph)
2092                 goto out;
2093
2094         key_iph = inner_iph;
2095         _flkeys = NULL;
2096 out:
2097         if (_flkeys) {
2098                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2099                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2100                 keys->tags.flow_label = _flkeys->tags.flow_label;
2101                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
2102         } else {
2103                 keys->addrs.v6addrs.src = key_iph->saddr;
2104                 keys->addrs.v6addrs.dst = key_iph->daddr;
2105                 keys->tags.flow_label = ip6_flowlabel(key_iph);
2106                 keys->basic.ip_proto = key_iph->nexthdr;
2107         }
2108 }
2109
2110 /* if skb is set it will be used and fl6 can be NULL */
2111 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2112                        const struct sk_buff *skb, struct flow_keys *flkeys)
2113 {
2114         struct flow_keys hash_keys;
2115         u32 mhash;
2116
2117         switch (ip6_multipath_hash_policy(net)) {
2118         case 0:
2119                 memset(&hash_keys, 0, sizeof(hash_keys));
2120                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2121                 if (skb) {
2122                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2123                 } else {
2124                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2125                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2126                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2127                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2128                 }
2129                 break;
2130         case 1:
2131                 if (skb) {
2132                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2133                         struct flow_keys keys;
2134
2135                         /* short-circuit if we already have L4 hash present */
2136                         if (skb->l4_hash)
2137                                 return skb_get_hash_raw(skb) >> 1;
2138
2139                         memset(&hash_keys, 0, sizeof(hash_keys));
2140
2141                         if (!flkeys) {
2142                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2143                                 flkeys = &keys;
2144                         }
2145                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2146                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2147                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2148                         hash_keys.ports.src = flkeys->ports.src;
2149                         hash_keys.ports.dst = flkeys->ports.dst;
2150                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2151                 } else {
2152                         memset(&hash_keys, 0, sizeof(hash_keys));
2153                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2154                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2155                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2156                         hash_keys.ports.src = fl6->fl6_sport;
2157                         hash_keys.ports.dst = fl6->fl6_dport;
2158                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2159                 }
2160                 break;
2161         }
2162         mhash = flow_hash_from_keys(&hash_keys);
2163
2164         return mhash >> 1;
2165 }
2166
2167 void ip6_route_input(struct sk_buff *skb)
2168 {
2169         const struct ipv6hdr *iph = ipv6_hdr(skb);
2170         struct net *net = dev_net(skb->dev);
2171         int flags = RT6_LOOKUP_F_HAS_SADDR;
2172         struct ip_tunnel_info *tun_info;
2173         struct flowi6 fl6 = {
2174                 .flowi6_iif = skb->dev->ifindex,
2175                 .daddr = iph->daddr,
2176                 .saddr = iph->saddr,
2177                 .flowlabel = ip6_flowinfo(iph),
2178                 .flowi6_mark = skb->mark,
2179                 .flowi6_proto = iph->nexthdr,
2180         };
2181         struct flow_keys *flkeys = NULL, _flkeys;
2182
2183         tun_info = skb_tunnel_info(skb);
2184         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2185                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2186
2187         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2188                 flkeys = &_flkeys;
2189
2190         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2191                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2192         skb_dst_drop(skb);
2193         skb_dst_set(skb,
2194                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2195 }
2196
2197 static struct rt6_info *ip6_pol_route_output(struct net *net,
2198                                              struct fib6_table *table,
2199                                              struct flowi6 *fl6,
2200                                              const struct sk_buff *skb,
2201                                              int flags)
2202 {
2203         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2204 }
2205
2206 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2207                                          struct flowi6 *fl6, int flags)
2208 {
2209         bool any_src;
2210
2211         if (ipv6_addr_type(&fl6->daddr) &
2212             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2213                 struct dst_entry *dst;
2214
2215                 dst = l3mdev_link_scope_lookup(net, fl6);
2216                 if (dst)
2217                         return dst;
2218         }
2219
2220         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2221
2222         any_src = ipv6_addr_any(&fl6->saddr);
2223         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2224             (fl6->flowi6_oif && any_src))
2225                 flags |= RT6_LOOKUP_F_IFACE;
2226
2227         if (!any_src)
2228                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2229         else if (sk)
2230                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2231
2232         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2233 }
2234 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2235
2236 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2237 {
2238         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2239         struct net_device *loopback_dev = net->loopback_dev;
2240         struct dst_entry *new = NULL;
2241
2242         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2243                        DST_OBSOLETE_DEAD, 0);
2244         if (rt) {
2245                 rt6_info_init(rt);
2246                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2247
2248                 new = &rt->dst;
2249                 new->__use = 1;
2250                 new->input = dst_discard;
2251                 new->output = dst_discard_out;
2252
2253                 dst_copy_metrics(new, &ort->dst);
2254
2255                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2256                 rt->rt6i_gateway = ort->rt6i_gateway;
2257                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2258
2259                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2260 #ifdef CONFIG_IPV6_SUBTREES
2261                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2262 #endif
2263         }
2264
2265         dst_release(dst_orig);
2266         return new ? new : ERR_PTR(-ENOMEM);
2267 }
2268
2269 /*
2270  *      Destination cache support functions
2271  */
2272
2273 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2274 {
2275         u32 rt_cookie = 0;
2276
2277         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2278                 return false;
2279
2280         if (fib6_check_expired(f6i))
2281                 return false;
2282
2283         return true;
2284 }
2285
2286 static struct dst_entry *rt6_check(struct rt6_info *rt,
2287                                    struct fib6_info *from,
2288                                    u32 cookie)
2289 {
2290         u32 rt_cookie = 0;
2291
2292         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2293             rt_cookie != cookie)
2294                 return NULL;
2295
2296         if (rt6_check_expired(rt))
2297                 return NULL;
2298
2299         return &rt->dst;
2300 }
2301
2302 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2303                                             struct fib6_info *from,
2304                                             u32 cookie)
2305 {
2306         if (!__rt6_check_expired(rt) &&
2307             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2308             fib6_check(from, cookie))
2309                 return &rt->dst;
2310         else
2311                 return NULL;
2312 }
2313
2314 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2315 {
2316         struct dst_entry *dst_ret;
2317         struct fib6_info *from;
2318         struct rt6_info *rt;
2319
2320         rt = container_of(dst, struct rt6_info, dst);
2321
2322         rcu_read_lock();
2323
2324         /* All IPV6 dsts are created with ->obsolete set to the value
2325          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2326          * into this function always.
2327          */
2328
2329         from = rcu_dereference(rt->from);
2330
2331         if (from && (rt->rt6i_flags & RTF_PCPU ||
2332             unlikely(!list_empty(&rt->rt6i_uncached))))
2333                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2334         else
2335                 dst_ret = rt6_check(rt, from, cookie);
2336
2337         rcu_read_unlock();
2338
2339         return dst_ret;
2340 }
2341
2342 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2343 {
2344         struct rt6_info *rt = (struct rt6_info *) dst;
2345
2346         if (rt) {
2347                 if (rt->rt6i_flags & RTF_CACHE) {
2348                         rcu_read_lock();
2349                         if (rt6_check_expired(rt)) {
2350                                 rt6_remove_exception_rt(rt);
2351                                 dst = NULL;
2352                         }
2353                         rcu_read_unlock();
2354                 } else {
2355                         dst_release(dst);
2356                         dst = NULL;
2357                 }
2358         }
2359         return dst;
2360 }
2361
2362 static void ip6_link_failure(struct sk_buff *skb)
2363 {
2364         struct rt6_info *rt;
2365
2366         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2367
2368         rt = (struct rt6_info *) skb_dst(skb);
2369         if (rt) {
2370                 rcu_read_lock();
2371                 if (rt->rt6i_flags & RTF_CACHE) {
2372                         rt6_remove_exception_rt(rt);
2373                 } else {
2374                         struct fib6_info *from;
2375                         struct fib6_node *fn;
2376
2377                         from = rcu_dereference(rt->from);
2378                         if (from) {
2379                                 fn = rcu_dereference(from->fib6_node);
2380                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2381                                         fn->fn_sernum = -1;
2382                         }
2383                 }
2384                 rcu_read_unlock();
2385         }
2386 }
2387
2388 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2389 {
2390         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2391                 struct fib6_info *from;
2392
2393                 rcu_read_lock();
2394                 from = rcu_dereference(rt0->from);
2395                 if (from)
2396                         rt0->dst.expires = from->expires;
2397                 rcu_read_unlock();
2398         }
2399
2400         dst_set_expires(&rt0->dst, timeout);
2401         rt0->rt6i_flags |= RTF_EXPIRES;
2402 }
2403
2404 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2405 {
2406         struct net *net = dev_net(rt->dst.dev);
2407
2408         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2409         rt->rt6i_flags |= RTF_MODIFIED;
2410         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2411 }
2412
2413 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2414 {
2415         return !(rt->rt6i_flags & RTF_CACHE) &&
2416                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2417 }
2418
2419 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2420                                  const struct ipv6hdr *iph, u32 mtu)
2421 {
2422         const struct in6_addr *daddr, *saddr;
2423         struct rt6_info *rt6 = (struct rt6_info *)dst;
2424
2425         if (dst_metric_locked(dst, RTAX_MTU))
2426                 return;
2427
2428         if (iph) {
2429                 daddr = &iph->daddr;
2430                 saddr = &iph->saddr;
2431         } else if (sk) {
2432                 daddr = &sk->sk_v6_daddr;
2433                 saddr = &inet6_sk(sk)->saddr;
2434         } else {
2435                 daddr = NULL;
2436                 saddr = NULL;
2437         }
2438         dst_confirm_neigh(dst, daddr);
2439         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2440         if (mtu >= dst_mtu(dst))
2441                 return;
2442
2443         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2444                 rt6_do_update_pmtu(rt6, mtu);
2445                 /* update rt6_ex->stamp for cache */
2446                 if (rt6->rt6i_flags & RTF_CACHE)
2447                         rt6_update_exception_stamp_rt(rt6);
2448         } else if (daddr) {
2449                 struct fib6_result res = {};
2450                 struct rt6_info *nrt6;
2451
2452                 rcu_read_lock();
2453                 res.f6i = rcu_dereference(rt6->from);
2454                 if (!res.f6i) {
2455                         rcu_read_unlock();
2456                         return;
2457                 }
2458                 res.nh = res.f6i->fib6_nh;
2459                 res.fib6_flags = res.f6i->fib6_flags;
2460                 res.fib6_type = res.f6i->fib6_type;
2461
2462                 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2463                 if (nrt6) {
2464                         rt6_do_update_pmtu(nrt6, mtu);
2465                         if (rt6_insert_exception(nrt6, &res))
2466                                 dst_release_immediate(&nrt6->dst);
2467                 }
2468                 rcu_read_unlock();
2469         }
2470 }
2471
2472 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2473                                struct sk_buff *skb, u32 mtu)
2474 {
2475         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2476 }
2477
2478 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2479                      int oif, u32 mark, kuid_t uid)
2480 {
2481         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2482         struct dst_entry *dst;
2483         struct flowi6 fl6 = {
2484                 .flowi6_oif = oif,
2485                 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2486                 .daddr = iph->daddr,
2487                 .saddr = iph->saddr,
2488                 .flowlabel = ip6_flowinfo(iph),
2489                 .flowi6_uid = uid,
2490         };
2491
2492         dst = ip6_route_output(net, NULL, &fl6);
2493         if (!dst->error)
2494                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2495         dst_release(dst);
2496 }
2497 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2498
2499 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2500 {
2501         int oif = sk->sk_bound_dev_if;
2502         struct dst_entry *dst;
2503
2504         if (!oif && skb->dev)
2505                 oif = l3mdev_master_ifindex(skb->dev);
2506
2507         ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2508
2509         dst = __sk_dst_get(sk);
2510         if (!dst || !dst->obsolete ||
2511             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2512                 return;
2513
2514         bh_lock_sock(sk);
2515         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2516                 ip6_datagram_dst_update(sk, false);
2517         bh_unlock_sock(sk);
2518 }
2519 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2520
2521 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2522                            const struct flowi6 *fl6)
2523 {
2524 #ifdef CONFIG_IPV6_SUBTREES
2525         struct ipv6_pinfo *np = inet6_sk(sk);
2526 #endif
2527
2528         ip6_dst_store(sk, dst,
2529                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2530                       &sk->sk_v6_daddr : NULL,
2531 #ifdef CONFIG_IPV6_SUBTREES
2532                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2533                       &np->saddr :
2534 #endif
2535                       NULL);
2536 }
2537
2538 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2539                                   struct flowi6 *fl6,
2540                                   const struct in6_addr *gw,
2541                                   struct rt6_info **ret)
2542 {
2543         const struct fib6_nh *nh = res->nh;
2544
2545         if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2546             fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2547                 return false;
2548
2549         /* rt_cache's gateway might be different from its 'parent'
2550          * in the case of an ip redirect.
2551          * So we keep searching in the exception table if the gateway
2552          * is different.
2553          */
2554         if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2555                 struct rt6_info *rt_cache;
2556
2557                 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2558                 if (rt_cache &&
2559                     ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2560                         *ret = rt_cache;
2561                         return true;
2562                 }
2563                 return false;
2564         }
2565         return true;
2566 }
2567
/* Handle redirects */

/* Flow key used for redirect lookups: an ordinary flowi6 followed by the
 * gateway address the redirect is checked against.  fl6 has to remain the
 * first member, because __ip6_route_redirect() is handed a pointer to fl6
 * and casts it back to struct ip6rd_flowi to read gateway.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2573
2574 static struct rt6_info *__ip6_route_redirect(struct net *net,
2575                                              struct fib6_table *table,
2576                                              struct flowi6 *fl6,
2577                                              const struct sk_buff *skb,
2578                                              int flags)
2579 {
2580         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2581         struct rt6_info *ret = NULL;
2582         struct fib6_result res = {};
2583         struct fib6_info *rt;
2584         struct fib6_node *fn;
2585
2586         /* l3mdev_update_flow overrides oif if the device is enslaved; in
2587          * this case we must match on the real ingress device, so reset it
2588          */
2589         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2590                 fl6->flowi6_oif = skb->dev->ifindex;
2591
2592         /* Get the "current" route for this destination and
2593          * check if the redirect has come from appropriate router.
2594          *
2595          * RFC 4861 specifies that redirects should only be
2596          * accepted if they come from the nexthop to the target.
2597          * Due to the way the routes are chosen, this notion
2598          * is a bit fuzzy and one might need to check all possible
2599          * routes.
2600          */
2601
2602         rcu_read_lock();
2603         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2604 restart:
2605         for_each_fib6_node_rt_rcu(fn) {
2606                 res.f6i = rt;
2607                 res.nh = rt->fib6_nh;
2608
2609                 if (fib6_check_expired(rt))
2610                         continue;
2611                 if (rt->fib6_flags & RTF_REJECT)
2612                         break;
2613                 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2614                         goto out;
2615         }
2616
2617         if (!rt)
2618                 rt = net->ipv6.fib6_null_entry;
2619         else if (rt->fib6_flags & RTF_REJECT) {
2620                 ret = net->ipv6.ip6_null_entry;
2621                 goto out;
2622         }
2623
2624         if (rt == net->ipv6.fib6_null_entry) {
2625                 fn = fib6_backtrack(fn, &fl6->saddr);
2626                 if (fn)
2627                         goto restart;
2628         }
2629
2630         res.f6i = rt;
2631         res.nh = rt->fib6_nh;
2632 out:
2633         if (ret) {
2634                 ip6_hold_safe(net, &ret);
2635         } else {
2636                 res.fib6_flags = res.f6i->fib6_flags;
2637                 res.fib6_type = res.f6i->fib6_type;
2638                 ret = ip6_create_rt_rcu(&res);
2639         }
2640
2641         rcu_read_unlock();
2642
2643         trace_fib6_table_lookup(net, &res, table, fl6);
2644         return ret;
2645 };
2646
2647 static struct dst_entry *ip6_route_redirect(struct net *net,
2648                                             const struct flowi6 *fl6,
2649                                             const struct sk_buff *skb,
2650                                             const struct in6_addr *gateway)
2651 {
2652         int flags = RT6_LOOKUP_F_HAS_SADDR;
2653         struct ip6rd_flowi rdfl;
2654
2655         rdfl.fl6 = *fl6;
2656         rdfl.gateway = *gateway;
2657
2658         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2659                                 flags, __ip6_route_redirect);
2660 }
2661
2662 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2663                   kuid_t uid)
2664 {
2665         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2666         struct dst_entry *dst;
2667         struct flowi6 fl6 = {
2668                 .flowi6_iif = LOOPBACK_IFINDEX,
2669                 .flowi6_oif = oif,
2670                 .flowi6_mark = mark,
2671                 .daddr = iph->daddr,
2672                 .saddr = iph->saddr,
2673                 .flowlabel = ip6_flowinfo(iph),
2674                 .flowi6_uid = uid,
2675         };
2676
2677         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2678         rt6_do_redirect(dst, NULL, skb);
2679         dst_release(dst);
2680 }
2681 EXPORT_SYMBOL_GPL(ip6_redirect);
2682
2683 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2684 {
2685         const struct ipv6hdr *iph = ipv6_hdr(skb);
2686         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2687         struct dst_entry *dst;
2688         struct flowi6 fl6 = {
2689                 .flowi6_iif = LOOPBACK_IFINDEX,
2690                 .flowi6_oif = oif,
2691                 .daddr = msg->dest,
2692                 .saddr = iph->daddr,
2693                 .flowi6_uid = sock_net_uid(net, NULL),
2694         };
2695
2696         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2697         rt6_do_redirect(dst, NULL, skb);
2698         dst_release(dst);
2699 }
2700
2701 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2702 {
2703         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2704                      sk->sk_uid);
2705 }
2706 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2707
2708 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2709 {
2710         struct net_device *dev = dst->dev;
2711         unsigned int mtu = dst_mtu(dst);
2712         struct net *net = dev_net(dev);
2713
2714         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2715
2716         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2717                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2718
2719         /*
2720          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2721          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2722          * IPV6_MAXPLEN is also valid and means: "any MSS,
2723          * rely only on pmtu discovery"
2724          */
2725         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2726                 mtu = IPV6_MAXPLEN;
2727         return mtu;
2728 }
2729
2730 static unsigned int ip6_mtu(const struct dst_entry *dst)
2731 {
2732         struct inet6_dev *idev;
2733         unsigned int mtu;
2734
2735         mtu = dst_metric_raw(dst, RTAX_MTU);
2736         if (mtu)
2737                 goto out;
2738
2739         mtu = IPV6_MIN_MTU;
2740
2741         rcu_read_lock();
2742         idev = __in6_dev_get(dst->dev);
2743         if (idev)
2744                 mtu = idev->cnf.mtu6;
2745         rcu_read_unlock();
2746
2747 out:
2748         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2749
2750         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2751 }
2752
2753 /* MTU selection:
2754  * 1. mtu on route is locked - use it
2755  * 2. mtu from nexthop exception
2756  * 3. mtu from egress device
2757  *
2758  * based on ip6_dst_mtu_forward and exception logic of
2759  * rt6_find_cached_rt; called with rcu_read_lock
2760  */
2761 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2762                       const struct in6_addr *daddr,
2763                       const struct in6_addr *saddr)
2764 {
2765         const struct fib6_nh *nh = res->nh;
2766         struct fib6_info *f6i = res->f6i;
2767         struct inet6_dev *idev;
2768         struct rt6_info *rt;
2769         u32 mtu = 0;
2770
2771         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2772                 mtu = f6i->fib6_pmtu;
2773                 if (mtu)
2774                         goto out;
2775         }
2776
2777         rt = rt6_find_cached_rt(res, daddr, saddr);
2778         if (unlikely(rt)) {
2779                 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2780         } else {
2781                 struct net_device *dev = nh->fib_nh_dev;
2782
2783                 mtu = IPV6_MIN_MTU;
2784                 idev = __in6_dev_get(dev);
2785                 if (idev && idev->cnf.mtu6 > mtu)
2786                         mtu = idev->cnf.mtu6;
2787         }
2788
2789         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2790 out:
2791         return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2792 }
2793
2794 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2795                                   struct flowi6 *fl6)
2796 {
2797         struct dst_entry *dst;
2798         struct rt6_info *rt;
2799         struct inet6_dev *idev = in6_dev_get(dev);
2800         struct net *net = dev_net(dev);
2801
2802         if (unlikely(!idev))
2803                 return ERR_PTR(-ENODEV);
2804
2805         rt = ip6_dst_alloc(net, dev, 0);
2806         if (unlikely(!rt)) {
2807                 in6_dev_put(idev);
2808                 dst = ERR_PTR(-ENOMEM);
2809                 goto out;
2810         }
2811
2812         rt->dst.flags |= DST_HOST;
2813         rt->dst.input = ip6_input;
2814         rt->dst.output  = ip6_output;
2815         rt->rt6i_gateway  = fl6->daddr;
2816         rt->rt6i_dst.addr = fl6->daddr;
2817         rt->rt6i_dst.plen = 128;
2818         rt->rt6i_idev     = idev;
2819         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2820
2821         /* Add this dst into uncached_list so that rt6_disable_ip() can
2822          * do proper release of the net_device
2823          */
2824         rt6_uncached_list_add(rt);
2825         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2826
2827         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2828
2829 out:
2830         return dst;
2831 }
2832
2833 static int ip6_dst_gc(struct dst_ops *ops)
2834 {
2835         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2836         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2837         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2838         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2839         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2840         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2841         int entries;
2842
2843         entries = dst_entries_get_fast(ops);
2844         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2845             entries <= rt_max_size)
2846                 goto out;
2847
2848         net->ipv6.ip6_rt_gc_expire++;
2849         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2850         entries = dst_entries_get_slow(ops);
2851         if (entries < ops->gc_thresh)
2852                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2853 out:
2854         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2855         return entries > rt_max_size;
2856 }
2857
2858 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2859                                             struct fib6_config *cfg,
2860                                             const struct in6_addr *gw_addr,
2861                                             u32 tbid, int flags)
2862 {
2863         struct flowi6 fl6 = {
2864                 .flowi6_oif = cfg->fc_ifindex,
2865                 .daddr = *gw_addr,
2866                 .saddr = cfg->fc_prefsrc,
2867         };
2868         struct fib6_table *table;
2869         struct rt6_info *rt;
2870
2871         table = fib6_get_table(net, tbid);
2872         if (!table)
2873                 return NULL;
2874
2875         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2876                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2877
2878         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2879         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2880
2881         /* if table lookup failed, fall back to full lookup */
2882         if (rt == net->ipv6.ip6_null_entry) {
2883                 ip6_rt_put(rt);
2884                 rt = NULL;
2885         }
2886
2887         return rt;
2888 }
2889
2890 static int ip6_route_check_nh_onlink(struct net *net,
2891                                      struct fib6_config *cfg,
2892                                      const struct net_device *dev,
2893                                      struct netlink_ext_ack *extack)
2894 {
2895         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2896         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2897         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2898         struct fib6_info *from;
2899         struct rt6_info *grt;
2900         int err;
2901
2902         err = 0;
2903         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2904         if (grt) {
2905                 rcu_read_lock();
2906                 from = rcu_dereference(grt->from);
2907                 if (!grt->dst.error &&
2908                     /* ignore match if it is the default route */
2909                     from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2910                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2911                         NL_SET_ERR_MSG(extack,
2912                                        "Nexthop has invalid gateway or device mismatch");
2913                         err = -EINVAL;
2914                 }
2915                 rcu_read_unlock();
2916
2917                 ip6_rt_put(grt);
2918         }
2919
2920         return err;
2921 }
2922
2923 static int ip6_route_check_nh(struct net *net,
2924                               struct fib6_config *cfg,
2925                               struct net_device **_dev,
2926                               struct inet6_dev **idev)
2927 {
2928         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2929         struct net_device *dev = _dev ? *_dev : NULL;
2930         struct rt6_info *grt = NULL;
2931         int err = -EHOSTUNREACH;
2932
2933         if (cfg->fc_table) {
2934                 int flags = RT6_LOOKUP_F_IFACE;
2935
2936                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2937                                           cfg->fc_table, flags);
2938                 if (grt) {
2939                         if (grt->rt6i_flags & RTF_GATEWAY ||
2940                             (dev && dev != grt->dst.dev)) {
2941                                 ip6_rt_put(grt);
2942                                 grt = NULL;
2943                         }
2944                 }
2945         }
2946
2947         if (!grt)
2948                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2949
2950         if (!grt)
2951                 goto out;
2952
2953         if (dev) {
2954                 if (dev != grt->dst.dev) {
2955                         ip6_rt_put(grt);
2956                         goto out;
2957                 }
2958         } else {
2959                 *_dev = dev = grt->dst.dev;
2960                 *idev = grt->rt6i_idev;
2961                 dev_hold(dev);
2962                 in6_dev_hold(grt->rt6i_idev);
2963         }
2964
2965         if (!(grt->rt6i_flags & RTF_GATEWAY))
2966                 err = 0;
2967
2968         ip6_rt_put(grt);
2969
2970 out:
2971         return err;
2972 }
2973
2974 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2975                            struct net_device **_dev, struct inet6_dev **idev,
2976                            struct netlink_ext_ack *extack)
2977 {
2978         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2979         int gwa_type = ipv6_addr_type(gw_addr);
2980         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2981         const struct net_device *dev = *_dev;
2982         bool need_addr_check = !dev;
2983         int err = -EINVAL;
2984
2985         /* if gw_addr is local we will fail to detect this in case
2986          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2987          * will return already-added prefix route via interface that
2988          * prefix route was assigned to, which might be non-loopback.
2989          */
2990         if (dev &&
2991             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2992                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2993                 goto out;
2994         }
2995
2996         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2997                 /* IPv6 strictly inhibits using not link-local
2998                  * addresses as nexthop address.
2999                  * Otherwise, router will not able to send redirects.
3000                  * It is very good, but in some (rare!) circumstances
3001                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
3002                  * some exceptions. --ANK
3003                  * We allow IPv4-mapped nexthops to support RFC4798-type
3004                  * addressing
3005                  */
3006                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
3007                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
3008                         goto out;
3009                 }
3010
3011                 if (cfg->fc_flags & RTNH_F_ONLINK)
3012                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
3013                 else
3014                         err = ip6_route_check_nh(net, cfg, _dev, idev);
3015
3016                 if (err)
3017                         goto out;
3018         }
3019
3020         /* reload in case device was changed */
3021         dev = *_dev;
3022
3023         err = -EINVAL;
3024         if (!dev) {
3025                 NL_SET_ERR_MSG(extack, "Egress device not specified");
3026                 goto out;
3027         } else if (dev->flags & IFF_LOOPBACK) {
3028                 NL_SET_ERR_MSG(extack,
3029                                "Egress device can not be loopback device for this route");
3030                 goto out;
3031         }
3032
3033         /* if we did not check gw_addr above, do so now that the
3034          * egress device has been resolved.
3035          */
3036         if (need_addr_check &&
3037             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3038                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3039                 goto out;
3040         }
3041
3042         err = 0;
3043 out:
3044         return err;
3045 }
3046
3047 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
3048 {
3049         if ((flags & RTF_REJECT) ||
3050             (dev && (dev->flags & IFF_LOOPBACK) &&
3051              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3052              !(flags & RTF_LOCAL)))
3053                 return true;
3054
3055         return false;
3056 }
3057
3058 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
3059                  struct fib6_config *cfg, gfp_t gfp_flags,
3060                  struct netlink_ext_ack *extack)
3061 {
3062         struct net_device *dev = NULL;
3063         struct inet6_dev *idev = NULL;
3064         int addr_type;
3065         int err;
3066
3067         fib6_nh->fib_nh_family = AF_INET6;
3068
3069         err = -ENODEV;
3070         if (cfg->fc_ifindex) {
3071                 dev = dev_get_by_index(net, cfg->fc_ifindex);
3072                 if (!dev)
3073                         goto out;
3074                 idev = in6_dev_get(dev);
3075                 if (!idev)
3076                         goto out;
3077         }
3078
3079         if (cfg->fc_flags & RTNH_F_ONLINK) {
3080                 if (!dev) {
3081                         NL_SET_ERR_MSG(extack,
3082                                        "Nexthop device required for onlink");
3083                         goto out;
3084                 }
3085
3086                 if (!(dev->flags & IFF_UP)) {
3087                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3088                         err = -ENETDOWN;
3089                         goto out;
3090                 }
3091
3092                 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3093         }
3094
3095         fib6_nh->fib_nh_weight = 1;
3096
3097         /* We cannot add true routes via loopback here,
3098          * they would result in kernel looping; promote them to reject routes
3099          */
3100         addr_type = ipv6_addr_type(&cfg->fc_dst);
3101         if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3102                 /* hold loopback dev/idev if we haven't done so. */
3103                 if (dev != net->loopback_dev) {
3104                         if (dev) {
3105                                 dev_put(dev);
3106                                 in6_dev_put(idev);
3107                         }
3108                         dev = net->loopback_dev;
3109                         dev_hold(dev);
3110                         idev = in6_dev_get(dev);
3111                         if (!idev) {
3112                                 err = -ENODEV;
3113                                 goto out;
3114                         }
3115                 }
3116                 goto pcpu_alloc;
3117         }
3118
3119         if (cfg->fc_flags & RTF_GATEWAY) {
3120                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3121                 if (err)
3122                         goto out;
3123
3124                 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3125                 fib6_nh->fib_nh_gw_family = AF_INET6;
3126         }
3127
3128         err = -ENODEV;
3129         if (!dev)
3130                 goto out;
3131
3132         if (idev->cnf.disable_ipv6) {
3133                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3134                 err = -EACCES;
3135                 goto out;
3136         }
3137
3138         if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3139                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3140                 err = -ENETDOWN;
3141                 goto out;
3142         }
3143
3144         if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3145             !netif_carrier_ok(dev))
3146                 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3147
3148         err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3149                                  cfg->fc_encap_type, cfg, gfp_flags, extack);
3150         if (err)
3151                 goto out;
3152
3153 pcpu_alloc:
3154         fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
3155         if (!fib6_nh->rt6i_pcpu) {
3156                 err = -ENOMEM;
3157                 goto out;
3158         }
3159
3160         fib6_nh->fib_nh_dev = dev;
3161         fib6_nh->fib_nh_oif = dev->ifindex;
3162         err = 0;
3163 out:
3164         if (idev)
3165                 in6_dev_put(idev);
3166
3167         if (err) {
3168                 lwtstate_put(fib6_nh->fib_nh_lws);
3169                 fib6_nh->fib_nh_lws = NULL;
3170                 if (dev)
3171                         dev_put(dev);
3172         }
3173
3174         return err;
3175 }
3176
3177 void fib6_nh_release(struct fib6_nh *fib6_nh)
3178 {
3179         struct rt6_exception_bucket *bucket;
3180
3181         rcu_read_lock();
3182
3183         fib6_nh_flush_exceptions(fib6_nh, NULL);
3184         bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
3185         if (bucket) {
3186                 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
3187                 kfree(bucket);
3188         }
3189
3190         rcu_read_unlock();
3191
3192         if (fib6_nh->rt6i_pcpu) {
3193                 int cpu;
3194
3195                 for_each_possible_cpu(cpu) {
3196                         struct rt6_info **ppcpu_rt;
3197                         struct rt6_info *pcpu_rt;
3198
3199                         ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3200                         pcpu_rt = *ppcpu_rt;
3201                         if (pcpu_rt) {
3202                                 dst_dev_put(&pcpu_rt->dst);
3203                                 dst_release(&pcpu_rt->dst);
3204                                 *ppcpu_rt = NULL;
3205                         }
3206                 }
3207
3208                 free_percpu(fib6_nh->rt6i_pcpu);
3209         }
3210
3211         fib_nh_common_release(&fib6_nh->nh_common);
3212 }
3213
3214 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3215                                               gfp_t gfp_flags,
3216                                               struct netlink_ext_ack *extack)
3217 {
3218         struct net *net = cfg->fc_nlinfo.nl_net;
3219         struct fib6_info *rt = NULL;
3220         struct fib6_table *table;
3221         int err = -EINVAL;
3222         int addr_type;
3223
3224         /* RTF_PCPU is an internal flag; can not be set by userspace */
3225         if (cfg->fc_flags & RTF_PCPU) {
3226                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3227                 goto out;
3228         }
3229
3230         /* RTF_CACHE is an internal flag; can not be set by userspace */
3231         if (cfg->fc_flags & RTF_CACHE) {
3232                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3233                 goto out;
3234         }
3235
3236         if (cfg->fc_type > RTN_MAX) {
3237                 NL_SET_ERR_MSG(extack, "Invalid route type");
3238                 goto out;
3239         }
3240
3241         if (cfg->fc_dst_len > 128) {
3242                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3243                 goto out;
3244         }
3245         if (cfg->fc_src_len > 128) {
3246                 NL_SET_ERR_MSG(extack, "Invalid source address length");
3247                 goto out;
3248         }
3249 #ifndef CONFIG_IPV6_SUBTREES
3250         if (cfg->fc_src_len) {
3251                 NL_SET_ERR_MSG(extack,
3252                                "Specifying source address requires IPV6_SUBTREES to be enabled");
3253                 goto out;
3254         }
3255 #endif
3256
3257         err = -ENOBUFS;
3258         if (cfg->fc_nlinfo.nlh &&
3259             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3260                 table = fib6_get_table(net, cfg->fc_table);
3261                 if (!table) {
3262                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3263                         table = fib6_new_table(net, cfg->fc_table);
3264                 }
3265         } else {
3266                 table = fib6_new_table(net, cfg->fc_table);
3267         }
3268
3269         if (!table)
3270                 goto out;
3271
3272         err = -ENOMEM;
3273         rt = fib6_info_alloc(gfp_flags, true);
3274         if (!rt)
3275                 goto out;
3276
3277         rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3278                                                extack);
3279         if (IS_ERR(rt->fib6_metrics)) {
3280                 err = PTR_ERR(rt->fib6_metrics);
3281                 /* Do not leave garbage there. */
3282                 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3283                 goto out;
3284         }
3285
3286         if (cfg->fc_flags & RTF_ADDRCONF)
3287                 rt->dst_nocount = true;
3288
3289         if (cfg->fc_flags & RTF_EXPIRES)
3290                 fib6_set_expires(rt, jiffies +
3291                                 clock_t_to_jiffies(cfg->fc_expires));
3292         else
3293                 fib6_clean_expires(rt);
3294
3295         if (cfg->fc_protocol == RTPROT_UNSPEC)
3296                 cfg->fc_protocol = RTPROT_BOOT;
3297         rt->fib6_protocol = cfg->fc_protocol;
3298
3299         rt->fib6_table = table;
3300         rt->fib6_metric = cfg->fc_metric;
3301         rt->fib6_type = cfg->fc_type;
3302         rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3303
3304         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3305         rt->fib6_dst.plen = cfg->fc_dst_len;
3306         if (rt->fib6_dst.plen == 128)
3307                 rt->dst_host = true;
3308
3309 #ifdef CONFIG_IPV6_SUBTREES
3310         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3311         rt->fib6_src.plen = cfg->fc_src_len;
3312 #endif
3313         err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
3314         if (err)
3315                 goto out;
3316
3317         /* We cannot add true routes via loopback here,
3318          * they would result in kernel looping; promote them to reject routes
3319          */
3320         addr_type = ipv6_addr_type(&cfg->fc_dst);
3321         if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type))
3322                 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3323
3324         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3325                 struct net_device *dev = fib6_info_nh_dev(rt);
3326
3327                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3328                         NL_SET_ERR_MSG(extack, "Invalid source address");
3329                         err = -EINVAL;
3330                         goto out;
3331                 }
3332                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3333                 rt->fib6_prefsrc.plen = 128;
3334         } else
3335                 rt->fib6_prefsrc.plen = 0;
3336
3337         return rt;
3338 out:
3339         fib6_info_release(rt);
3340         return ERR_PTR(err);
3341 }
3342
3343 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3344                   struct netlink_ext_ack *extack)
3345 {
3346         struct fib6_info *rt;
3347         int err;
3348
3349         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3350         if (IS_ERR(rt))
3351                 return PTR_ERR(rt);
3352
3353         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3354         fib6_info_release(rt);
3355
3356         return err;
3357 }
3358
3359 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3360 {
3361         struct net *net = info->nl_net;
3362         struct fib6_table *table;
3363         int err;
3364
3365         if (rt == net->ipv6.fib6_null_entry) {
3366                 err = -ENOENT;
3367                 goto out;
3368         }
3369
3370         table = rt->fib6_table;
3371         spin_lock_bh(&table->tb6_lock);
3372         err = fib6_del(rt, info);
3373         spin_unlock_bh(&table->tb6_lock);
3374
3375 out:
3376         fib6_info_release(rt);
3377         return err;
3378 }
3379
3380 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3381 {
3382         struct nl_info info = { .nl_net = net };
3383
3384         return __ip6_del_rt(rt, &info);
3385 }
3386
3387 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3388 {
3389         struct nl_info *info = &cfg->fc_nlinfo;
3390         struct net *net = info->nl_net;
3391         struct sk_buff *skb = NULL;
3392         struct fib6_table *table;
3393         int err = -ENOENT;
3394
3395         if (rt == net->ipv6.fib6_null_entry)
3396                 goto out_put;
3397         table = rt->fib6_table;
3398         spin_lock_bh(&table->tb6_lock);
3399
3400         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3401                 struct fib6_info *sibling, *next_sibling;
3402
3403                 /* prefer to send a single notification with all hops */
3404                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3405                 if (skb) {
3406                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3407
3408                         if (rt6_fill_node(net, skb, rt, NULL,
3409                                           NULL, NULL, 0, RTM_DELROUTE,
3410                                           info->portid, seq, 0) < 0) {
3411                                 kfree_skb(skb);
3412                                 skb = NULL;
3413                         } else
3414                                 info->skip_notify = 1;
3415                 }
3416
3417                 list_for_each_entry_safe(sibling, next_sibling,
3418                                          &rt->fib6_siblings,
3419                                          fib6_siblings) {
3420                         err = fib6_del(sibling, info);
3421                         if (err)
3422                                 goto out_unlock;
3423                 }
3424         }
3425
3426         err = fib6_del(rt, info);
3427 out_unlock:
3428         spin_unlock_bh(&table->tb6_lock);
3429 out_put:
3430         fib6_info_release(rt);
3431
3432         if (skb) {
3433                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3434                             info->nlh, gfp_any());
3435         }
3436         return err;
3437 }
3438
3439 static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3440 {
3441         int rc = -ESRCH;
3442
3443         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3444                 goto out;
3445
3446         if (cfg->fc_flags & RTF_GATEWAY &&
3447             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3448                 goto out;
3449
3450         rc = rt6_remove_exception_rt(rt);
3451 out:
3452         return rc;
3453 }
3454
3455 static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
3456                              struct fib6_nh *nh)
3457 {
3458         struct fib6_result res = {
3459                 .f6i = rt,
3460                 .nh = nh,
3461         };
3462         struct rt6_info *rt_cache;
3463
3464         rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
3465         if (rt_cache)
3466                 return __ip6_del_cached_rt(rt_cache, cfg);
3467
3468         return 0;
3469 }
3470
3471 static int ip6_route_del(struct fib6_config *cfg,
3472                          struct netlink_ext_ack *extack)
3473 {
3474         struct fib6_table *table;
3475         struct fib6_info *rt;
3476         struct fib6_node *fn;
3477         int err = -ESRCH;
3478
3479         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3480         if (!table) {
3481                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3482                 return err;
3483         }
3484
3485         rcu_read_lock();
3486
3487         fn = fib6_locate(&table->tb6_root,
3488                          &cfg->fc_dst, cfg->fc_dst_len,
3489                          &cfg->fc_src, cfg->fc_src_len,
3490                          !(cfg->fc_flags & RTF_CACHE));
3491
3492         if (fn) {
3493                 for_each_fib6_node_rt_rcu(fn) {
3494                         struct fib6_nh *nh;
3495
3496                         nh = rt->fib6_nh;
3497                         if (cfg->fc_flags & RTF_CACHE) {
3498                                 int rc;
3499
3500                                 rc = ip6_del_cached_rt(cfg, rt, nh);
3501                                 if (rc != -ESRCH) {
3502                                         rcu_read_unlock();
3503                                         return rc;
3504                                 }
3505                                 continue;
3506                         }
3507
3508                         if (cfg->fc_ifindex &&
3509                             (!nh->fib_nh_dev ||
3510                              nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3511                                 continue;
3512                         if (cfg->fc_flags & RTF_GATEWAY &&
3513                             !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3514                                 continue;
3515                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3516                                 continue;
3517                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3518                                 continue;
3519                         if (!fib6_info_hold_safe(rt))
3520                                 continue;
3521                         rcu_read_unlock();
3522
3523                         /* if gateway was specified only delete the one hop */
3524                         if (cfg->fc_flags & RTF_GATEWAY)
3525                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3526
3527                         return __ip6_del_rt_siblings(rt, cfg);
3528                 }
3529         }
3530         rcu_read_unlock();
3531
3532         return err;
3533 }
3534
3535 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3536 {
3537         struct netevent_redirect netevent;
3538         struct rt6_info *rt, *nrt = NULL;
3539         struct fib6_result res = {};
3540         struct ndisc_options ndopts;
3541         struct inet6_dev *in6_dev;
3542         struct neighbour *neigh;
3543         struct rd_msg *msg;
3544         int optlen, on_link;
3545         u8 *lladdr;
3546
3547         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3548         optlen -= sizeof(*msg);
3549
3550         if (optlen < 0) {
3551                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3552                 return;
3553         }
3554
3555         msg = (struct rd_msg *)icmp6_hdr(skb);
3556
3557         if (ipv6_addr_is_multicast(&msg->dest)) {
3558                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3559                 return;
3560         }
3561
3562         on_link = 0;
3563         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3564                 on_link = 1;
3565         } else if (ipv6_addr_type(&msg->target) !=
3566                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3567                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3568                 return;
3569         }
3570
3571         in6_dev = __in6_dev_get(skb->dev);
3572         if (!in6_dev)
3573                 return;
3574         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3575                 return;
3576
3577         /* RFC2461 8.1:
3578          *      The IP source address of the Redirect MUST be the same as the current
3579          *      first-hop router for the specified ICMP Destination Address.
3580          */
3581
3582         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3583                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3584                 return;
3585         }
3586
3587         lladdr = NULL;
3588         if (ndopts.nd_opts_tgt_lladdr) {
3589                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3590                                              skb->dev);
3591                 if (!lladdr) {
3592                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3593                         return;
3594                 }
3595         }
3596
3597         rt = (struct rt6_info *) dst;
3598         if (rt->rt6i_flags & RTF_REJECT) {
3599                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3600                 return;
3601         }
3602
3603         /* Redirect received -> path was valid.
3604          * Look, redirects are sent only in response to data packets,
3605          * so that this nexthop apparently is reachable. --ANK
3606          */
3607         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3608
3609         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3610         if (!neigh)
3611                 return;
3612
3613         /*
3614          *      We have finally decided to accept it.
3615          */
3616
3617         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3618                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3619                      NEIGH_UPDATE_F_OVERRIDE|
3620                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3621                                      NEIGH_UPDATE_F_ISROUTER)),
3622                      NDISC_REDIRECT, &ndopts);
3623
3624         rcu_read_lock();
3625         res.f6i = rcu_dereference(rt->from);
3626         if (!res.f6i)
3627                 goto out;
3628
3629         res.nh = res.f6i->fib6_nh;
3630         res.fib6_flags = res.f6i->fib6_flags;
3631         res.fib6_type = res.f6i->fib6_type;
3632         nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3633         if (!nrt)
3634                 goto out;
3635
3636         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3637         if (on_link)
3638                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3639
3640         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3641
3642         /* rt6_insert_exception() will take care of duplicated exceptions */
3643         if (rt6_insert_exception(nrt, &res)) {
3644                 dst_release_immediate(&nrt->dst);
3645                 goto out;
3646         }
3647
3648         netevent.old = &rt->dst;
3649         netevent.new = &nrt->dst;
3650         netevent.daddr = &msg->dest;
3651         netevent.neigh = neigh;
3652         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3653
3654 out:
3655         rcu_read_unlock();
3656         neigh_release(neigh);
3657 }
3658
3659 #ifdef CONFIG_IPV6_ROUTE_INFO
3660 static struct fib6_info *rt6_get_route_info(struct net *net,
3661                                            const struct in6_addr *prefix, int prefixlen,
3662                                            const struct in6_addr *gwaddr,
3663                                            struct net_device *dev)
3664 {
3665         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3666         int ifindex = dev->ifindex;
3667         struct fib6_node *fn;
3668         struct fib6_info *rt = NULL;
3669         struct fib6_table *table;
3670
3671         table = fib6_get_table(net, tb_id);
3672         if (!table)
3673                 return NULL;
3674
3675         rcu_read_lock();
3676         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3677         if (!fn)
3678                 goto out;
3679
3680         for_each_fib6_node_rt_rcu(fn) {
3681                 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
3682                         continue;
3683                 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3684                     !rt->fib6_nh->fib_nh_gw_family)
3685                         continue;
3686                 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
3687                         continue;
3688                 if (!fib6_info_hold_safe(rt))
3689                         continue;
3690                 break;
3691         }
3692 out:
3693         rcu_read_unlock();
3694         return rt;
3695 }
3696
3697 static struct fib6_info *rt6_add_route_info(struct net *net,
3698                                            const struct in6_addr *prefix, int prefixlen,
3699                                            const struct in6_addr *gwaddr,
3700                                            struct net_device *dev,
3701                                            unsigned int pref)
3702 {
3703         struct fib6_config cfg = {
3704                 .fc_metric      = IP6_RT_PRIO_USER,
3705                 .fc_ifindex     = dev->ifindex,
3706                 .fc_dst_len     = prefixlen,
3707                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3708                                   RTF_UP | RTF_PREF(pref),
3709                 .fc_protocol = RTPROT_RA,
3710                 .fc_type = RTN_UNICAST,
3711                 .fc_nlinfo.portid = 0,
3712                 .fc_nlinfo.nlh = NULL,
3713                 .fc_nlinfo.nl_net = net,
3714         };
3715
3716         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3717         cfg.fc_dst = *prefix;
3718         cfg.fc_gateway = *gwaddr;
3719
3720         /* We should treat it as a default route if prefix length is 0. */
3721         if (!prefixlen)
3722                 cfg.fc_flags |= RTF_DEFAULT;
3723
3724         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3725
3726         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3727 }
3728 #endif
3729
3730 struct fib6_info *rt6_get_dflt_router(struct net *net,
3731                                      const struct in6_addr *addr,
3732                                      struct net_device *dev)
3733 {
3734         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3735         struct fib6_info *rt;
3736         struct fib6_table *table;
3737
3738         table = fib6_get_table(net, tb_id);
3739         if (!table)
3740                 return NULL;
3741
3742         rcu_read_lock();
3743         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3744                 struct fib6_nh *nh = rt->fib6_nh;
3745
3746                 if (dev == nh->fib_nh_dev &&
3747                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3748                     ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3749                         break;
3750         }
3751         if (rt && !fib6_info_hold_safe(rt))
3752                 rt = NULL;
3753         rcu_read_unlock();
3754         return rt;
3755 }
3756
3757 struct fib6_info *rt6_add_dflt_router(struct net *net,
3758                                      const struct in6_addr *gwaddr,
3759                                      struct net_device *dev,
3760                                      unsigned int pref)
3761 {
3762         struct fib6_config cfg = {
3763                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3764                 .fc_metric      = IP6_RT_PRIO_USER,
3765                 .fc_ifindex     = dev->ifindex,
3766                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3767                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3768                 .fc_protocol = RTPROT_RA,
3769                 .fc_type = RTN_UNICAST,
3770                 .fc_nlinfo.portid = 0,
3771                 .fc_nlinfo.nlh = NULL,
3772                 .fc_nlinfo.nl_net = net,
3773         };
3774
3775         cfg.fc_gateway = *gwaddr;
3776
3777         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3778                 struct fib6_table *table;
3779
3780                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3781                 if (table)
3782                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3783         }
3784
3785         return rt6_get_dflt_router(net, gwaddr, dev);
3786 }
3787
3788 static void __rt6_purge_dflt_routers(struct net *net,
3789                                      struct fib6_table *table)
3790 {
3791         struct fib6_info *rt;
3792
3793 restart:
3794         rcu_read_lock();
3795         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3796                 struct net_device *dev = fib6_info_nh_dev(rt);
3797                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3798
3799                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3800                     (!idev || idev->cnf.accept_ra != 2) &&
3801                     fib6_info_hold_safe(rt)) {
3802                         rcu_read_unlock();
3803                         ip6_del_rt(net, rt);
3804                         goto restart;
3805                 }
3806         }
3807         rcu_read_unlock();
3808
3809         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3810 }
3811
3812 void rt6_purge_dflt_routers(struct net *net)
3813 {
3814         struct fib6_table *table;
3815         struct hlist_head *head;
3816         unsigned int h;
3817
3818         rcu_read_lock();
3819
3820         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3821                 head = &net->ipv6.fib_table_hash[h];
3822                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3823                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3824                                 __rt6_purge_dflt_routers(net, table);
3825                 }
3826         }
3827
3828         rcu_read_unlock();
3829 }
3830
3831 static void rtmsg_to_fib6_config(struct net *net,
3832                                  struct in6_rtmsg *rtmsg,
3833                                  struct fib6_config *cfg)
3834 {
3835         *cfg = (struct fib6_config){
3836                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3837                          : RT6_TABLE_MAIN,
3838                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3839                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3840                 .fc_expires = rtmsg->rtmsg_info,
3841                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3842                 .fc_src_len = rtmsg->rtmsg_src_len,
3843                 .fc_flags = rtmsg->rtmsg_flags,
3844                 .fc_type = rtmsg->rtmsg_type,
3845
3846                 .fc_nlinfo.nl_net = net,
3847
3848                 .fc_dst = rtmsg->rtmsg_dst,
3849                 .fc_src = rtmsg->rtmsg_src,
3850                 .fc_gateway = rtmsg->rtmsg_gateway,
3851         };
3852 }
3853
3854 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3855 {
3856         struct fib6_config cfg;
3857         struct in6_rtmsg rtmsg;
3858         int err;
3859
3860         switch (cmd) {
3861         case SIOCADDRT:         /* Add a route */
3862         case SIOCDELRT:         /* Delete a route */
3863                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3864                         return -EPERM;
3865                 err = copy_from_user(&rtmsg, arg,
3866                                      sizeof(struct in6_rtmsg));
3867                 if (err)
3868                         return -EFAULT;
3869
3870                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3871
3872                 rtnl_lock();
3873                 switch (cmd) {
3874                 case SIOCADDRT:
3875                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3876                         break;
3877                 case SIOCDELRT:
3878                         err = ip6_route_del(&cfg, NULL);
3879                         break;
3880                 default:
3881                         err = -EINVAL;
3882                 }
3883                 rtnl_unlock();
3884
3885                 return err;
3886         }
3887
3888         return -EINVAL;
3889 }
3890
3891 /*
3892  *      Drop the packet on the floor
3893  */
3894
3895 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3896 {
3897         struct dst_entry *dst = skb_dst(skb);
3898         struct net *net = dev_net(dst->dev);
3899         struct inet6_dev *idev;
3900         int type;
3901
3902         if (netif_is_l3_master(skb->dev) &&
3903             dst->dev == net->loopback_dev)
3904                 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3905         else
3906                 idev = ip6_dst_idev(dst);
3907
3908         switch (ipstats_mib_noroutes) {
3909         case IPSTATS_MIB_INNOROUTES:
3910                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3911                 if (type == IPV6_ADDR_ANY) {
3912                         IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3913                         break;
3914                 }
3915                 /* FALLTHROUGH */
3916         case IPSTATS_MIB_OUTNOROUTES:
3917                 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3918                 break;
3919         }
3920
3921         /* Start over by dropping the dst for l3mdev case */
3922         if (netif_is_l3_master(skb->dev))
3923                 skb_dst_drop(skb);
3924
3925         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3926         kfree_skb(skb);
3927         return 0;
3928 }
3929
3930 static int ip6_pkt_discard(struct sk_buff *skb)
3931 {
3932         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3933 }
3934
3935 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3936 {
3937         skb->dev = skb_dst(skb)->dev;
3938         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3939 }
3940
3941 static int ip6_pkt_prohibit(struct sk_buff *skb)
3942 {
3943         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3944 }
3945
3946 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3947 {
3948         skb->dev = skb_dst(skb)->dev;
3949         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3950 }
3951
3952 /*
3953  *      Allocate a dst for local (unicast / anycast) address.
3954  */
3955
3956 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3957                                      struct inet6_dev *idev,
3958                                      const struct in6_addr *addr,
3959                                      bool anycast, gfp_t gfp_flags)
3960 {
3961         struct fib6_config cfg = {
3962                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3963                 .fc_ifindex = idev->dev->ifindex,
3964                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3965                 .fc_dst = *addr,
3966                 .fc_dst_len = 128,
3967                 .fc_protocol = RTPROT_KERNEL,
3968                 .fc_nlinfo.nl_net = net,
3969                 .fc_ignore_dev_down = true,
3970         };
3971
3972         if (anycast) {
3973                 cfg.fc_type = RTN_ANYCAST;
3974                 cfg.fc_flags |= RTF_ANYCAST;
3975         } else {
3976                 cfg.fc_type = RTN_LOCAL;
3977                 cfg.fc_flags |= RTF_LOCAL;
3978         }
3979
3980         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3981 }
3982
/* remove deleted ip from prefsrc entries
 *
 * Argument block handed to the fib6_remove_prefsrc() walker callback.
 */
struct arg_dev_net_ip {
	/* nexthop device a route must use to match; NULL matches any */
	struct net_device *dev;
	/* namespace whose fib6_null_entry is skipped by the callback */
	struct net *net;
	/* address compared against each route's fib6_prefsrc.addr */
	struct in6_addr *addr;
};
3989
3990 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3991 {
3992         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3993         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3994         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3995
3996         if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
3997             rt != net->ipv6.fib6_null_entry &&
3998             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3999                 spin_lock_bh(&rt6_exception_lock);
4000                 /* remove prefsrc entry */
4001                 rt->fib6_prefsrc.plen = 0;
4002                 spin_unlock_bh(&rt6_exception_lock);
4003         }
4004         return 0;
4005 }
4006
4007 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
4008 {
4009         struct net *net = dev_net(ifp->idev->dev);
4010         struct arg_dev_net_ip adni = {
4011                 .dev = ifp->idev->dev,
4012                 .net = net,
4013                 .addr = &ifp->addr,
4014         };
4015         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
4016 }
4017
4018 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
4019
4020 /* Remove routers and update dst entries when gateway turn into host. */
4021 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
4022 {
4023         struct in6_addr *gateway = (struct in6_addr *)arg;
4024         struct fib6_nh *nh = rt->fib6_nh;
4025
4026         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
4027             nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
4028                 return -1;
4029
4030         /* Further clean up cached routes in exception table.
4031          * This is needed because cached route may have a different
4032          * gateway than its 'parent' in the case of an ip redirect.
4033          */
4034         fib6_nh_exceptions_clean_tohost(nh, gateway);
4035
4036         return 0;
4037 }
4038
4039 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
4040 {
4041         fib6_clean_all(net, fib6_clean_tohost, gateway);
4042 }
4043
/* Argument block shared by the fib6_ifup() and fib6_ifdown() walker
 * callbacks.  Only one member of the union is meaningful per walk.
 */
struct arg_netdev_event {
	/* device the event refers to */
	const struct net_device *dev;
	union {
		/* fib6_ifup(): nexthop flags to clear on matching routes */
		unsigned char nh_flags;
		/* fib6_ifdown(): the NETDEV_* event being processed */
		unsigned long event;
	};
};
4051
4052 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
4053 {
4054         struct fib6_info *iter;
4055         struct fib6_node *fn;
4056
4057         fn = rcu_dereference_protected(rt->fib6_node,
4058                         lockdep_is_held(&rt->fib6_table->tb6_lock));
4059         iter = rcu_dereference_protected(fn->leaf,
4060                         lockdep_is_held(&rt->fib6_table->tb6_lock));
4061         while (iter) {
4062                 if (iter->fib6_metric == rt->fib6_metric &&
4063                     rt6_qualify_for_ecmp(iter))
4064                         return iter;
4065                 iter = rcu_dereference_protected(iter->fib6_next,
4066                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
4067         }
4068
4069         return NULL;
4070 }
4071
4072 static bool rt6_is_dead(const struct fib6_info *rt)
4073 {
4074         if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
4075             (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
4076              ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
4077                 return true;
4078
4079         return false;
4080 }
4081
4082 static int rt6_multipath_total_weight(const struct fib6_info *rt)
4083 {
4084         struct fib6_info *iter;
4085         int total = 0;
4086
4087         if (!rt6_is_dead(rt))
4088                 total += rt->fib6_nh->fib_nh_weight;
4089
4090         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
4091                 if (!rt6_is_dead(iter))
4092                         total += iter->fib6_nh->fib_nh_weight;
4093         }
4094
4095         return total;
4096 }
4097
4098 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
4099 {
4100         int upper_bound = -1;
4101
4102         if (!rt6_is_dead(rt)) {
4103                 *weight += rt->fib6_nh->fib_nh_weight;
4104                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
4105                                                     total) - 1;
4106         }
4107         atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
4108 }
4109
4110 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
4111 {
4112         struct fib6_info *iter;
4113         int weight = 0;
4114
4115         rt6_upper_bound_set(rt, &weight, total);
4116
4117         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4118                 rt6_upper_bound_set(iter, &weight, total);
4119 }
4120
4121 void rt6_multipath_rebalance(struct fib6_info *rt)
4122 {
4123         struct fib6_info *first;
4124         int total;
4125
4126         /* In case the entire multipath route was marked for flushing,
4127          * then there is no need to rebalance upon the removal of every
4128          * sibling route.
4129          */
4130         if (!rt->fib6_nsiblings || rt->should_flush)
4131                 return;
4132
4133         /* During lookup routes are evaluated in order, so we need to
4134          * make sure upper bounds are assigned from the first sibling
4135          * onwards.
4136          */
4137         first = rt6_multipath_first_sibling(rt);
4138         if (WARN_ON_ONCE(!first))
4139                 return;
4140
4141         total = rt6_multipath_total_weight(first);
4142         rt6_multipath_upper_bound_set(first, total);
4143 }
4144
4145 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4146 {
4147         const struct arg_netdev_event *arg = p_arg;
4148         struct net *net = dev_net(arg->dev);
4149
4150         if (rt != net->ipv6.fib6_null_entry &&
4151             rt->fib6_nh->fib_nh_dev == arg->dev) {
4152                 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
4153                 fib6_update_sernum_upto_root(net, rt);
4154                 rt6_multipath_rebalance(rt);
4155         }
4156
4157         return 0;
4158 }
4159
4160 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4161 {
4162         struct arg_netdev_event arg = {
4163                 .dev = dev,
4164                 {
4165                         .nh_flags = nh_flags,
4166                 },
4167         };
4168
4169         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4170                 arg.nh_flags |= RTNH_F_LINKDOWN;
4171
4172         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4173 }
4174
4175 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4176                                    const struct net_device *dev)
4177 {
4178         struct fib6_info *iter;
4179
4180         if (rt->fib6_nh->fib_nh_dev == dev)
4181                 return true;
4182         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4183                 if (iter->fib6_nh->fib_nh_dev == dev)
4184                         return true;
4185
4186         return false;
4187 }
4188
4189 static void rt6_multipath_flush(struct fib6_info *rt)
4190 {
4191         struct fib6_info *iter;
4192
4193         rt->should_flush = 1;
4194         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4195                 iter->should_flush = 1;
4196 }
4197
4198 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4199                                              const struct net_device *down_dev)
4200 {
4201         struct fib6_info *iter;
4202         unsigned int dead = 0;
4203
4204         if (rt->fib6_nh->fib_nh_dev == down_dev ||
4205             rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4206                 dead++;
4207         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4208                 if (iter->fib6_nh->fib_nh_dev == down_dev ||
4209                     iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4210                         dead++;
4211
4212         return dead;
4213 }
4214
4215 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4216                                        const struct net_device *dev,
4217                                        unsigned char nh_flags)
4218 {
4219         struct fib6_info *iter;
4220
4221         if (rt->fib6_nh->fib_nh_dev == dev)
4222                 rt->fib6_nh->fib_nh_flags |= nh_flags;
4223         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4224                 if (iter->fib6_nh->fib_nh_dev == dev)
4225                         iter->fib6_nh->fib_nh_flags |= nh_flags;
4226 }
4227
4228 /* called with write lock held for table with rt */
4229 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4230 {
4231         const struct arg_netdev_event *arg = p_arg;
4232         const struct net_device *dev = arg->dev;
4233         struct net *net = dev_net(dev);
4234
4235         if (rt == net->ipv6.fib6_null_entry)
4236                 return 0;
4237
4238         switch (arg->event) {
4239         case NETDEV_UNREGISTER:
4240                 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4241         case NETDEV_DOWN:
4242                 if (rt->should_flush)
4243                         return -1;
4244                 if (!rt->fib6_nsiblings)
4245                         return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4246                 if (rt6_multipath_uses_dev(rt, dev)) {
4247                         unsigned int count;
4248
4249                         count = rt6_multipath_dead_count(rt, dev);
4250                         if (rt->fib6_nsiblings + 1 == count) {
4251                                 rt6_multipath_flush(rt);
4252                                 return -1;
4253                         }
4254                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4255                                                    RTNH_F_LINKDOWN);
4256                         fib6_update_sernum(net, rt);
4257                         rt6_multipath_rebalance(rt);
4258                 }
4259                 return -2;
4260         case NETDEV_CHANGE:
4261                 if (rt->fib6_nh->fib_nh_dev != dev ||
4262                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4263                         break;
4264                 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
4265                 rt6_multipath_rebalance(rt);
4266                 break;
4267         }
4268
4269         return 0;
4270 }
4271
4272 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4273 {
4274         struct arg_netdev_event arg = {
4275                 .dev = dev,
4276                 {
4277                         .event = event,
4278                 },
4279         };
4280         struct net *net = dev_net(dev);
4281
4282         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4283                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4284         else
4285                 fib6_clean_all(net, fib6_ifdown, &arg);
4286 }
4287
4288 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4289 {
4290         rt6_sync_down_dev(dev, event);
4291         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4292         neigh_ifdown(&nd_tbl, dev);
4293 }
4294
/* Argument bundle passed through fib6_clean_all() to rt6_mtu_change_route()
 * and on to fib6_nh_mtu_change() when a device MTU changes.
 */
4295 struct rt6_mtu_change_arg {
4296         struct net_device *dev;        /* device whose MTU changed */
4297         unsigned int mtu;              /* the new MTU value */
4298         struct fib6_info *f6i;         /* route being visited; set by rt6_mtu_change_route() */
4299 };
4300
4301 static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4302 {
4303         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4304         struct fib6_info *f6i = arg->f6i;
4305
4306         /* For administrative MTU increase, there is no way to discover
4307          * IPv6 PMTU increase, so PMTU increase should be updated here.
4308          * Since RFC 1981 doesn't include administrative MTU increase
4309          * update PMTU increase is a MUST. (i.e. jumbo frame)
4310          */
4311         if (nh->fib_nh_dev == arg->dev) {
4312                 struct inet6_dev *idev = __in6_dev_get(arg->dev);
4313                 u32 mtu = f6i->fib6_pmtu;
4314
4315                 if (mtu >= arg->mtu ||
4316                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4317                         fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4318
4319                 spin_lock_bh(&rt6_exception_lock);
4320                 rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4321                 spin_unlock_bh(&rt6_exception_lock);
4322         }
4323
4324         return 0;
4325 }
4326
4327 static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
4328 {
4329         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4330         struct inet6_dev *idev;
4331
4332         /* In IPv6 pmtu discovery is not optional,
4333            so that RTAX_MTU lock cannot disable it.
4334            We still use this lock to block changes
4335            caused by addrconf/ndisc.
4336         */
4337
4338         idev = __in6_dev_get(arg->dev);
4339         if (!idev)
4340                 return 0;
4341
4342         if (fib6_metric_locked(f6i, RTAX_MTU))
4343                 return 0;
4344
4345         arg->f6i = f6i;
4346         return fib6_nh_mtu_change(f6i->fib6_nh, arg);
4347 }
4348
4349 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4350 {
4351         struct rt6_mtu_change_arg arg = {
4352                 .dev = dev,
4353                 .mtu = mtu,
4354         };
4355
4356         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4357 }
4358
/* Netlink attribute policy for IPv6 route messages, indexed by RTA_* type.
 * It is handed to the nlmsg_parse_deprecated*() calls in this file
 * (rtm_to_fib6_config() and inet6_rtm_valid_getroute_req()).
 *
 * Entries give either an expected attribute type (.type) or a length
 * (.len). RTA_DST and RTA_SRC have no entry here; rtm_to_fib6_config()
 * checks their length by hand against the prefix length in the header.
 *
 * NOTE(review): .strict_start_type on RTA_UNSPEC presumably marks the first
 * attribute type subject to strict validation - confirm against the netlink
 * policy documentation.
 */
4359 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4360         [RTA_UNSPEC]            = { .strict_start_type = RTA_DPORT + 1 },
4361         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4362         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4363         [RTA_OIF]               = { .type = NLA_U32 },
4364         [RTA_IIF]               = { .type = NLA_U32 },
4365         [RTA_PRIORITY]          = { .type = NLA_U32 },
4366         [RTA_METRICS]           = { .type = NLA_NESTED },
4367         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4368         [RTA_PREF]              = { .type = NLA_U8 },
4369         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4370         [RTA_ENCAP]             = { .type = NLA_NESTED },
4371         [RTA_EXPIRES]           = { .type = NLA_U32 },
4372         [RTA_UID]               = { .type = NLA_U32 },
4373         [RTA_MARK]              = { .type = NLA_U32 },
4374         [RTA_TABLE]             = { .type = NLA_U32 },
4375         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4376         [RTA_SPORT]             = { .type = NLA_U16 },
4377         [RTA_DPORT]             = { .type = NLA_U16 },
4378 };
4379
4380 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4381                               struct fib6_config *cfg,
4382                               struct netlink_ext_ack *extack)
4383 {
4384         struct rtmsg *rtm;
4385         struct nlattr *tb[RTA_MAX+1];
4386         unsigned int pref;
4387         int err;
4388
4389         err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4390                                      rtm_ipv6_policy, extack);
4391         if (err < 0)
4392                 goto errout;
4393
4394         err = -EINVAL;
4395         rtm = nlmsg_data(nlh);
4396
4397         *cfg = (struct fib6_config){
4398                 .fc_table = rtm->rtm_table,
4399                 .fc_dst_len = rtm->rtm_dst_len,
4400                 .fc_src_len = rtm->rtm_src_len,
4401                 .fc_flags = RTF_UP,
4402                 .fc_protocol = rtm->rtm_protocol,
4403                 .fc_type = rtm->rtm_type,
4404
4405                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4406                 .fc_nlinfo.nlh = nlh,
4407                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4408         };
4409
4410         if (rtm->rtm_type == RTN_UNREACHABLE ||
4411             rtm->rtm_type == RTN_BLACKHOLE ||
4412             rtm->rtm_type == RTN_PROHIBIT ||
4413             rtm->rtm_type == RTN_THROW)
4414                 cfg->fc_flags |= RTF_REJECT;
4415
4416         if (rtm->rtm_type == RTN_LOCAL)
4417                 cfg->fc_flags |= RTF_LOCAL;
4418
4419         if (rtm->rtm_flags & RTM_F_CLONED)
4420                 cfg->fc_flags |= RTF_CACHE;
4421
4422         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4423
4424         if (tb[RTA_GATEWAY]) {
4425                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4426                 cfg->fc_flags |= RTF_GATEWAY;
4427         }
4428         if (tb[RTA_VIA]) {
4429                 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4430                 goto errout;
4431         }
4432
4433         if (tb[RTA_DST]) {
4434                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4435
4436                 if (nla_len(tb[RTA_DST]) < plen)
4437                         goto errout;
4438
4439                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4440         }
4441
4442         if (tb[RTA_SRC]) {
4443                 int plen = (rtm->rtm_src_len + 7) >> 3;
4444
4445                 if (nla_len(tb[RTA_SRC]) < plen)
4446                         goto errout;
4447
4448                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4449         }
4450
4451         if (tb[RTA_PREFSRC])
4452                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4453
4454         if (tb[RTA_OIF])
4455                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4456
4457         if (tb[RTA_PRIORITY])
4458                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4459
4460         if (tb[RTA_METRICS]) {
4461                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4462                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4463         }
4464
4465         if (tb[RTA_TABLE])
4466                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4467
4468         if (tb[RTA_MULTIPATH]) {
4469                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4470                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4471
4472                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4473                                                      cfg->fc_mp_len, extack);
4474                 if (err < 0)
4475                         goto errout;
4476         }
4477
4478         if (tb[RTA_PREF]) {
4479                 pref = nla_get_u8(tb[RTA_PREF]);
4480                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4481                     pref != ICMPV6_ROUTER_PREF_HIGH)
4482                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4483                 cfg->fc_flags |= RTF_PREF(pref);
4484         }
4485
4486         if (tb[RTA_ENCAP])
4487                 cfg->fc_encap = tb[RTA_ENCAP];
4488
4489         if (tb[RTA_ENCAP_TYPE]) {
4490                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4491
4492                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4493                 if (err < 0)
4494                         goto errout;
4495         }
4496
4497         if (tb[RTA_EXPIRES]) {
4498                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4499
4500                 if (addrconf_finite_timeout(timeout)) {
4501                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4502                         cfg->fc_flags |= RTF_EXPIRES;
4503                 }
4504         }
4505
4506         err = 0;
4507 errout:
4508         return err;
4509 }
4510
/* One parsed nexthop of a multipath request, queued on the temporary
 * rt6_nh_list built by ip6_route_multipath_add().
 */
4511 struct rt6_nh {
4512         struct fib6_info *fib6_info;   /* route created for this nexthop; reset to NULL once insertion was attempted */
4513         struct fib6_config r_cfg;      /* copy of the per-nexthop config; passed to ip6_route_del() on rollback */
4514         struct list_head next;         /* linkage in rt6_nh_list */
4515 };
4516
4517 static int ip6_route_info_append(struct net *net,
4518                                  struct list_head *rt6_nh_list,
4519                                  struct fib6_info *rt,
4520                                  struct fib6_config *r_cfg)
4521 {
4522         struct rt6_nh *nh;
4523         int err = -EEXIST;
4524
4525         list_for_each_entry(nh, rt6_nh_list, next) {
4526                 /* check if fib6_info already exists */
4527                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4528                         return err;
4529         }
4530
4531         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4532         if (!nh)
4533                 return -ENOMEM;
4534         nh->fib6_info = rt;
4535         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4536         list_add_tail(&nh->next, rt6_nh_list);
4537
4538         return 0;
4539 }
4540
4541 static void ip6_route_mpath_notify(struct fib6_info *rt,
4542                                    struct fib6_info *rt_last,
4543                                    struct nl_info *info,
4544                                    __u16 nlflags)
4545 {
4546         /* if this is an APPEND route, then rt points to the first route
4547          * inserted and rt_last points to last route inserted. Userspace
4548          * wants a consistent dump of the route which starts at the first
4549          * nexthop. Since sibling routes are always added at the end of
4550          * the list, find the first sibling of the last route appended
4551          */
4552         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4553                 rt = list_first_entry(&rt_last->fib6_siblings,
4554                                       struct fib6_info,
4555                                       fib6_siblings);
4556         }
4557
4558         if (rt)
4559                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4560 }
4561
4562 static int ip6_route_multipath_add(struct fib6_config *cfg,
4563                                    struct netlink_ext_ack *extack)
4564 {
4565         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4566         struct nl_info *info = &cfg->fc_nlinfo;
4567         struct fib6_config r_cfg;
4568         struct rtnexthop *rtnh;
4569         struct fib6_info *rt;
4570         struct rt6_nh *err_nh;
4571         struct rt6_nh *nh, *nh_safe;
4572         __u16 nlflags;
4573         int remaining;
4574         int attrlen;
4575         int err = 1;
4576         int nhn = 0;
4577         int replace = (cfg->fc_nlinfo.nlh &&
4578                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4579         LIST_HEAD(rt6_nh_list);
4580
4581         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4582         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4583                 nlflags |= NLM_F_APPEND;
4584
4585         remaining = cfg->fc_mp_len;
4586         rtnh = (struct rtnexthop *)cfg->fc_mp;
4587
4588         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4589          * fib6_info structs per nexthop
4590          */
4591         while (rtnh_ok(rtnh, remaining)) {
4592                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4593                 if (rtnh->rtnh_ifindex)
4594                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4595
4596                 attrlen = rtnh_attrlen(rtnh);
4597                 if (attrlen > 0) {
4598                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4599
4600                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4601                         if (nla) {
4602                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4603                                 r_cfg.fc_flags |= RTF_GATEWAY;
4604                         }
4605                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4606                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4607                         if (nla)
4608                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4609                 }
4610
4611                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4612                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4613                 if (IS_ERR(rt)) {
4614                         err = PTR_ERR(rt);
4615                         rt = NULL;
4616                         goto cleanup;
4617                 }
4618                 if (!rt6_qualify_for_ecmp(rt)) {
4619                         err = -EINVAL;
4620                         NL_SET_ERR_MSG(extack,
4621                                        "Device only routes can not be added for IPv6 using the multipath API.");
4622                         fib6_info_release(rt);
4623                         goto cleanup;
4624                 }
4625
4626                 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
4627
4628                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4629                                             rt, &r_cfg);
4630                 if (err) {
4631                         fib6_info_release(rt);
4632                         goto cleanup;
4633                 }
4634
4635                 rtnh = rtnh_next(rtnh, &remaining);
4636         }
4637
4638         /* for add and replace send one notification with all nexthops.
4639          * Skip the notification in fib6_add_rt2node and send one with
4640          * the full route when done
4641          */
4642         info->skip_notify = 1;
4643
4644         err_nh = NULL;
4645         list_for_each_entry(nh, &rt6_nh_list, next) {
4646                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4647                 fib6_info_release(nh->fib6_info);
4648
4649                 if (!err) {
4650                         /* save reference to last route successfully inserted */
4651                         rt_last = nh->fib6_info;
4652
4653                         /* save reference to first route for notification */
4654                         if (!rt_notif)
4655                                 rt_notif = nh->fib6_info;
4656                 }
4657
4658                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4659                 nh->fib6_info = NULL;
4660                 if (err) {
4661                         if (replace && nhn)
4662                                 NL_SET_ERR_MSG_MOD(extack,
4663                                                    "multipath route replace failed (check consistency of installed routes)");
4664                         err_nh = nh;
4665                         goto add_errout;
4666                 }
4667
4668                 /* Because each route is added like a single route we remove
4669                  * these flags after the first nexthop: if there is a collision,
4670                  * we have already failed to add the first nexthop:
4671                  * fib6_add_rt2node() has rejected it; when replacing, old
4672                  * nexthops have been replaced by first new, the rest should
4673                  * be added to it.
4674                  */
4675                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4676                                                      NLM_F_REPLACE);
4677                 nhn++;
4678         }
4679
4680         /* success ... tell user about new route */
4681         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4682         goto cleanup;
4683
4684 add_errout:
4685         /* send notification for routes that were added so that
4686          * the delete notifications sent by ip6_route_del are
4687          * coherent
4688          */
4689         if (rt_notif)
4690                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4691
4692         /* Delete routes that were already added */
4693         list_for_each_entry(nh, &rt6_nh_list, next) {
4694                 if (err_nh == nh)
4695                         break;
4696                 ip6_route_del(&nh->r_cfg, extack);
4697         }
4698
4699 cleanup:
4700         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4701                 if (nh->fib6_info)
4702                         fib6_info_release(nh->fib6_info);
4703                 list_del(&nh->next);
4704                 kfree(nh);
4705         }
4706
4707         return err;
4708 }
4709
4710 static int ip6_route_multipath_del(struct fib6_config *cfg,
4711                                    struct netlink_ext_ack *extack)
4712 {
4713         struct fib6_config r_cfg;
4714         struct rtnexthop *rtnh;
4715         int remaining;
4716         int attrlen;
4717         int err = 1, last_err = 0;
4718
4719         remaining = cfg->fc_mp_len;
4720         rtnh = (struct rtnexthop *)cfg->fc_mp;
4721
4722         /* Parse a Multipath Entry */
4723         while (rtnh_ok(rtnh, remaining)) {
4724                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4725                 if (rtnh->rtnh_ifindex)
4726                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4727
4728                 attrlen = rtnh_attrlen(rtnh);
4729                 if (attrlen > 0) {
4730                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4731
4732                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4733                         if (nla) {
4734                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4735                                 r_cfg.fc_flags |= RTF_GATEWAY;
4736                         }
4737                 }
4738                 err = ip6_route_del(&r_cfg, extack);
4739                 if (err)
4740                         last_err = err;
4741
4742                 rtnh = rtnh_next(rtnh, &remaining);
4743         }
4744
4745         return last_err;
4746 }
4747
4748 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4749                               struct netlink_ext_ack *extack)
4750 {
4751         struct fib6_config cfg;
4752         int err;
4753
4754         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4755         if (err < 0)
4756                 return err;
4757
4758         if (cfg.fc_mp)
4759                 return ip6_route_multipath_del(&cfg, extack);
4760         else {
4761                 cfg.fc_delete_all_nh = 1;
4762                 return ip6_route_del(&cfg, extack);
4763         }
4764 }
4765
4766 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4767                               struct netlink_ext_ack *extack)
4768 {
4769         struct fib6_config cfg;
4770         int err;
4771
4772         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4773         if (err < 0)
4774                 return err;
4775
4776         if (cfg.fc_metric == 0)
4777                 cfg.fc_metric = IP6_RT_PRIO_USER;
4778
4779         if (cfg.fc_mp)
4780                 return ip6_route_multipath_add(&cfg, extack);
4781         else
4782                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4783 }
4784
4785 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4786 {
4787         int nexthop_len = 0;
4788
4789         if (rt->fib6_nsiblings) {
4790                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4791                             + NLA_ALIGN(sizeof(struct rtnexthop))
4792                             + nla_total_size(16) /* RTA_GATEWAY */
4793                             + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws);
4794
4795                 nexthop_len *= rt->fib6_nsiblings;
4796         }
4797
4798         return NLMSG_ALIGN(sizeof(struct rtmsg))
4799                + nla_total_size(16) /* RTA_SRC */
4800                + nla_total_size(16) /* RTA_DST */
4801                + nla_total_size(16) /* RTA_GATEWAY */
4802                + nla_total_size(16) /* RTA_PREFSRC */
4803                + nla_total_size(4) /* RTA_TABLE */
4804                + nla_total_size(4) /* RTA_IIF */
4805                + nla_total_size(4) /* RTA_OIF */
4806                + nla_total_size(4) /* RTA_PRIORITY */
4807                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4808                + nla_total_size(sizeof(struct rta_cacheinfo))
4809                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4810                + nla_total_size(1) /* RTA_PREF */
4811                + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws)
4812                + nexthop_len;
4813 }
4814
/* Build one rtnetlink route message in @skb describing @rt, or the dst
 * clone @dst when one is supplied.
 *
 * @net:    namespace handed to ip6mr_get_route() and ip6_route_get_saddr()
 * @skb:    message buffer being filled
 * @rt:     FIB entry; supplies table, type, protocol, prefsrc, priority and
 *          (when @dst is NULL) keys, flags, metrics, nexthops and expiry
 * @dst:    optional dst entry, treated as a struct rt6_info; when non-NULL
 *          its keys, flags, metrics, gateway, device, expiry and error are
 *          reported instead of the corresponding @rt values
 * @dest:   optional destination address; when set it is emitted as RTA_DST
 *          and rtm_dst_len is forced to 128
 * @src:    optional source address, same treatment as @dest
 *          (only with CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index; non-zero selects the RTA_IIF / multicast
 *          branch, zero with @dest set adds a looked-up RTA_PREFSRC
 * @type, @portid, @seq, @flags: passed to nlmsg_put() for the header
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header cannot be added,
 * or - after cancelling the partial message - whenever a step jumps to
 * nla_put_failure.
 */
4815 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4816                          struct fib6_info *rt, struct dst_entry *dst,
4817                          struct in6_addr *dest, struct in6_addr *src,
4818                          int iif, int type, u32 portid, u32 seq,
4819                          unsigned int flags)
4820 {
4821         struct rt6_info *rt6 = (struct rt6_info *)dst;
4822         struct rt6key *rt6_dst, *rt6_src;
4823         u32 *pmetrics, table, rt6_flags;
4824         struct nlmsghdr *nlh;
4825         struct rtmsg *rtm;
4826         long expires = 0;
4827
4828         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4829         if (!nlh)
4830                 return -EMSGSIZE;
4831
             /* take keys and flags from the dst clone when one was given */
4832         if (rt6) {
4833                 rt6_dst = &rt6->rt6i_dst;
4834                 rt6_src = &rt6->rt6i_src;
4835                 rt6_flags = rt6->rt6i_flags;
4836         } else {
4837                 rt6_dst = &rt->fib6_dst;
4838                 rt6_src = &rt->fib6_src;
4839                 rt6_flags = rt->fib6_flags;
4840         }
4841
4842         rtm = nlmsg_data(nlh);
4843         rtm->rtm_family = AF_INET6;
4844         rtm->rtm_dst_len = rt6_dst->plen;
4845         rtm->rtm_src_len = rt6_src->plen;
4846         rtm->rtm_tos = 0;
4847         if (rt->fib6_table)
4848                 table = rt->fib6_table->tb6_id;
4849         else
4850                 table = RT6_TABLE_UNSPEC;
             /* table ids of 256 and above show up as RT_TABLE_COMPAT in the
              * header; the real id always goes into RTA_TABLE
              */
4851         rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4852         if (nla_put_u32(skb, RTA_TABLE, table))
4853                 goto nla_put_failure;
4854
4855         rtm->rtm_type = rt->fib6_type;
4856         rtm->rtm_flags = 0;
4857         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4858         rtm->rtm_protocol = rt->fib6_protocol;
4859
4860         if (rt6_flags & RTF_CACHE)
4861                 rtm->rtm_flags |= RTM_F_CLONED;
4862
             /* an explicit @dest overrides the route's own prefix */
4863         if (dest) {
4864                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4865                         goto nla_put_failure;
4866                 rtm->rtm_dst_len = 128;
4867         } else if (rtm->rtm_dst_len)
4868                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4869                         goto nla_put_failure;
4870 #ifdef CONFIG_IPV6_SUBTREES
4871         if (src) {
4872                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4873                         goto nla_put_failure;
4874                 rtm->rtm_src_len = 128;
4875         } else if (rtm->rtm_src_len &&
4876                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4877                 goto nla_put_failure;
4878 #endif
4879         if (iif) {
4880 #ifdef CONFIG_IPV6_MROUTE
4881                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4882                         int err = ip6mr_get_route(net, skb, rtm, portid);
4883
                             /* err == 0 returns right away, without reaching
                              * nlmsg_end() below; a positive value continues.
                              * NOTE(review): presumably ip6mr_get_route() has
                              * completed the message itself in the 0 case -
                              * confirm.
                              */
4884                         if (err == 0)
4885                                 return 0;
4886                         if (err < 0)
4887                                 goto nla_put_failure;
4888                 } else
4889 #endif
4890                         if (nla_put_u32(skb, RTA_IIF, iif))
4891                                 goto nla_put_failure;
4892         } else if (dest) {
4893                 struct in6_addr saddr_buf;
4894                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4895                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4896                         goto nla_put_failure;
4897         }
4898
4899         if (rt->fib6_prefsrc.plen) {
4900                 struct in6_addr saddr_buf;
4901                 saddr_buf = rt->fib6_prefsrc.addr;
4902                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4903                         goto nla_put_failure;
4904         }
4905
4906         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4907         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4908                 goto nla_put_failure;
4909
4910         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4911                 goto nla_put_failure;
4912
4913         /* Nexthop information: a dst clone reports its own gateway and
4914          * device; a multipath FIB entry emits itself plus each sibling as a
4915          * nexthop within RTA_MULTIPATH; otherwise fib_nexthop_info() is used.
4916          */
4916         if (rt6) {
4917                 if (rt6_flags & RTF_GATEWAY &&
4918                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4919                         goto nla_put_failure;
4920
4921                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4922                         goto nla_put_failure;
4923         } else if (rt->fib6_nsiblings) {
4924                 struct fib6_info *sibling, *next_sibling;
4925                 struct nlattr *mp;
4926
4927                 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4928                 if (!mp)
4929                         goto nla_put_failure;
4930
                     /* the route's own nexthop comes first, then the siblings */
4931                 if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
4932                                     rt->fib6_nh->fib_nh_weight) < 0)
4933                         goto nla_put_failure;
4934
4935                 list_for_each_entry_safe(sibling, next_sibling,
4936                                          &rt->fib6_siblings, fib6_siblings) {
4937                         if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
4938                                             sibling->fib6_nh->fib_nh_weight) < 0)
4939                                 goto nla_put_failure;
4940                 }
4941
4942                 nla_nest_end(skb, mp);
4943         } else {
4944                 unsigned char nh_flags = 0;
4945
4946                 if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
4947                                      &nh_flags, false) < 0)
4948                         goto nla_put_failure;
4949
4950                 rtm->rtm_flags |= nh_flags;
4951         }
4952
             /* expiry is reported relative to the current jiffies */
4953         if (rt6_flags & RTF_EXPIRES) {
4954                 expires = dst ? dst->expires : rt->expires;
4955                 expires -= jiffies;
4956         }
4957
4958         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4959                 goto nla_put_failure;
4960
4961         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4962                 goto nla_put_failure;
4963
4964
4965         nlmsg_end(skb, nlh);
4966         return 0;
4967
4968 nla_put_failure:
             /* drop everything added since nlmsg_put() */
4969         nlmsg_cancel(skb, nlh);
4970         return -EMSGSIZE;
4971 }
4972
4973 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4974                                const struct net_device *dev)
4975 {
4976         if (f6i->fib6_nh->fib_nh_dev == dev)
4977                 return true;
4978
4979         if (f6i->fib6_nsiblings) {
4980                 struct fib6_info *sibling, *next_sibling;
4981
4982                 list_for_each_entry_safe(sibling, next_sibling,
4983                                          &f6i->fib6_siblings, fib6_siblings) {
4984                         if (sibling->fib6_nh->fib_nh_dev == dev)
4985                                 return true;
4986                 }
4987         }
4988
4989         return false;
4990 }
4991
4992 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4993 {
4994         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4995         struct fib_dump_filter *filter = &arg->filter;
4996         unsigned int flags = NLM_F_MULTI;
4997         struct net *net = arg->net;
4998
4999         if (rt == net->ipv6.fib6_null_entry)
5000                 return 0;
5001
5002         if ((filter->flags & RTM_F_PREFIX) &&
5003             !(rt->fib6_flags & RTF_PREFIX_RT)) {
5004                 /* success since this is not a prefix route */
5005                 return 1;
5006         }
5007         if (filter->filter_set) {
5008                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
5009                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
5010                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
5011                         return 1;
5012                 }
5013                 flags |= NLM_F_DUMP_FILTERED;
5014         }
5015
5016         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
5017                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
5018                              arg->cb->nlh->nlmsg_seq, flags);
5019 }
5020
5021 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
5022                                         const struct nlmsghdr *nlh,
5023                                         struct nlattr **tb,
5024                                         struct netlink_ext_ack *extack)
5025 {
5026         struct rtmsg *rtm;
5027         int i, err;
5028
5029         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
5030                 NL_SET_ERR_MSG_MOD(extack,
5031                                    "Invalid header for get route request");
5032                 return -EINVAL;
5033         }
5034
5035         if (!netlink_strict_get_check(skb))
5036                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5037                                               rtm_ipv6_policy, extack);
5038
5039         rtm = nlmsg_data(nlh);
5040         if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
5041             (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
5042             rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
5043             rtm->rtm_type) {
5044                 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
5045                 return -EINVAL;
5046         }
5047         if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
5048                 NL_SET_ERR_MSG_MOD(extack,
5049                                    "Invalid flags for get route request");
5050                 return -EINVAL;
5051         }
5052
5053         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
5054                                             rtm_ipv6_policy, extack);
5055         if (err)
5056                 return err;
5057
5058         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
5059             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
5060                 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
5061                 return -EINVAL;
5062         }
5063
5064         for (i = 0; i <= RTA_MAX; i++) {
5065                 if (!tb[i])
5066                         continue;
5067
5068                 switch (i) {
5069                 case RTA_SRC:
5070                 case RTA_DST:
5071                 case RTA_IIF:
5072                 case RTA_OIF:
5073                 case RTA_MARK:
5074                 case RTA_UID:
5075                 case RTA_SPORT:
5076                 case RTA_DPORT:
5077                 case RTA_IP_PROTO:
5078                         break;
5079                 default:
5080                         NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
5081                         return -EINVAL;
5082                 }
5083         }
5084
5085         return 0;
5086 }
5087
5088 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
5089                               struct netlink_ext_ack *extack)
5090 {
5091         struct net *net = sock_net(in_skb->sk);
5092         struct nlattr *tb[RTA_MAX+1];
5093         int err, iif = 0, oif = 0;
5094         struct fib6_info *from;
5095         struct dst_entry *dst;
5096         struct rt6_info *rt;
5097         struct sk_buff *skb;
5098         struct rtmsg *rtm;
5099         struct flowi6 fl6 = {};
5100         bool fibmatch;
5101
5102         err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
5103         if (err < 0)
5104                 goto errout;
5105
5106         err = -EINVAL;
5107         rtm = nlmsg_data(nlh);
5108         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
5109         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
5110
5111         if (tb[RTA_SRC]) {
5112                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
5113                         goto errout;
5114
5115                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
5116         }
5117
5118         if (tb[RTA_DST]) {
5119                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
5120                         goto errout;
5121
5122                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
5123         }
5124
5125         if (tb[RTA_IIF])
5126                 iif = nla_get_u32(tb[RTA_IIF]);
5127
5128         if (tb[RTA_OIF])
5129                 oif = nla_get_u32(tb[RTA_OIF]);
5130
5131         if (tb[RTA_MARK])
5132                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
5133
5134         if (tb[RTA_UID])
5135                 fl6.flowi6_uid = make_kuid(current_user_ns(),
5136                                            nla_get_u32(tb[RTA_UID]));
5137         else
5138                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5139
5140         if (tb[RTA_SPORT])
5141                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5142
5143         if (tb[RTA_DPORT])
5144                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5145
5146         if (tb[RTA_IP_PROTO]) {
5147                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5148                                                   &fl6.flowi6_proto, AF_INET6,
5149                                                   extack);
5150                 if (err)
5151                         goto errout;
5152         }
5153
5154         if (iif) {
5155                 struct net_device *dev;
5156                 int flags = 0;
5157
5158                 rcu_read_lock();
5159
5160                 dev = dev_get_by_index_rcu(net, iif);
5161                 if (!dev) {
5162                         rcu_read_unlock();
5163                         err = -ENODEV;
5164                         goto errout;
5165                 }
5166
5167                 fl6.flowi6_iif = iif;
5168
5169                 if (!ipv6_addr_any(&fl6.saddr))
5170                         flags |= RT6_LOOKUP_F_HAS_SADDR;
5171
5172                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5173
5174                 rcu_read_unlock();
5175         } else {
5176                 fl6.flowi6_oif = oif;
5177
5178                 dst = ip6_route_output(net, NULL, &fl6);
5179         }
5180
5181
5182         rt = container_of(dst, struct rt6_info, dst);
5183         if (rt->dst.error) {
5184                 err = rt->dst.error;
5185                 ip6_rt_put(rt);
5186                 goto errout;
5187         }
5188
5189         if (rt == net->ipv6.ip6_null_entry) {
5190                 err = rt->dst.error;
5191                 ip6_rt_put(rt);
5192                 goto errout;
5193         }
5194
5195         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5196         if (!skb) {
5197                 ip6_rt_put(rt);
5198                 err = -ENOBUFS;
5199                 goto errout;
5200         }
5201
5202         skb_dst_set(skb, &rt->dst);
5203
5204         rcu_read_lock();
5205         from = rcu_dereference(rt->from);
5206         if (from) {
5207                 if (fibmatch)
5208                         err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5209                                             iif, RTM_NEWROUTE,
5210                                             NETLINK_CB(in_skb).portid,
5211                                             nlh->nlmsg_seq, 0);
5212                 else
5213                         err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5214                                             &fl6.saddr, iif, RTM_NEWROUTE,
5215                                             NETLINK_CB(in_skb).portid,
5216                                             nlh->nlmsg_seq, 0);
5217         } else {
5218                 err = -ENETUNREACH;
5219         }
5220         rcu_read_unlock();
5221
5222         if (err < 0) {
5223                 kfree_skb(skb);
5224                 goto errout;
5225         }
5226
5227         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5228 errout:
5229         return err;
5230 }
5231
5232 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5233                      unsigned int nlm_flags)
5234 {
5235         struct sk_buff *skb;
5236         struct net *net = info->nl_net;
5237         u32 seq;
5238         int err;
5239
5240         err = -ENOBUFS;
5241         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5242
5243         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5244         if (!skb)
5245                 goto errout;
5246
5247         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5248                             event, info->portid, seq, nlm_flags);
5249         if (err < 0) {
5250                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5251                 WARN_ON(err == -EMSGSIZE);
5252                 kfree_skb(skb);
5253                 goto errout;
5254         }
5255         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5256                     info->nlh, gfp_any());
5257         return;
5258 errout:
5259         if (err < 0)
5260                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5261 }
5262
5263 void fib6_rt_update(struct net *net, struct fib6_info *rt,
5264                     struct nl_info *info)
5265 {
5266         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5267         struct sk_buff *skb;
5268         int err = -ENOBUFS;
5269
5270         /* call_fib6_entry_notifiers will be removed when in-kernel notifier
5271          * is implemented and supported for nexthop objects
5272          */
5273         call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
5274
5275         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5276         if (!skb)
5277                 goto errout;
5278
5279         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5280                             RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
5281         if (err < 0) {
5282                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5283                 WARN_ON(err == -EMSGSIZE);
5284                 kfree_skb(skb);
5285                 goto errout;
5286         }
5287         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5288                     info->nlh, gfp_any());
5289         return;
5290 errout:
5291         if (err < 0)
5292                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5293 }
5294
5295 static int ip6_route_dev_notify(struct notifier_block *this,
5296                                 unsigned long event, void *ptr)
5297 {
5298         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5299         struct net *net = dev_net(dev);
5300
5301         if (!(dev->flags & IFF_LOOPBACK))
5302                 return NOTIFY_OK;
5303
5304         if (event == NETDEV_REGISTER) {
5305                 net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
5306                 net->ipv6.ip6_null_entry->dst.dev = dev;
5307                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5308 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5309                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5310                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5311                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5312                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5313 #endif
5314          } else if (event == NETDEV_UNREGISTER &&
5315                     dev->reg_state != NETREG_UNREGISTERED) {
5316                 /* NETDEV_UNREGISTER could be fired for multiple times by
5317                  * netdev_wait_allrefs(). Make sure we only call this once.
5318                  */
5319                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5320 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5321                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5322                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5323 #endif
5324         }
5325
5326         return NOTIFY_OK;
5327 }
5328
5329 /*
5330  *      /proc
5331  */
5332
5333 #ifdef CONFIG_PROC_FS
5334 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5335 {
5336         struct net *net = (struct net *)seq->private;
5337         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5338                    net->ipv6.rt6_stats->fib_nodes,
5339                    net->ipv6.rt6_stats->fib_route_nodes,
5340                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5341                    net->ipv6.rt6_stats->fib_rt_entries,
5342                    net->ipv6.rt6_stats->fib_rt_cache,
5343                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5344                    net->ipv6.rt6_stats->fib_discarded_routes);
5345
5346         return 0;
5347 }
5348 #endif  /* CONFIG_PROC_FS */
5349
5350 #ifdef CONFIG_SYSCTL
5351
5352 static
5353 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5354                               void __user *buffer, size_t *lenp, loff_t *ppos)
5355 {
5356         struct net *net;
5357         int delay;
5358         int ret;
5359         if (!write)
5360                 return -EINVAL;
5361
5362         net = (struct net *)ctl->extra1;
5363         delay = net->ipv6.sysctl.flush_delay;
5364         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5365         if (ret)
5366                 return ret;
5367
5368         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5369         return 0;
5370 }
5371
5372 static int zero;
5373 static int one = 1;
5374
5375 static struct ctl_table ipv6_route_table_template[] = {
5376         {
5377                 .procname       =       "flush",
5378                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5379                 .maxlen         =       sizeof(int),
5380                 .mode           =       0200,
5381                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5382         },
5383         {
5384                 .procname       =       "gc_thresh",
5385                 .data           =       &ip6_dst_ops_template.gc_thresh,
5386                 .maxlen         =       sizeof(int),
5387                 .mode           =       0644,
5388                 .proc_handler   =       proc_dointvec,
5389         },
5390         {
5391                 .procname       =       "max_size",
5392                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5393                 .maxlen         =       sizeof(int),
5394                 .mode           =       0644,
5395                 .proc_handler   =       proc_dointvec,
5396         },
5397         {
5398                 .procname       =       "gc_min_interval",
5399                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5400                 .maxlen         =       sizeof(int),
5401                 .mode           =       0644,
5402                 .proc_handler   =       proc_dointvec_jiffies,
5403         },
5404         {
5405                 .procname       =       "gc_timeout",
5406                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5407                 .maxlen         =       sizeof(int),
5408                 .mode           =       0644,
5409                 .proc_handler   =       proc_dointvec_jiffies,
5410         },
5411         {
5412                 .procname       =       "gc_interval",
5413                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5414                 .maxlen         =       sizeof(int),
5415                 .mode           =       0644,
5416                 .proc_handler   =       proc_dointvec_jiffies,
5417         },
5418         {
5419                 .procname       =       "gc_elasticity",
5420                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5421                 .maxlen         =       sizeof(int),
5422                 .mode           =       0644,
5423                 .proc_handler   =       proc_dointvec,
5424         },
5425         {
5426                 .procname       =       "mtu_expires",
5427                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5428                 .maxlen         =       sizeof(int),
5429                 .mode           =       0644,
5430                 .proc_handler   =       proc_dointvec_jiffies,
5431         },
5432         {
5433                 .procname       =       "min_adv_mss",
5434                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5435                 .maxlen         =       sizeof(int),
5436                 .mode           =       0644,
5437                 .proc_handler   =       proc_dointvec,
5438         },
5439         {
5440                 .procname       =       "gc_min_interval_ms",
5441                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5442                 .maxlen         =       sizeof(int),
5443                 .mode           =       0644,
5444                 .proc_handler   =       proc_dointvec_ms_jiffies,
5445         },
5446         {
5447                 .procname       =       "skip_notify_on_dev_down",
5448                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5449                 .maxlen         =       sizeof(int),
5450                 .mode           =       0644,
5451                 .proc_handler   =       proc_dointvec,
5452                 .extra1         =       &zero,
5453                 .extra2         =       &one,
5454         },
5455         { }
5456 };
5457
5458 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5459 {
5460         struct ctl_table *table;
5461
5462         table = kmemdup(ipv6_route_table_template,
5463                         sizeof(ipv6_route_table_template),
5464                         GFP_KERNEL);
5465
5466         if (table) {
5467                 table[0].data = &net->ipv6.sysctl.flush_delay;
5468                 table[0].extra1 = net;
5469                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5470                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5471                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5472                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5473                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5474                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5475                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5476                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5477                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5478                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5479
5480                 /* Don't export sysctls to unprivileged users */
5481                 if (net->user_ns != &init_user_ns)
5482                         table[0].procname = NULL;
5483         }
5484
5485         return table;
5486 }
5487 #endif
5488
5489 static int __net_init ip6_route_net_init(struct net *net)
5490 {
5491         int ret = -ENOMEM;
5492
5493         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5494                sizeof(net->ipv6.ip6_dst_ops));
5495
5496         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5497                 goto out_ip6_dst_ops;
5498
5499         net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
5500         if (!net->ipv6.fib6_null_entry)
5501                 goto out_ip6_dst_entries;
5502         memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
5503                sizeof(*net->ipv6.fib6_null_entry));
5504
5505         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5506                                            sizeof(*net->ipv6.ip6_null_entry),
5507                                            GFP_KERNEL);
5508         if (!net->ipv6.ip6_null_entry)
5509                 goto out_fib6_null_entry;
5510         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5511         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5512                          ip6_template_metrics, true);
5513
5514 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5515         net->ipv6.fib6_has_custom_rules = false;
5516         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5517                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5518                                                GFP_KERNEL);
5519         if (!net->ipv6.ip6_prohibit_entry)
5520                 goto out_ip6_null_entry;
5521         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5522         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5523                          ip6_template_metrics, true);
5524
5525         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5526                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5527                                                GFP_KERNEL);
5528         if (!net->ipv6.ip6_blk_hole_entry)
5529                 goto out_ip6_prohibit_entry;
5530         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5531         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5532                          ip6_template_metrics, true);
5533 #endif
5534
5535         net->ipv6.sysctl.flush_delay = 0;
5536         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5537         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5538         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5539         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5540         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5541         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5542         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5543         net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5544
5545         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5546
5547         ret = 0;
5548 out:
5549         return ret;
5550
5551 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5552 out_ip6_prohibit_entry:
5553         kfree(net->ipv6.ip6_prohibit_entry);
5554 out_ip6_null_entry:
5555         kfree(net->ipv6.ip6_null_entry);
5556 #endif
5557 out_fib6_null_entry:
5558         kfree(net->ipv6.fib6_null_entry);
5559 out_ip6_dst_entries:
5560         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5561 out_ip6_dst_ops:
5562         goto out;
5563 }
5564
5565 static void __net_exit ip6_route_net_exit(struct net *net)
5566 {
5567         kfree(net->ipv6.fib6_null_entry);
5568         kfree(net->ipv6.ip6_null_entry);
5569 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5570         kfree(net->ipv6.ip6_prohibit_entry);
5571         kfree(net->ipv6.ip6_blk_hole_entry);
5572 #endif
5573         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5574 }
5575
5576 static int __net_init ip6_route_net_init_late(struct net *net)
5577 {
5578 #ifdef CONFIG_PROC_FS
5579         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5580                         sizeof(struct ipv6_route_iter));
5581         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5582                         rt6_stats_seq_show, NULL);
5583 #endif
5584         return 0;
5585 }
5586
5587 static void __net_exit ip6_route_net_exit_late(struct net *net)
5588 {
5589 #ifdef CONFIG_PROC_FS
5590         remove_proc_entry("ipv6_route", net->proc_net);
5591         remove_proc_entry("rt6_stats", net->proc_net);
5592 #endif
5593 }
5594
5595 static struct pernet_operations ip6_route_net_ops = {
5596         .init = ip6_route_net_init,
5597         .exit = ip6_route_net_exit,
5598 };
5599
5600 static int __net_init ipv6_inetpeer_init(struct net *net)
5601 {
5602         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5603
5604         if (!bp)
5605                 return -ENOMEM;
5606         inet_peer_base_init(bp);
5607         net->ipv6.peers = bp;
5608         return 0;
5609 }
5610
5611 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5612 {
5613         struct inet_peer_base *bp = net->ipv6.peers;
5614
5615         net->ipv6.peers = NULL;
5616         inetpeer_invalidate_tree(bp);
5617         kfree(bp);
5618 }
5619
5620 static struct pernet_operations ipv6_inetpeer_ops = {
5621         .init   =       ipv6_inetpeer_init,
5622         .exit   =       ipv6_inetpeer_exit,
5623 };
5624
5625 static struct pernet_operations ip6_route_net_late_ops = {
5626         .init = ip6_route_net_init_late,
5627         .exit = ip6_route_net_exit_late,
5628 };
5629
5630 static struct notifier_block ip6_route_dev_notifier = {
5631         .notifier_call = ip6_route_dev_notify,
5632         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5633 };
5634
5635 void __init ip6_route_init_special_entries(void)
5636 {
5637         /* Registering of the loopback is done before this portion of code,
5638          * the loopback reference in rt6_info will not be taken, do it
5639          * manually for init_net */
5640         init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
5641         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5642         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5643   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5644         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5645         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5646         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5647         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5648   #endif
5649 }
5650
5651 int __init ip6_route_init(void)
5652 {
5653         int ret;
5654         int cpu;
5655
5656         ret = -ENOMEM;
5657         ip6_dst_ops_template.kmem_cachep =
5658                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5659                                   SLAB_HWCACHE_ALIGN, NULL);
5660         if (!ip6_dst_ops_template.kmem_cachep)
5661                 goto out;
5662
5663         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5664         if (ret)
5665                 goto out_kmem_cache;
5666
5667         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5668         if (ret)
5669                 goto out_dst_entries;
5670
5671         ret = register_pernet_subsys(&ip6_route_net_ops);
5672         if (ret)
5673                 goto out_register_inetpeer;
5674
5675         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5676
5677         ret = fib6_init();
5678         if (ret)
5679                 goto out_register_subsys;
5680
5681         ret = xfrm6_init();
5682         if (ret)
5683                 goto out_fib6_init;
5684
5685         ret = fib6_rules_init();
5686         if (ret)
5687                 goto xfrm6_init;
5688
5689         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5690         if (ret)
5691                 goto fib6_rules_init;
5692
5693         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5694                                    inet6_rtm_newroute, NULL, 0);
5695         if (ret < 0)
5696                 goto out_register_late_subsys;
5697
5698         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5699                                    inet6_rtm_delroute, NULL, 0);
5700         if (ret < 0)
5701                 goto out_register_late_subsys;
5702
5703         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5704                                    inet6_rtm_getroute, NULL,
5705                                    RTNL_FLAG_DOIT_UNLOCKED);
5706         if (ret < 0)
5707                 goto out_register_late_subsys;
5708
5709         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5710         if (ret)
5711                 goto out_register_late_subsys;
5712
5713         for_each_possible_cpu(cpu) {
5714                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5715
5716                 INIT_LIST_HEAD(&ul->head);
5717                 spin_lock_init(&ul->lock);
5718         }
5719
5720 out:
5721         return ret;
5722
5723 out_register_late_subsys:
5724         rtnl_unregister_all(PF_INET6);
5725         unregister_pernet_subsys(&ip6_route_net_late_ops);
5726 fib6_rules_init:
5727         fib6_rules_cleanup();
5728 xfrm6_init:
5729         xfrm6_fini();
5730 out_fib6_init:
5731         fib6_gc_cleanup();
5732 out_register_subsys:
5733         unregister_pernet_subsys(&ip6_route_net_ops);
5734 out_register_inetpeer:
5735         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5736 out_dst_entries:
5737         dst_entries_destroy(&ip6_dst_blackhole_ops);
5738 out_kmem_cache:
5739         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5740         goto out;
5741 }
5742
5743 void ip6_route_cleanup(void)
5744 {
5745         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5746         unregister_pernet_subsys(&ip6_route_net_late_ops);
5747         fib6_rules_cleanup();
5748         xfrm6_fini();
5749         fib6_gc_cleanup();
5750         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5751         unregister_pernet_subsys(&ip6_route_net_ops);
5752         dst_entries_destroy(&ip6_dst_blackhole_ops);
5753         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5754 }