ipv6: Move pcpu cached routes to fib6_nh
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/rtnh.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Result of scoring a next hop's neighbour state. Every failure code is
 * negative so callers can test "< 0"; success is positive.
 */
enum rt6_nud_state {
	RT6_NUD_SUCCEED		=  1,
	RT6_NUD_FAIL_DO_RR	= -1,	/* usable, but ask for round-robin */
	RT6_NUD_FAIL_PROBE	= -2,
	RT6_NUD_FAIL_HARD	= -3,	/* next hop must not be used */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106                            int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109                          struct fib6_info *rt, struct dst_entry *dst,
110                          struct in6_addr *dest, struct in6_addr *src,
111                          int iif, int type, u32 portid, u32 seq,
112                          unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114                                            const struct in6_addr *daddr,
115                                            const struct in6_addr *saddr);
116
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev,
122                                            unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124                                            const struct in6_addr *prefix, int prefixlen,
125                                            const struct in6_addr *gwaddr,
126                                            struct net_device *dev);
127 #endif
128
129 struct uncached_list {
130         spinlock_t              lock;
131         struct list_head        head;
132 };
133
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139
140         rt->rt6i_uncached_list = ul;
141
142         spin_lock_bh(&ul->lock);
143         list_add_tail(&rt->rt6i_uncached, &ul->head);
144         spin_unlock_bh(&ul->lock);
145 }
146
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149         if (!list_empty(&rt->rt6i_uncached)) {
150                 struct uncached_list *ul = rt->rt6i_uncached_list;
151                 struct net *net = dev_net(rt->dst.dev);
152
153                 spin_lock_bh(&ul->lock);
154                 list_del(&rt->rt6i_uncached);
155                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156                 spin_unlock_bh(&ul->lock);
157         }
158 }
159
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162         struct net_device *loopback_dev = net->loopback_dev;
163         int cpu;
164
165         if (dev == loopback_dev)
166                 return;
167
168         for_each_possible_cpu(cpu) {
169                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170                 struct rt6_info *rt;
171
172                 spin_lock_bh(&ul->lock);
173                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174                         struct inet6_dev *rt_idev = rt->rt6i_idev;
175                         struct net_device *rt_dev = rt->dst.dev;
176
177                         if (rt_idev->dev == dev) {
178                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
179                                 in6_dev_put(rt_idev);
180                         }
181
182                         if (rt_dev == dev) {
183                                 rt->dst.dev = loopback_dev;
184                                 dev_hold(rt->dst.dev);
185                                 dev_put(rt_dev);
186                         }
187                 }
188                 spin_unlock_bh(&ul->lock);
189         }
190 }
191
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193                                              struct sk_buff *skb,
194                                              const void *daddr)
195 {
196         if (!ipv6_addr_any(p))
197                 return (const void *) p;
198         else if (skb)
199                 return &ipv6_hdr(skb)->daddr;
200         return daddr;
201 }
202
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204                                    struct net_device *dev,
205                                    struct sk_buff *skb,
206                                    const void *daddr)
207 {
208         struct neighbour *n;
209
210         daddr = choose_neigh_daddr(gw, skb, daddr);
211         n = __ipv6_neigh_lookup(dev, daddr);
212         if (n)
213                 return n;
214
215         n = neigh_create(&nd_tbl, daddr, dev);
216         return IS_ERR(n) ? NULL : n;
217 }
218
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220                                               struct sk_buff *skb,
221                                               const void *daddr)
222 {
223         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224
225         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230         struct net_device *dev = dst->dev;
231         struct rt6_info *rt = (struct rt6_info *)dst;
232
233         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234         if (!daddr)
235                 return;
236         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237                 return;
238         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239                 return;
240         __ipv6_confirm_neigh(dev, daddr);
241 }
242
243 static struct dst_ops ip6_dst_ops_template = {
244         .family                 =       AF_INET6,
245         .gc                     =       ip6_dst_gc,
246         .gc_thresh              =       1024,
247         .check                  =       ip6_dst_check,
248         .default_advmss         =       ip6_default_advmss,
249         .mtu                    =       ip6_mtu,
250         .cow_metrics            =       dst_cow_metrics_generic,
251         .destroy                =       ip6_dst_destroy,
252         .ifdown                 =       ip6_dst_ifdown,
253         .negative_advice        =       ip6_negative_advice,
254         .link_failure           =       ip6_link_failure,
255         .update_pmtu            =       ip6_rt_update_pmtu,
256         .redirect               =       rt6_do_redirect,
257         .local_out              =       __ip6_local_out,
258         .neigh_lookup           =       ip6_dst_neigh_lookup,
259         .confirm_neigh          =       ip6_confirm_neigh,
260 };
261
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265
266         return mtu ? : dst->dev->mtu;
267 }
268
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270                                          struct sk_buff *skb, u32 mtu)
271 {
272 }
273
/* dst_ops->redirect for blackhole routes: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* no-op */
}
278
279 static struct dst_ops ip6_dst_blackhole_ops = {
280         .family                 =       AF_INET6,
281         .destroy                =       ip6_dst_destroy,
282         .check                  =       ip6_dst_check,
283         .mtu                    =       ip6_blackhole_mtu,
284         .default_advmss         =       ip6_default_advmss,
285         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
286         .redirect               =       ip6_rt_blackhole_redirect,
287         .cow_metrics            =       dst_cow_metrics_generic,
288         .neigh_lookup           =       ip6_dst_neigh_lookup,
289 };
290
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292         [RTAX_HOPLIMIT - 1] = 0,
293 };
294
295 static const struct fib6_info fib6_null_entry_template = {
296         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
297         .fib6_protocol  = RTPROT_KERNEL,
298         .fib6_metric    = ~(u32)0,
299         .fib6_ref       = REFCOUNT_INIT(1),
300         .fib6_type      = RTN_UNREACHABLE,
301         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
302 };
303
304 static const struct rt6_info ip6_null_entry_template = {
305         .dst = {
306                 .__refcnt       = ATOMIC_INIT(1),
307                 .__use          = 1,
308                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
309                 .error          = -ENETUNREACH,
310                 .input          = ip6_pkt_discard,
311                 .output         = ip6_pkt_discard_out,
312         },
313         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
314 };
315
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template dst for prohibit routes: prohibit handlers, error -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

/* Template dst for blackhole routes: generic dst discard handlers,
 * error -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard_out,
		.error		= -EINVAL,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

#endif
343
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346         struct dst_entry *dst = &rt->dst;
347
348         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349         INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354                                int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         1, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt) {
360                 rt6_info_init(rt);
361                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362         }
363
364         return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370         struct rt6_info *rt = (struct rt6_info *)dst;
371         struct fib6_info *from;
372         struct inet6_dev *idev;
373
374         ip_dst_metrics_put(dst);
375         rt6_uncached_list_del(rt);
376
377         idev = rt->rt6i_idev;
378         if (idev) {
379                 rt->rt6i_idev = NULL;
380                 in6_dev_put(idev);
381         }
382
383         from = xchg((__force struct fib6_info **)&rt->from, NULL);
384         fib6_info_release(from);
385 }
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
428 void fib6_select_path(const struct net *net, struct fib6_result *res,
429                       struct flowi6 *fl6, int oif, bool have_oif_match,
430                       const struct sk_buff *skb, int strict)
431 {
432         struct fib6_info *sibling, *next_sibling;
433         struct fib6_info *match = res->f6i;
434
435         if (!match->fib6_nsiblings || have_oif_match)
436                 goto out;
437
438         /* We might have already computed the hash for ICMPv6 errors. In such
439          * case it will always be non-zero. Otherwise now is the time to do it.
440          */
441         if (!fl6->mp_hash)
442                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
443
444         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
445                 goto out;
446
447         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
448                                  fib6_siblings) {
449                 const struct fib6_nh *nh = &sibling->fib6_nh;
450                 int nh_upper_bound;
451
452                 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
453                 if (fl6->mp_hash > nh_upper_bound)
454                         continue;
455                 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
456                         break;
457                 match = sibling;
458                 break;
459         }
460
461 out:
462         res->f6i = match;
463         res->nh = &match->fib6_nh;
464 }
465
466 /*
467  *      Route lookup. rcu_read_lock() should be held.
468  */
469
470 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
471                                const struct in6_addr *saddr, int oif, int flags)
472 {
473         const struct net_device *dev;
474
475         if (nh->fib_nh_flags & RTNH_F_DEAD)
476                 return false;
477
478         dev = nh->fib_nh_dev;
479         if (oif) {
480                 if (dev->ifindex == oif)
481                         return true;
482         } else {
483                 if (ipv6_chk_addr(net, saddr, dev,
484                                   flags & RT6_LOOKUP_F_IFACE))
485                         return true;
486         }
487
488         return false;
489 }
490
491 static void rt6_device_match(struct net *net, struct fib6_result *res,
492                              const struct in6_addr *saddr, int oif, int flags)
493 {
494         struct fib6_info *f6i = res->f6i;
495         struct fib6_info *spf6i;
496         struct fib6_nh *nh;
497
498         if (!oif && ipv6_addr_any(saddr)) {
499                 nh = &f6i->fib6_nh;
500                 if (!(nh->fib_nh_flags & RTNH_F_DEAD))
501                         goto out;
502         }
503
504         for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
505                 nh = &spf6i->fib6_nh;
506                 if (__rt6_device_match(net, nh, saddr, oif, flags)) {
507                         res->f6i = spf6i;
508                         goto out;
509                 }
510         }
511
512         if (oif && flags & RT6_LOOKUP_F_IFACE) {
513                 res->f6i = net->ipv6.fib6_null_entry;
514                 nh = &res->f6i->fib6_nh;
515                 goto out;
516         }
517
518         nh = &f6i->fib6_nh;
519         if (nh->fib_nh_flags & RTNH_F_DEAD) {
520                 res->f6i = net->ipv6.fib6_null_entry;
521                 nh = &res->f6i->fib6_nh;
522         }
523 out:
524         res->nh = nh;
525         res->fib6_type = res->f6i->fib6_type;
526         res->fib6_flags = res->f6i->fib6_flags;
527 }
528
529 #ifdef CONFIG_IPV6_ROUTER_PREF
530 struct __rt6_probe_work {
531         struct work_struct work;
532         struct in6_addr target;
533         struct net_device *dev;
534 };
535
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538         struct in6_addr mcaddr;
539         struct __rt6_probe_work *work =
540                 container_of(w, struct __rt6_probe_work, work);
541
542         addrconf_addr_solict_mult(&work->target, &mcaddr);
543         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
544         dev_put(work->dev);
545         kfree(work);
546 }
547
548 static void rt6_probe(struct fib6_nh *fib6_nh)
549 {
550         struct __rt6_probe_work *work = NULL;
551         const struct in6_addr *nh_gw;
552         struct neighbour *neigh;
553         struct net_device *dev;
554         struct inet6_dev *idev;
555
556         /*
557          * Okay, this does not seem to be appropriate
558          * for now, however, we need to check if it
559          * is really so; aka Router Reachability Probing.
560          *
561          * Router Reachability Probe MUST be rate-limited
562          * to no more than one per minute.
563          */
564         if (fib6_nh->fib_nh_gw_family)
565                 return;
566
567         nh_gw = &fib6_nh->fib_nh_gw6;
568         dev = fib6_nh->fib_nh_dev;
569         rcu_read_lock_bh();
570         idev = __in6_dev_get(dev);
571         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
572         if (neigh) {
573                 if (neigh->nud_state & NUD_VALID)
574                         goto out;
575
576                 write_lock(&neigh->lock);
577                 if (!(neigh->nud_state & NUD_VALID) &&
578                     time_after(jiffies,
579                                neigh->updated + idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else if (time_after(jiffies, fib6_nh->last_probe +
586                                        idev->cnf.rtr_probe_interval)) {
587                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588         }
589
590         if (work) {
591                 fib6_nh->last_probe = jiffies;
592                 INIT_WORK(&work->work, rt6_probe_deferred);
593                 work->target = *nh_gw;
594                 dev_hold(dev);
595                 work->dev = dev;
596                 schedule_work(&work->work);
597         }
598
599 out:
600         rcu_read_unlock_bh();
601 }
602 #else
/* CONFIG_IPV6_ROUTER_PREF is disabled: probing is compiled out. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
	/* no-op */
}
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
611 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
612 {
613         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
614         struct neighbour *neigh;
615
616         rcu_read_lock_bh();
617         neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
618                                           &fib6_nh->fib_nh_gw6);
619         if (neigh) {
620                 read_lock(&neigh->lock);
621                 if (neigh->nud_state & NUD_VALID)
622                         ret = RT6_NUD_SUCCEED;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624                 else if (!(neigh->nud_state & NUD_FAILED))
625                         ret = RT6_NUD_SUCCEED;
626                 else
627                         ret = RT6_NUD_FAIL_PROBE;
628 #endif
629                 read_unlock(&neigh->lock);
630         } else {
631                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
632                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
633         }
634         rcu_read_unlock_bh();
635
636         return ret;
637 }
638
639 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
640                            int strict)
641 {
642         int m = 0;
643
644         if (!oif || nh->fib_nh_dev->ifindex == oif)
645                 m = 2;
646
647         if (!m && (strict & RT6_LOOKUP_F_IFACE))
648                 return RT6_NUD_FAIL_HARD;
649 #ifdef CONFIG_IPV6_ROUTER_PREF
650         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
651 #endif
652         if ((strict & RT6_LOOKUP_F_REACHABLE) &&
653             !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
654                 int n = rt6_check_neigh(nh);
655                 if (n < 0)
656                         return n;
657         }
658         return m;
659 }
660
661 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
662                        int oif, int strict, int *mpri, bool *do_rr)
663 {
664         bool match_do_rr = false;
665         bool rc = false;
666         int m;
667
668         if (nh->fib_nh_flags & RTNH_F_DEAD)
669                 goto out;
670
671         if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
672             nh->fib_nh_flags & RTNH_F_LINKDOWN &&
673             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
674                 goto out;
675
676         m = rt6_score_route(nh, fib6_flags, oif, strict);
677         if (m == RT6_NUD_FAIL_DO_RR) {
678                 match_do_rr = true;
679                 m = 0; /* lowest valid score */
680         } else if (m == RT6_NUD_FAIL_HARD) {
681                 goto out;
682         }
683
684         if (strict & RT6_LOOKUP_F_REACHABLE)
685                 rt6_probe(nh);
686
687         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
688         if (m > *mpri) {
689                 *do_rr = match_do_rr;
690                 *mpri = m;
691                 rc = true;
692         }
693 out:
694         return rc;
695 }
696
697 static void __find_rr_leaf(struct fib6_info *f6i_start,
698                            struct fib6_info *nomatch, u32 metric,
699                            struct fib6_result *res, struct fib6_info **cont,
700                            int oif, int strict, bool *do_rr, int *mpri)
701 {
702         struct fib6_info *f6i;
703
704         for (f6i = f6i_start;
705              f6i && f6i != nomatch;
706              f6i = rcu_dereference(f6i->fib6_next)) {
707                 struct fib6_nh *nh;
708
709                 if (cont && f6i->fib6_metric != metric) {
710                         *cont = f6i;
711                         return;
712                 }
713
714                 if (fib6_check_expired(f6i))
715                         continue;
716
717                 nh = &f6i->fib6_nh;
718                 if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
719                         res->f6i = f6i;
720                         res->nh = nh;
721                         res->fib6_flags = f6i->fib6_flags;
722                         res->fib6_type = f6i->fib6_type;
723                 }
724         }
725 }
726
727 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
728                          struct fib6_info *rr_head, int oif, int strict,
729                          bool *do_rr, struct fib6_result *res)
730 {
731         u32 metric = rr_head->fib6_metric;
732         struct fib6_info *cont = NULL;
733         int mpri = -1;
734
735         __find_rr_leaf(rr_head, NULL, metric, res, &cont,
736                        oif, strict, do_rr, &mpri);
737
738         __find_rr_leaf(leaf, rr_head, metric, res, &cont,
739                        oif, strict, do_rr, &mpri);
740
741         if (res->f6i || !cont)
742                 return;
743
744         __find_rr_leaf(cont, NULL, metric, res, NULL,
745                        oif, strict, do_rr, &mpri);
746 }
747
748 static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
749                        struct fib6_result *res, int strict)
750 {
751         struct fib6_info *leaf = rcu_dereference(fn->leaf);
752         struct fib6_info *rt0;
753         bool do_rr = false;
754         int key_plen;
755
756         /* make sure this function or its helpers sets f6i */
757         res->f6i = NULL;
758
759         if (!leaf || leaf == net->ipv6.fib6_null_entry)
760                 goto out;
761
762         rt0 = rcu_dereference(fn->rr_ptr);
763         if (!rt0)
764                 rt0 = leaf;
765
766         /* Double check to make sure fn is not an intermediate node
767          * and fn->leaf does not points to its child's leaf
768          * (This might happen if all routes under fn are deleted from
769          * the tree and fib6_repair_tree() is called on the node.)
770          */
771         key_plen = rt0->fib6_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773         if (rt0->fib6_src.plen)
774                 key_plen = rt0->fib6_src.plen;
775 #endif
776         if (fn->fn_bit != key_plen)
777                 goto out;
778
779         find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
780         if (do_rr) {
781                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
782
783                 /* no entries matched; do round-robin */
784                 if (!next || next->fib6_metric != rt0->fib6_metric)
785                         next = leaf;
786
787                 if (next != rt0) {
788                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
789                         /* make sure next is not being deleted from the tree */
790                         if (next->fib6_node)
791                                 rcu_assign_pointer(fn->rr_ptr, next);
792                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
793                 }
794         }
795
796 out:
797         if (!res->f6i) {
798                 res->f6i = net->ipv6.fib6_null_entry;
799                 res->nh = &res->f6i->fib6_nh;
800                 res->fib6_flags = res->f6i->fib6_flags;
801                 res->fib6_type = res->f6i->fib6_type;
802         }
803 }
804
805 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
806 {
807         return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
808                res->nh->fib_nh_gw_family;
809 }
810
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process one route information option (@opt, @len bytes) for device @dev
 * and gateway @gwaddr.
 *
 * Returns -EINVAL when the option is shorter than struct route_info, when
 * its length and prefix_len fields do not fit together, or when the route
 * preference is ICMPV6_ROUTER_PREF_INVALID.  Otherwise returns 0 after:
 *  - deleting the matching route if the advertised lifetime is zero,
 *  - adding a route if none matched and the lifetime is non-zero,
 *  - or refreshing the preference bits of the existing route,
 * and then (re)programming the expiry of whichever route is left.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *)opt;
	struct net *net = dev_net(dev);
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	unsigned long lifetime;
	struct fib6_info *rt;
	unsigned int pref;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* a full 128-bit prefix is present, use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	/* A zero-length prefix designates the default router entry */
	if (rinfo->prefix_len)
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);
	else
		rt = rt6_get_dflt_router(net, gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) |
				 RTF_PREF(pref);
	else if (lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev, pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime))
		fib6_set_expires(rt, jiffies + HZ * lifetime);
	else
		fib6_clean_expires(rt);

	fib6_info_release(rt);
	return 0;
}
#endif
886
887 /*
888  *      Misc support functions
889  */
890
891 /* called with rcu_lock held */
892 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
893 {
894         struct net_device *dev = res->nh->fib_nh_dev;
895
896         if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
897                 /* for copies of local routes, dst->dev needs to be the
898                  * device if it is a master device, the master device if
899                  * device is enslaved, and the loopback as the default
900                  */
901                 if (netif_is_l3_slave(dev) &&
902                     !rt6_need_strict(&res->f6i->fib6_dst.addr))
903                         dev = l3mdev_master_dev_rcu(dev);
904                 else if (!netif_is_l3_master(dev))
905                         dev = dev_net(dev)->loopback_dev;
906                 /* last case is netif_is_l3_master(dev) is true in which
907                  * case we want dev returned to be dev
908                  */
909         }
910
911         return dev;
912 }
913
914 static const int fib6_prop[RTN_MAX + 1] = {
915         [RTN_UNSPEC]    = 0,
916         [RTN_UNICAST]   = 0,
917         [RTN_LOCAL]     = 0,
918         [RTN_BROADCAST] = 0,
919         [RTN_ANYCAST]   = 0,
920         [RTN_MULTICAST] = 0,
921         [RTN_BLACKHOLE] = -EINVAL,
922         [RTN_UNREACHABLE] = -EHOSTUNREACH,
923         [RTN_PROHIBIT]  = -EACCES,
924         [RTN_THROW]     = -EAGAIN,
925         [RTN_NAT]       = -EINVAL,
926         [RTN_XRESOLVE]  = -EINVAL,
927 };
928
929 static int ip6_rt_type_to_error(u8 fib6_type)
930 {
931         return fib6_prop[fib6_type];
932 }
933
934 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
935 {
936         unsigned short flags = 0;
937
938         if (rt->dst_nocount)
939                 flags |= DST_NOCOUNT;
940         if (rt->dst_nopolicy)
941                 flags |= DST_NOPOLICY;
942         if (rt->dst_host)
943                 flags |= DST_HOST;
944
945         return flags;
946 }
947
948 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
949 {
950         rt->dst.error = ip6_rt_type_to_error(fib6_type);
951
952         switch (fib6_type) {
953         case RTN_BLACKHOLE:
954                 rt->dst.output = dst_discard_out;
955                 rt->dst.input = dst_discard;
956                 break;
957         case RTN_PROHIBIT:
958                 rt->dst.output = ip6_pkt_prohibit_out;
959                 rt->dst.input = ip6_pkt_prohibit;
960                 break;
961         case RTN_THROW:
962         case RTN_UNREACHABLE:
963         default:
964                 rt->dst.output = ip6_pkt_discard_out;
965                 rt->dst.input = ip6_pkt_discard;
966                 break;
967         }
968 }
969
970 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
971 {
972         struct fib6_info *f6i = res->f6i;
973
974         if (res->fib6_flags & RTF_REJECT) {
975                 ip6_rt_init_dst_reject(rt, res->fib6_type);
976                 return;
977         }
978
979         rt->dst.error = 0;
980         rt->dst.output = ip6_output;
981
982         if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
983                 rt->dst.input = ip6_input;
984         } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
985                 rt->dst.input = ip6_mc_input;
986         } else {
987                 rt->dst.input = ip6_forward;
988         }
989
990         if (res->nh->fib_nh_lws) {
991                 rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
992                 lwtunnel_set_redirect(&rt->dst);
993         }
994
995         rt->dst.lastuse = jiffies;
996 }
997
998 /* Caller must already hold reference to @from */
999 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1000 {
1001         rt->rt6i_flags &= ~RTF_EXPIRES;
1002         rcu_assign_pointer(rt->from, from);
1003         ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1004 }
1005
1006 /* Caller must already hold reference to f6i in result */
1007 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1008 {
1009         const struct fib6_nh *nh = res->nh;
1010         const struct net_device *dev = nh->fib_nh_dev;
1011         struct fib6_info *f6i = res->f6i;
1012
1013         ip6_rt_init_dst(rt, res);
1014
1015         rt->rt6i_dst = f6i->fib6_dst;
1016         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1017         rt->rt6i_flags = res->fib6_flags;
1018         if (nh->fib_nh_gw_family) {
1019                 rt->rt6i_gateway = nh->fib_nh_gw6;
1020                 rt->rt6i_flags |= RTF_GATEWAY;
1021         }
1022         rt6_set_from(rt, f6i);
1023 #ifdef CONFIG_IPV6_SUBTREES
1024         rt->rt6i_src = f6i->fib6_src;
1025 #endif
1026 }
1027
1028 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1029                                         struct in6_addr *saddr)
1030 {
1031         struct fib6_node *pn, *sn;
1032         while (1) {
1033                 if (fn->fn_flags & RTN_TL_ROOT)
1034                         return NULL;
1035                 pn = rcu_dereference(fn->parent);
1036                 sn = FIB6_SUBTREE(pn);
1037                 if (sn && sn != fn)
1038                         fn = fib6_node_lookup(sn, NULL, saddr);
1039                 else
1040                         fn = pn;
1041                 if (fn->fn_flags & RTN_RTINFO)
1042                         return fn;
1043         }
1044 }
1045
1046 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1047 {
1048         struct rt6_info *rt = *prt;
1049
1050         if (dst_hold_safe(&rt->dst))
1051                 return true;
1052         if (net) {
1053                 rt = net->ipv6.ip6_null_entry;
1054                 dst_hold(&rt->dst);
1055         } else {
1056                 rt = NULL;
1057         }
1058         *prt = rt;
1059         return false;
1060 }
1061
1062 /* called with rcu_lock held */
1063 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1064 {
1065         struct net_device *dev = res->nh->fib_nh_dev;
1066         struct fib6_info *f6i = res->f6i;
1067         unsigned short flags;
1068         struct rt6_info *nrt;
1069
1070         if (!fib6_info_hold_safe(f6i))
1071                 goto fallback;
1072
1073         flags = fib6_info_dst_flags(f6i);
1074         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1075         if (!nrt) {
1076                 fib6_info_release(f6i);
1077                 goto fallback;
1078         }
1079
1080         ip6_rt_copy_init(nrt, res);
1081         return nrt;
1082
1083 fallback:
1084         nrt = dev_net(dev)->ipv6.ip6_null_entry;
1085         dst_hold(&nrt->dst);
1086         return nrt;
1087 }
1088
1089 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1090                                              struct fib6_table *table,
1091                                              struct flowi6 *fl6,
1092                                              const struct sk_buff *skb,
1093                                              int flags)
1094 {
1095         struct fib6_result res = {};
1096         struct fib6_node *fn;
1097         struct rt6_info *rt;
1098
1099         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1100                 flags &= ~RT6_LOOKUP_F_IFACE;
1101
1102         rcu_read_lock();
1103         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1104 restart:
1105         res.f6i = rcu_dereference(fn->leaf);
1106         if (!res.f6i)
1107                 res.f6i = net->ipv6.fib6_null_entry;
1108         else
1109                 rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1110                                  flags);
1111
1112         if (res.f6i == net->ipv6.fib6_null_entry) {
1113                 fn = fib6_backtrack(fn, &fl6->saddr);
1114                 if (fn)
1115                         goto restart;
1116
1117                 rt = net->ipv6.ip6_null_entry;
1118                 dst_hold(&rt->dst);
1119                 goto out;
1120         }
1121
1122         fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1123                          fl6->flowi6_oif != 0, skb, flags);
1124
1125         /* Search through exception table */
1126         rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1127         if (rt) {
1128                 if (ip6_hold_safe(net, &rt))
1129                         dst_use_noref(&rt->dst, jiffies);
1130         } else {
1131                 rt = ip6_create_rt_rcu(&res);
1132         }
1133
1134 out:
1135         trace_fib6_table_lookup(net, &res, table, fl6);
1136
1137         rcu_read_unlock();
1138
1139         return rt;
1140 }
1141
1142 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1143                                    const struct sk_buff *skb, int flags)
1144 {
1145         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1146 }
1147 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1148
1149 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1150                             const struct in6_addr *saddr, int oif,
1151                             const struct sk_buff *skb, int strict)
1152 {
1153         struct flowi6 fl6 = {
1154                 .flowi6_oif = oif,
1155                 .daddr = *daddr,
1156         };
1157         struct dst_entry *dst;
1158         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1159
1160         if (saddr) {
1161                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1162                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1163         }
1164
1165         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1166         if (dst->error == 0)
1167                 return (struct rt6_info *) dst;
1168
1169         dst_release(dst);
1170
1171         return NULL;
1172 }
1173 EXPORT_SYMBOL(rt6_lookup);
1174
1175 /* ip6_ins_rt is called with FREE table->tb6_lock.
1176  * It takes new route entry, the addition fails by any reason the
1177  * route is released.
1178  * Caller must hold dst before calling it.
1179  */
1180
1181 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1182                         struct netlink_ext_ack *extack)
1183 {
1184         int err;
1185         struct fib6_table *table;
1186
1187         table = rt->fib6_table;
1188         spin_lock_bh(&table->tb6_lock);
1189         err = fib6_add(&table->tb6_root, rt, info, extack);
1190         spin_unlock_bh(&table->tb6_lock);
1191
1192         return err;
1193 }
1194
1195 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1196 {
1197         struct nl_info info = { .nl_net = net, };
1198
1199         return __ip6_ins_rt(rt, &info, NULL);
1200 }
1201
1202 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1203                                            const struct in6_addr *daddr,
1204                                            const struct in6_addr *saddr)
1205 {
1206         struct fib6_info *f6i = res->f6i;
1207         struct net_device *dev;
1208         struct rt6_info *rt;
1209
1210         /*
1211          *      Clone the route.
1212          */
1213
1214         if (!fib6_info_hold_safe(f6i))
1215                 return NULL;
1216
1217         dev = ip6_rt_get_dev_rcu(res);
1218         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1219         if (!rt) {
1220                 fib6_info_release(f6i);
1221                 return NULL;
1222         }
1223
1224         ip6_rt_copy_init(rt, res);
1225         rt->rt6i_flags |= RTF_CACHE;
1226         rt->dst.flags |= DST_HOST;
1227         rt->rt6i_dst.addr = *daddr;
1228         rt->rt6i_dst.plen = 128;
1229
1230         if (!rt6_is_gw_or_nonexthop(res)) {
1231                 if (f6i->fib6_dst.plen != 128 &&
1232                     ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1233                         rt->rt6i_flags |= RTF_ANYCAST;
1234 #ifdef CONFIG_IPV6_SUBTREES
1235                 if (rt->rt6i_src.plen && saddr) {
1236                         rt->rt6i_src.addr = *saddr;
1237                         rt->rt6i_src.plen = 128;
1238                 }
1239 #endif
1240         }
1241
1242         return rt;
1243 }
1244
1245 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1246 {
1247         struct fib6_info *f6i = res->f6i;
1248         unsigned short flags = fib6_info_dst_flags(f6i);
1249         struct net_device *dev;
1250         struct rt6_info *pcpu_rt;
1251
1252         if (!fib6_info_hold_safe(f6i))
1253                 return NULL;
1254
1255         rcu_read_lock();
1256         dev = ip6_rt_get_dev_rcu(res);
1257         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1258         rcu_read_unlock();
1259         if (!pcpu_rt) {
1260                 fib6_info_release(f6i);
1261                 return NULL;
1262         }
1263         ip6_rt_copy_init(pcpu_rt, res);
1264         pcpu_rt->rt6i_flags |= RTF_PCPU;
1265         return pcpu_rt;
1266 }
1267
1268 /* It should be called with rcu_read_lock() acquired */
1269 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1270 {
1271         struct rt6_info *pcpu_rt, **p;
1272
1273         p = this_cpu_ptr(res->nh->rt6i_pcpu);
1274         pcpu_rt = *p;
1275
1276         if (pcpu_rt)
1277                 ip6_hold_safe(NULL, &pcpu_rt);
1278
1279         return pcpu_rt;
1280 }
1281
1282 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1283                                             const struct fib6_result *res)
1284 {
1285         struct rt6_info *pcpu_rt, *prev, **p;
1286
1287         pcpu_rt = ip6_rt_pcpu_alloc(res);
1288         if (!pcpu_rt) {
1289                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1290                 return net->ipv6.ip6_null_entry;
1291         }
1292
1293         dst_hold(&pcpu_rt->dst);
1294         p = this_cpu_ptr(res->nh->rt6i_pcpu);
1295         prev = cmpxchg(p, NULL, pcpu_rt);
1296         BUG_ON(prev);
1297
1298         if (res->f6i->fib6_destroying) {
1299                 struct fib6_info *from;
1300
1301                 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1302                 fib6_info_release(from);
1303         }
1304
1305         return pcpu_rt;
1306 }
1307
/* Exception hash table implementation.
 *
 * rt6_exception_lock is taken with spin_lock_bh() by the code below that
 * inserts into, removes from or flushes the per-route exception buckets.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1311
1312 /* Remove rt6_ex from hash table and free the memory
1313  * Caller must hold rt6_exception_lock
1314  */
1315 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1316                                  struct rt6_exception *rt6_ex)
1317 {
1318         struct fib6_info *from;
1319         struct net *net;
1320
1321         if (!bucket || !rt6_ex)
1322                 return;
1323
1324         net = dev_net(rt6_ex->rt6i->dst.dev);
1325         net->ipv6.rt6_stats->fib_rt_cache--;
1326
1327         /* purge completely the exception to allow releasing the held resources:
1328          * some [sk] cache may keep the dst around for unlimited time
1329          */
1330         from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1331         fib6_info_release(from);
1332         dst_dev_put(&rt6_ex->rt6i->dst);
1333
1334         hlist_del_rcu(&rt6_ex->hlist);
1335         dst_release(&rt6_ex->rt6i->dst);
1336         kfree_rcu(rt6_ex, rcu);
1337         WARN_ON_ONCE(!bucket->depth);
1338         bucket->depth--;
1339 }
1340
1341 /* Remove oldest rt6_ex in bucket and free the memory
1342  * Caller must hold rt6_exception_lock
1343  */
1344 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1345 {
1346         struct rt6_exception *rt6_ex, *oldest = NULL;
1347
1348         if (!bucket)
1349                 return;
1350
1351         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1352                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1353                         oldest = rt6_ex;
1354         }
1355         rt6_remove_exception(bucket, oldest);
1356 }
1357
1358 static u32 rt6_exception_hash(const struct in6_addr *dst,
1359                               const struct in6_addr *src)
1360 {
1361         static u32 seed __read_mostly;
1362         u32 val;
1363
1364         net_get_random_once(&seed, sizeof(seed));
1365         val = jhash(dst, sizeof(*dst), seed);
1366
1367 #ifdef CONFIG_IPV6_SUBTREES
1368         if (src)
1369                 val = jhash(src, sizeof(*src), val);
1370 #endif
1371         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1372 }
1373
1374 /* Helper function to find the cached rt in the hash table
1375  * and update bucket pointer to point to the bucket for this
1376  * (daddr, saddr) pair
1377  * Caller must hold rt6_exception_lock
1378  */
1379 static struct rt6_exception *
1380 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1381                               const struct in6_addr *daddr,
1382                               const struct in6_addr *saddr)
1383 {
1384         struct rt6_exception *rt6_ex;
1385         u32 hval;
1386
1387         if (!(*bucket) || !daddr)
1388                 return NULL;
1389
1390         hval = rt6_exception_hash(daddr, saddr);
1391         *bucket += hval;
1392
1393         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1394                 struct rt6_info *rt6 = rt6_ex->rt6i;
1395                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1396
1397 #ifdef CONFIG_IPV6_SUBTREES
1398                 if (matched && saddr)
1399                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1400 #endif
1401                 if (matched)
1402                         return rt6_ex;
1403         }
1404         return NULL;
1405 }
1406
1407 /* Helper function to find the cached rt in the hash table
1408  * and update bucket pointer to point to the bucket for this
1409  * (daddr, saddr) pair
1410  * Caller must hold rcu_read_lock()
1411  */
1412 static struct rt6_exception *
1413 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1414                          const struct in6_addr *daddr,
1415                          const struct in6_addr *saddr)
1416 {
1417         struct rt6_exception *rt6_ex;
1418         u32 hval;
1419
1420         WARN_ON_ONCE(!rcu_read_lock_held());
1421
1422         if (!(*bucket) || !daddr)
1423                 return NULL;
1424
1425         hval = rt6_exception_hash(daddr, saddr);
1426         *bucket += hval;
1427
1428         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1429                 struct rt6_info *rt6 = rt6_ex->rt6i;
1430                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1431
1432 #ifdef CONFIG_IPV6_SUBTREES
1433                 if (matched && saddr)
1434                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1435 #endif
1436                 if (matched)
1437                         return rt6_ex;
1438         }
1439         return NULL;
1440 }
1441
1442 static unsigned int fib6_mtu(const struct fib6_result *res)
1443 {
1444         const struct fib6_nh *nh = res->nh;
1445         unsigned int mtu;
1446
1447         if (res->f6i->fib6_pmtu) {
1448                 mtu = res->f6i->fib6_pmtu;
1449         } else {
1450                 struct net_device *dev = nh->fib_nh_dev;
1451                 struct inet6_dev *idev;
1452
1453                 rcu_read_lock();
1454                 idev = __in6_dev_get(dev);
1455                 mtu = idev->cnf.mtu6;
1456                 rcu_read_unlock();
1457         }
1458
1459         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1460
1461         return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1462 }
1463
1464 static int rt6_insert_exception(struct rt6_info *nrt,
1465                                 const struct fib6_result *res)
1466 {
1467         struct net *net = dev_net(nrt->dst.dev);
1468         struct rt6_exception_bucket *bucket;
1469         struct in6_addr *src_key = NULL;
1470         struct rt6_exception *rt6_ex;
1471         struct fib6_info *f6i = res->f6i;
1472         int err = 0;
1473
1474         spin_lock_bh(&rt6_exception_lock);
1475
1476         if (f6i->exception_bucket_flushed) {
1477                 err = -EINVAL;
1478                 goto out;
1479         }
1480
1481         bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
1482                                         lockdep_is_held(&rt6_exception_lock));
1483         if (!bucket) {
1484                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1485                                  GFP_ATOMIC);
1486                 if (!bucket) {
1487                         err = -ENOMEM;
1488                         goto out;
1489                 }
1490                 rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
1491         }
1492
1493 #ifdef CONFIG_IPV6_SUBTREES
1494         /* fib6_src.plen != 0 indicates f6i is in subtree
1495          * and exception table is indexed by a hash of
1496          * both fib6_dst and fib6_src.
1497          * Otherwise, the exception table is indexed by
1498          * a hash of only fib6_dst.
1499          */
1500         if (f6i->fib6_src.plen)
1501                 src_key = &nrt->rt6i_src.addr;
1502 #endif
1503         /* rt6_mtu_change() might lower mtu on f6i.
1504          * Only insert this exception route if its mtu
1505          * is less than f6i's mtu value.
1506          */
1507         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1508                 err = -EINVAL;
1509                 goto out;
1510         }
1511
1512         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1513                                                src_key);
1514         if (rt6_ex)
1515                 rt6_remove_exception(bucket, rt6_ex);
1516
1517         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1518         if (!rt6_ex) {
1519                 err = -ENOMEM;
1520                 goto out;
1521         }
1522         rt6_ex->rt6i = nrt;
1523         rt6_ex->stamp = jiffies;
1524         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1525         bucket->depth++;
1526         net->ipv6.rt6_stats->fib_rt_cache++;
1527
1528         if (bucket->depth > FIB6_MAX_DEPTH)
1529                 rt6_exception_remove_oldest(bucket);
1530
1531 out:
1532         spin_unlock_bh(&rt6_exception_lock);
1533
1534         /* Update fn->fn_sernum to invalidate all cached dst */
1535         if (!err) {
1536                 spin_lock_bh(&f6i->fib6_table->tb6_lock);
1537                 fib6_update_sernum(net, f6i);
1538                 spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1539                 fib6_force_start_gc(net);
1540         }
1541
1542         return err;
1543 }
1544
1545 void rt6_flush_exceptions(struct fib6_info *rt)
1546 {
1547         struct rt6_exception_bucket *bucket;
1548         struct rt6_exception *rt6_ex;
1549         struct hlist_node *tmp;
1550         int i;
1551
1552         spin_lock_bh(&rt6_exception_lock);
1553         /* Prevent rt6_insert_exception() to recreate the bucket list */
1554         rt->exception_bucket_flushed = 1;
1555
1556         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1557                                     lockdep_is_held(&rt6_exception_lock));
1558         if (!bucket)
1559                 goto out;
1560
1561         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1562                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1563                         rt6_remove_exception(bucket, rt6_ex);
1564                 WARN_ON_ONCE(bucket->depth);
1565                 bucket++;
1566         }
1567
1568 out:
1569         spin_unlock_bh(&rt6_exception_lock);
1570 }
1571
1572 /* Find cached rt in the hash table inside passed in rt
1573  * Caller has to hold rcu_read_lock()
1574  */
1575 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1576                                            const struct in6_addr *daddr,
1577                                            const struct in6_addr *saddr)
1578 {
1579         const struct in6_addr *src_key = NULL;
1580         struct rt6_exception_bucket *bucket;
1581         struct rt6_exception *rt6_ex;
1582         struct rt6_info *ret = NULL;
1583
1584 #ifdef CONFIG_IPV6_SUBTREES
1585         /* fib6i_src.plen != 0 indicates f6i is in subtree
1586          * and exception table is indexed by a hash of
1587          * both fib6_dst and fib6_src.
1588          * However, the src addr used to create the hash
1589          * might not be exactly the passed in saddr which
1590          * is a /128 addr from the flow.
1591          * So we need to use f6i->fib6_src to redo lookup
1592          * if the passed in saddr does not find anything.
1593          * (See the logic in ip6_rt_cache_alloc() on how
1594          * rt->rt6i_src is updated.)
1595          */
1596         if (res->f6i->fib6_src.plen)
1597                 src_key = saddr;
1598 find_ex:
1599 #endif
1600         bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
1601         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1602
1603         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1604                 ret = rt6_ex->rt6i;
1605
1606 #ifdef CONFIG_IPV6_SUBTREES
1607         /* Use fib6_src as src_key and redo lookup */
1608         if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1609                 src_key = &res->f6i->fib6_src.addr;
1610                 goto find_ex;
1611         }
1612 #endif
1613
1614         return ret;
1615 }
1616
1617 /* Remove the passed in cached rt from the hash table that contains it */
1618 static int rt6_remove_exception_rt(struct rt6_info *rt)
1619 {
1620         struct rt6_exception_bucket *bucket;
1621         struct in6_addr *src_key = NULL;
1622         struct rt6_exception *rt6_ex;
1623         struct fib6_info *from;
1624         int err;
1625
1626         from = rcu_dereference(rt->from);
1627         if (!from ||
1628             !(rt->rt6i_flags & RTF_CACHE))
1629                 return -EINVAL;
1630
1631         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1632                 return -ENOENT;
1633
1634         spin_lock_bh(&rt6_exception_lock);
1635         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1636                                     lockdep_is_held(&rt6_exception_lock));
1637 #ifdef CONFIG_IPV6_SUBTREES
1638         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1639          * and exception table is indexed by a hash of
1640          * both rt6i_dst and rt6i_src.
1641          * Otherwise, the exception table is indexed by
1642          * a hash of only rt6i_dst.
1643          */
1644         if (from->fib6_src.plen)
1645                 src_key = &rt->rt6i_src.addr;
1646 #endif
1647         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1648                                                &rt->rt6i_dst.addr,
1649                                                src_key);
1650         if (rt6_ex) {
1651                 rt6_remove_exception(bucket, rt6_ex);
1652                 err = 0;
1653         } else {
1654                 err = -ENOENT;
1655         }
1656
1657         spin_unlock_bh(&rt6_exception_lock);
1658         return err;
1659 }
1660
1661 /* Find rt6_ex which contains the passed in rt cache and
1662  * refresh its stamp
1663  */
1664 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1665 {
1666         struct rt6_exception_bucket *bucket;
1667         struct in6_addr *src_key = NULL;
1668         struct rt6_exception *rt6_ex;
1669         struct fib6_info *from;
1670
1671         rcu_read_lock();
1672         from = rcu_dereference(rt->from);
1673         if (!from || !(rt->rt6i_flags & RTF_CACHE))
1674                 goto unlock;
1675
1676         bucket = rcu_dereference(from->rt6i_exception_bucket);
1677
1678 #ifdef CONFIG_IPV6_SUBTREES
1679         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1680          * and exception table is indexed by a hash of
1681          * both rt6i_dst and rt6i_src.
1682          * Otherwise, the exception table is indexed by
1683          * a hash of only rt6i_dst.
1684          */
1685         if (from->fib6_src.plen)
1686                 src_key = &rt->rt6i_src.addr;
1687 #endif
1688         rt6_ex = __rt6_find_exception_rcu(&bucket,
1689                                           &rt->rt6i_dst.addr,
1690                                           src_key);
1691         if (rt6_ex)
1692                 rt6_ex->stamp = jiffies;
1693
1694 unlock:
1695         rcu_read_unlock();
1696 }
1697
1698 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1699                                          struct rt6_info *rt, int mtu)
1700 {
1701         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1702          * lowest MTU in the path: always allow updating the route PMTU to
1703          * reflect PMTU decreases.
1704          *
1705          * If the new MTU is higher, and the route PMTU is equal to the local
1706          * MTU, this means the old MTU is the lowest in the path, so allow
1707          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1708          * handle this.
1709          */
1710
1711         if (dst_mtu(&rt->dst) >= mtu)
1712                 return true;
1713
1714         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1715                 return true;
1716
1717         return false;
1718 }
1719
1720 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1721                                        struct fib6_info *rt, int mtu)
1722 {
1723         struct rt6_exception_bucket *bucket;
1724         struct rt6_exception *rt6_ex;
1725         int i;
1726
1727         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1728                                         lockdep_is_held(&rt6_exception_lock));
1729
1730         if (!bucket)
1731                 return;
1732
1733         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1734                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1735                         struct rt6_info *entry = rt6_ex->rt6i;
1736
1737                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1738                          * route), the metrics of its rt->from have already
1739                          * been updated.
1740                          */
1741                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1742                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1743                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1744                 }
1745                 bucket++;
1746         }
1747 }
1748
1749 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1750
1751 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1752                                         struct in6_addr *gateway)
1753 {
1754         struct rt6_exception_bucket *bucket;
1755         struct rt6_exception *rt6_ex;
1756         struct hlist_node *tmp;
1757         int i;
1758
1759         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1760                 return;
1761
1762         spin_lock_bh(&rt6_exception_lock);
1763         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1764                                      lockdep_is_held(&rt6_exception_lock));
1765
1766         if (bucket) {
1767                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1768                         hlist_for_each_entry_safe(rt6_ex, tmp,
1769                                                   &bucket->chain, hlist) {
1770                                 struct rt6_info *entry = rt6_ex->rt6i;
1771
1772                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1773                                     RTF_CACHE_GATEWAY &&
1774                                     ipv6_addr_equal(gateway,
1775                                                     &entry->rt6i_gateway)) {
1776                                         rt6_remove_exception(bucket, rt6_ex);
1777                                 }
1778                         }
1779                         bucket++;
1780                 }
1781         }
1782
1783         spin_unlock_bh(&rt6_exception_lock);
1784 }
1785
1786 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1787                                       struct rt6_exception *rt6_ex,
1788                                       struct fib6_gc_args *gc_args,
1789                                       unsigned long now)
1790 {
1791         struct rt6_info *rt = rt6_ex->rt6i;
1792
1793         /* we are pruning and obsoleting aged-out and non gateway exceptions
1794          * even if others have still references to them, so that on next
1795          * dst_check() such references can be dropped.
1796          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1797          * expired, independently from their aging, as per RFC 8201 section 4
1798          */
1799         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1800                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1801                         RT6_TRACE("aging clone %p\n", rt);
1802                         rt6_remove_exception(bucket, rt6_ex);
1803                         return;
1804                 }
1805         } else if (time_after(jiffies, rt->dst.expires)) {
1806                 RT6_TRACE("purging expired route %p\n", rt);
1807                 rt6_remove_exception(bucket, rt6_ex);
1808                 return;
1809         }
1810
1811         if (rt->rt6i_flags & RTF_GATEWAY) {
1812                 struct neighbour *neigh;
1813                 __u8 neigh_flags = 0;
1814
1815                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1816                 if (neigh)
1817                         neigh_flags = neigh->flags;
1818
1819                 if (!(neigh_flags & NTF_ROUTER)) {
1820                         RT6_TRACE("purging route %p via non-router but gateway\n",
1821                                   rt);
1822                         rt6_remove_exception(bucket, rt6_ex);
1823                         return;
1824                 }
1825         }
1826
1827         gc_args->more++;
1828 }
1829
1830 void rt6_age_exceptions(struct fib6_info *rt,
1831                         struct fib6_gc_args *gc_args,
1832                         unsigned long now)
1833 {
1834         struct rt6_exception_bucket *bucket;
1835         struct rt6_exception *rt6_ex;
1836         struct hlist_node *tmp;
1837         int i;
1838
1839         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1840                 return;
1841
1842         rcu_read_lock_bh();
1843         spin_lock(&rt6_exception_lock);
1844         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1845                                     lockdep_is_held(&rt6_exception_lock));
1846
1847         if (bucket) {
1848                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1849                         hlist_for_each_entry_safe(rt6_ex, tmp,
1850                                                   &bucket->chain, hlist) {
1851                                 rt6_age_examine_exception(bucket, rt6_ex,
1852                                                           gc_args, now);
1853                         }
1854                         bucket++;
1855                 }
1856         }
1857         spin_unlock(&rt6_exception_lock);
1858         rcu_read_unlock_bh();
1859 }
1860
1861 /* must be called with rcu lock held */
1862 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
1863                       struct flowi6 *fl6, struct fib6_result *res, int strict)
1864 {
1865         struct fib6_node *fn, *saved_fn;
1866
1867         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1868         saved_fn = fn;
1869
1870         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1871                 oif = 0;
1872
1873 redo_rt6_select:
1874         rt6_select(net, fn, oif, res, strict);
1875         if (res->f6i == net->ipv6.fib6_null_entry) {
1876                 fn = fib6_backtrack(fn, &fl6->saddr);
1877                 if (fn)
1878                         goto redo_rt6_select;
1879                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1880                         /* also consider unreachable route */
1881                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1882                         fn = saved_fn;
1883                         goto redo_rt6_select;
1884                 }
1885         }
1886
1887         trace_fib6_table_lookup(net, res, table, fl6);
1888
1889         return 0;
1890 }
1891
1892 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1893                                int oif, struct flowi6 *fl6,
1894                                const struct sk_buff *skb, int flags)
1895 {
1896         struct fib6_result res = {};
1897         struct rt6_info *rt;
1898         int strict = 0;
1899
1900         strict |= flags & RT6_LOOKUP_F_IFACE;
1901         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1902         if (net->ipv6.devconf_all->forwarding == 0)
1903                 strict |= RT6_LOOKUP_F_REACHABLE;
1904
1905         rcu_read_lock();
1906
1907         fib6_table_lookup(net, table, oif, fl6, &res, strict);
1908         if (res.f6i == net->ipv6.fib6_null_entry) {
1909                 rt = net->ipv6.ip6_null_entry;
1910                 rcu_read_unlock();
1911                 dst_hold(&rt->dst);
1912                 return rt;
1913         }
1914
1915         fib6_select_path(net, &res, fl6, oif, false, skb, strict);
1916
1917         /*Search through exception table */
1918         rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1919         if (rt) {
1920                 if (ip6_hold_safe(net, &rt))
1921                         dst_use_noref(&rt->dst, jiffies);
1922
1923                 rcu_read_unlock();
1924                 return rt;
1925         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1926                             !res.nh->fib_nh_gw_family)) {
1927                 /* Create a RTF_CACHE clone which will not be
1928                  * owned by the fib6 tree.  It is for the special case where
1929                  * the daddr in the skb during the neighbor look-up is different
1930                  * from the fl6->daddr used to look-up route here.
1931                  */
1932                 struct rt6_info *uncached_rt;
1933
1934                 uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
1935
1936                 rcu_read_unlock();
1937
1938                 if (uncached_rt) {
1939                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1940                          * No need for another dst_hold()
1941                          */
1942                         rt6_uncached_list_add(uncached_rt);
1943                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1944                 } else {
1945                         uncached_rt = net->ipv6.ip6_null_entry;
1946                         dst_hold(&uncached_rt->dst);
1947                 }
1948
1949                 return uncached_rt;
1950         } else {
1951                 /* Get a percpu copy */
1952
1953                 struct rt6_info *pcpu_rt;
1954
1955                 local_bh_disable();
1956                 pcpu_rt = rt6_get_pcpu_route(&res);
1957
1958                 if (!pcpu_rt)
1959                         pcpu_rt = rt6_make_pcpu_route(net, &res);
1960
1961                 local_bh_enable();
1962                 rcu_read_unlock();
1963
1964                 return pcpu_rt;
1965         }
1966 }
1967 EXPORT_SYMBOL_GPL(ip6_pol_route);
1968
1969 static struct rt6_info *ip6_pol_route_input(struct net *net,
1970                                             struct fib6_table *table,
1971                                             struct flowi6 *fl6,
1972                                             const struct sk_buff *skb,
1973                                             int flags)
1974 {
1975         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1976 }
1977
1978 struct dst_entry *ip6_route_input_lookup(struct net *net,
1979                                          struct net_device *dev,
1980                                          struct flowi6 *fl6,
1981                                          const struct sk_buff *skb,
1982                                          int flags)
1983 {
1984         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1985                 flags |= RT6_LOOKUP_F_IFACE;
1986
1987         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1988 }
1989 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1990
1991 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1992                                   struct flow_keys *keys,
1993                                   struct flow_keys *flkeys)
1994 {
1995         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1996         const struct ipv6hdr *key_iph = outer_iph;
1997         struct flow_keys *_flkeys = flkeys;
1998         const struct ipv6hdr *inner_iph;
1999         const struct icmp6hdr *icmph;
2000         struct ipv6hdr _inner_iph;
2001         struct icmp6hdr _icmph;
2002
2003         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2004                 goto out;
2005
2006         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2007                                    sizeof(_icmph), &_icmph);
2008         if (!icmph)
2009                 goto out;
2010
2011         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2012             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2013             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2014             icmph->icmp6_type != ICMPV6_PARAMPROB)
2015                 goto out;
2016
2017         inner_iph = skb_header_pointer(skb,
2018                                        skb_transport_offset(skb) + sizeof(*icmph),
2019                                        sizeof(_inner_iph), &_inner_iph);
2020         if (!inner_iph)
2021                 goto out;
2022
2023         key_iph = inner_iph;
2024         _flkeys = NULL;
2025 out:
2026         if (_flkeys) {
2027                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2028                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2029                 keys->tags.flow_label = _flkeys->tags.flow_label;
2030                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
2031         } else {
2032                 keys->addrs.v6addrs.src = key_iph->saddr;
2033                 keys->addrs.v6addrs.dst = key_iph->daddr;
2034                 keys->tags.flow_label = ip6_flowlabel(key_iph);
2035                 keys->basic.ip_proto = key_iph->nexthdr;
2036         }
2037 }
2038
2039 /* if skb is set it will be used and fl6 can be NULL */
2040 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2041                        const struct sk_buff *skb, struct flow_keys *flkeys)
2042 {
2043         struct flow_keys hash_keys;
2044         u32 mhash;
2045
2046         switch (ip6_multipath_hash_policy(net)) {
2047         case 0:
2048                 memset(&hash_keys, 0, sizeof(hash_keys));
2049                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2050                 if (skb) {
2051                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2052                 } else {
2053                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2054                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2055                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2056                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2057                 }
2058                 break;
2059         case 1:
2060                 if (skb) {
2061                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2062                         struct flow_keys keys;
2063
2064                         /* short-circuit if we already have L4 hash present */
2065                         if (skb->l4_hash)
2066                                 return skb_get_hash_raw(skb) >> 1;
2067
2068                         memset(&hash_keys, 0, sizeof(hash_keys));
2069
2070                         if (!flkeys) {
2071                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2072                                 flkeys = &keys;
2073                         }
2074                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2075                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2076                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2077                         hash_keys.ports.src = flkeys->ports.src;
2078                         hash_keys.ports.dst = flkeys->ports.dst;
2079                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2080                 } else {
2081                         memset(&hash_keys, 0, sizeof(hash_keys));
2082                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2083                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2084                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2085                         hash_keys.ports.src = fl6->fl6_sport;
2086                         hash_keys.ports.dst = fl6->fl6_dport;
2087                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2088                 }
2089                 break;
2090         }
2091         mhash = flow_hash_from_keys(&hash_keys);
2092
2093         return mhash >> 1;
2094 }
2095
2096 void ip6_route_input(struct sk_buff *skb)
2097 {
2098         const struct ipv6hdr *iph = ipv6_hdr(skb);
2099         struct net *net = dev_net(skb->dev);
2100         int flags = RT6_LOOKUP_F_HAS_SADDR;
2101         struct ip_tunnel_info *tun_info;
2102         struct flowi6 fl6 = {
2103                 .flowi6_iif = skb->dev->ifindex,
2104                 .daddr = iph->daddr,
2105                 .saddr = iph->saddr,
2106                 .flowlabel = ip6_flowinfo(iph),
2107                 .flowi6_mark = skb->mark,
2108                 .flowi6_proto = iph->nexthdr,
2109         };
2110         struct flow_keys *flkeys = NULL, _flkeys;
2111
2112         tun_info = skb_tunnel_info(skb);
2113         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2114                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2115
2116         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2117                 flkeys = &_flkeys;
2118
2119         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2120                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2121         skb_dst_drop(skb);
2122         skb_dst_set(skb,
2123                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2124 }
2125
2126 static struct rt6_info *ip6_pol_route_output(struct net *net,
2127                                              struct fib6_table *table,
2128                                              struct flowi6 *fl6,
2129                                              const struct sk_buff *skb,
2130                                              int flags)
2131 {
2132         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2133 }
2134
2135 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2136                                          struct flowi6 *fl6, int flags)
2137 {
2138         bool any_src;
2139
2140         if (ipv6_addr_type(&fl6->daddr) &
2141             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2142                 struct dst_entry *dst;
2143
2144                 dst = l3mdev_link_scope_lookup(net, fl6);
2145                 if (dst)
2146                         return dst;
2147         }
2148
2149         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2150
2151         any_src = ipv6_addr_any(&fl6->saddr);
2152         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2153             (fl6->flowi6_oif && any_src))
2154                 flags |= RT6_LOOKUP_F_IFACE;
2155
2156         if (!any_src)
2157                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2158         else if (sk)
2159                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2160
2161         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2162 }
2163 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2164
2165 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2166 {
2167         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2168         struct net_device *loopback_dev = net->loopback_dev;
2169         struct dst_entry *new = NULL;
2170
2171         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2172                        DST_OBSOLETE_DEAD, 0);
2173         if (rt) {
2174                 rt6_info_init(rt);
2175                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2176
2177                 new = &rt->dst;
2178                 new->__use = 1;
2179                 new->input = dst_discard;
2180                 new->output = dst_discard_out;
2181
2182                 dst_copy_metrics(new, &ort->dst);
2183
2184                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2185                 rt->rt6i_gateway = ort->rt6i_gateway;
2186                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2187
2188                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2189 #ifdef CONFIG_IPV6_SUBTREES
2190                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2191 #endif
2192         }
2193
2194         dst_release(dst_orig);
2195         return new ? new : ERR_PTR(-ENOMEM);
2196 }
2197
2198 /*
2199  *      Destination cache support functions
2200  */
2201
2202 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2203 {
2204         u32 rt_cookie = 0;
2205
2206         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2207                 return false;
2208
2209         if (fib6_check_expired(f6i))
2210                 return false;
2211
2212         return true;
2213 }
2214
2215 static struct dst_entry *rt6_check(struct rt6_info *rt,
2216                                    struct fib6_info *from,
2217                                    u32 cookie)
2218 {
2219         u32 rt_cookie = 0;
2220
2221         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2222             rt_cookie != cookie)
2223                 return NULL;
2224
2225         if (rt6_check_expired(rt))
2226                 return NULL;
2227
2228         return &rt->dst;
2229 }
2230
2231 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2232                                             struct fib6_info *from,
2233                                             u32 cookie)
2234 {
2235         if (!__rt6_check_expired(rt) &&
2236             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2237             fib6_check(from, cookie))
2238                 return &rt->dst;
2239         else
2240                 return NULL;
2241 }
2242
2243 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2244 {
2245         struct dst_entry *dst_ret;
2246         struct fib6_info *from;
2247         struct rt6_info *rt;
2248
2249         rt = container_of(dst, struct rt6_info, dst);
2250
2251         rcu_read_lock();
2252
2253         /* All IPV6 dsts are created with ->obsolete set to the value
2254          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2255          * into this function always.
2256          */
2257
2258         from = rcu_dereference(rt->from);
2259
2260         if (from && (rt->rt6i_flags & RTF_PCPU ||
2261             unlikely(!list_empty(&rt->rt6i_uncached))))
2262                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2263         else
2264                 dst_ret = rt6_check(rt, from, cookie);
2265
2266         rcu_read_unlock();
2267
2268         return dst_ret;
2269 }
2270
2271 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2272 {
2273         struct rt6_info *rt = (struct rt6_info *) dst;
2274
2275         if (rt) {
2276                 if (rt->rt6i_flags & RTF_CACHE) {
2277                         rcu_read_lock();
2278                         if (rt6_check_expired(rt)) {
2279                                 rt6_remove_exception_rt(rt);
2280                                 dst = NULL;
2281                         }
2282                         rcu_read_unlock();
2283                 } else {
2284                         dst_release(dst);
2285                         dst = NULL;
2286                 }
2287         }
2288         return dst;
2289 }
2290
2291 static void ip6_link_failure(struct sk_buff *skb)
2292 {
2293         struct rt6_info *rt;
2294
2295         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2296
2297         rt = (struct rt6_info *) skb_dst(skb);
2298         if (rt) {
2299                 rcu_read_lock();
2300                 if (rt->rt6i_flags & RTF_CACHE) {
2301                         rt6_remove_exception_rt(rt);
2302                 } else {
2303                         struct fib6_info *from;
2304                         struct fib6_node *fn;
2305
2306                         from = rcu_dereference(rt->from);
2307                         if (from) {
2308                                 fn = rcu_dereference(from->fib6_node);
2309                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2310                                         fn->fn_sernum = -1;
2311                         }
2312                 }
2313                 rcu_read_unlock();
2314         }
2315 }
2316
2317 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2318 {
2319         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2320                 struct fib6_info *from;
2321
2322                 rcu_read_lock();
2323                 from = rcu_dereference(rt0->from);
2324                 if (from)
2325                         rt0->dst.expires = from->expires;
2326                 rcu_read_unlock();
2327         }
2328
2329         dst_set_expires(&rt0->dst, timeout);
2330         rt0->rt6i_flags |= RTF_EXPIRES;
2331 }
2332
2333 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2334 {
2335         struct net *net = dev_net(rt->dst.dev);
2336
2337         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2338         rt->rt6i_flags |= RTF_MODIFIED;
2339         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2340 }
2341
2342 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2343 {
2344         return !(rt->rt6i_flags & RTF_CACHE) &&
2345                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2346 }
2347
2348 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2349                                  const struct ipv6hdr *iph, u32 mtu)
2350 {
2351         const struct in6_addr *daddr, *saddr;
2352         struct rt6_info *rt6 = (struct rt6_info *)dst;
2353
2354         if (dst_metric_locked(dst, RTAX_MTU))
2355                 return;
2356
2357         if (iph) {
2358                 daddr = &iph->daddr;
2359                 saddr = &iph->saddr;
2360         } else if (sk) {
2361                 daddr = &sk->sk_v6_daddr;
2362                 saddr = &inet6_sk(sk)->saddr;
2363         } else {
2364                 daddr = NULL;
2365                 saddr = NULL;
2366         }
2367         dst_confirm_neigh(dst, daddr);
2368         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2369         if (mtu >= dst_mtu(dst))
2370                 return;
2371
2372         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2373                 rt6_do_update_pmtu(rt6, mtu);
2374                 /* update rt6_ex->stamp for cache */
2375                 if (rt6->rt6i_flags & RTF_CACHE)
2376                         rt6_update_exception_stamp_rt(rt6);
2377         } else if (daddr) {
2378                 struct fib6_result res = {};
2379                 struct rt6_info *nrt6;
2380
2381                 rcu_read_lock();
2382                 res.f6i = rcu_dereference(rt6->from);
2383                 if (!res.f6i) {
2384                         rcu_read_unlock();
2385                         return;
2386                 }
2387                 res.nh = &res.f6i->fib6_nh;
2388                 res.fib6_flags = res.f6i->fib6_flags;
2389                 res.fib6_type = res.f6i->fib6_type;
2390
2391                 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2392                 if (nrt6) {
2393                         rt6_do_update_pmtu(nrt6, mtu);
2394                         if (rt6_insert_exception(nrt6, &res))
2395                                 dst_release_immediate(&nrt6->dst);
2396                 }
2397                 rcu_read_unlock();
2398         }
2399 }
2400
2401 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2402                                struct sk_buff *skb, u32 mtu)
2403 {
2404         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2405 }
2406
2407 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2408                      int oif, u32 mark, kuid_t uid)
2409 {
2410         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2411         struct dst_entry *dst;
2412         struct flowi6 fl6 = {
2413                 .flowi6_oif = oif,
2414                 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2415                 .daddr = iph->daddr,
2416                 .saddr = iph->saddr,
2417                 .flowlabel = ip6_flowinfo(iph),
2418                 .flowi6_uid = uid,
2419         };
2420
2421         dst = ip6_route_output(net, NULL, &fl6);
2422         if (!dst->error)
2423                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2424         dst_release(dst);
2425 }
2426 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2427
2428 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2429 {
2430         int oif = sk->sk_bound_dev_if;
2431         struct dst_entry *dst;
2432
2433         if (!oif && skb->dev)
2434                 oif = l3mdev_master_ifindex(skb->dev);
2435
2436         ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2437
2438         dst = __sk_dst_get(sk);
2439         if (!dst || !dst->obsolete ||
2440             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2441                 return;
2442
2443         bh_lock_sock(sk);
2444         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2445                 ip6_datagram_dst_update(sk, false);
2446         bh_unlock_sock(sk);
2447 }
2448 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2449
2450 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2451                            const struct flowi6 *fl6)
2452 {
2453 #ifdef CONFIG_IPV6_SUBTREES
2454         struct ipv6_pinfo *np = inet6_sk(sk);
2455 #endif
2456
2457         ip6_dst_store(sk, dst,
2458                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2459                       &sk->sk_v6_daddr : NULL,
2460 #ifdef CONFIG_IPV6_SUBTREES
2461                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2462                       &np->saddr :
2463 #endif
2464                       NULL);
2465 }
2466
2467 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2468                                   struct flowi6 *fl6,
2469                                   const struct in6_addr *gw,
2470                                   struct rt6_info **ret)
2471 {
2472         const struct fib6_nh *nh = res->nh;
2473
2474         if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2475             fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2476                 return false;
2477
2478         /* rt_cache's gateway might be different from its 'parent'
2479          * in the case of an ip redirect.
2480          * So we keep searching in the exception table if the gateway
2481          * is different.
2482          */
2483         if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2484                 struct rt6_info *rt_cache;
2485
2486                 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2487                 if (rt_cache &&
2488                     ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2489                         *ret = rt_cache;
2490                         return true;
2491                 }
2492                 return false;
2493         }
2494         return true;
2495 }
2496
2497 /* Handle redirects */
2498 struct ip6rd_flowi {
2499         struct flowi6 fl6;
2500         struct in6_addr gateway;
2501 };
2502
2503 static struct rt6_info *__ip6_route_redirect(struct net *net,
2504                                              struct fib6_table *table,
2505                                              struct flowi6 *fl6,
2506                                              const struct sk_buff *skb,
2507                                              int flags)
2508 {
2509         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2510         struct rt6_info *ret = NULL;
2511         struct fib6_result res = {};
2512         struct fib6_info *rt;
2513         struct fib6_node *fn;
2514
2515         /* Get the "current" route for this destination and
2516          * check if the redirect has come from appropriate router.
2517          *
2518          * RFC 4861 specifies that redirects should only be
2519          * accepted if they come from the nexthop to the target.
2520          * Due to the way the routes are chosen, this notion
2521          * is a bit fuzzy and one might need to check all possible
2522          * routes.
2523          */
2524
2525         rcu_read_lock();
2526         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2527 restart:
2528         for_each_fib6_node_rt_rcu(fn) {
2529                 res.f6i = rt;
2530                 res.nh = &rt->fib6_nh;
2531
2532                 if (fib6_check_expired(rt))
2533                         continue;
2534                 if (rt->fib6_flags & RTF_REJECT)
2535                         break;
2536                 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2537                         goto out;
2538         }
2539
2540         if (!rt)
2541                 rt = net->ipv6.fib6_null_entry;
2542         else if (rt->fib6_flags & RTF_REJECT) {
2543                 ret = net->ipv6.ip6_null_entry;
2544                 goto out;
2545         }
2546
2547         if (rt == net->ipv6.fib6_null_entry) {
2548                 fn = fib6_backtrack(fn, &fl6->saddr);
2549                 if (fn)
2550                         goto restart;
2551         }
2552
2553         res.f6i = rt;
2554         res.nh = &rt->fib6_nh;
2555 out:
2556         if (ret) {
2557                 ip6_hold_safe(net, &ret);
2558         } else {
2559                 res.fib6_flags = res.f6i->fib6_flags;
2560                 res.fib6_type = res.f6i->fib6_type;
2561                 ret = ip6_create_rt_rcu(&res);
2562         }
2563
2564         rcu_read_unlock();
2565
2566         trace_fib6_table_lookup(net, &res, table, fl6);
2567         return ret;
2568 };
2569
2570 static struct dst_entry *ip6_route_redirect(struct net *net,
2571                                             const struct flowi6 *fl6,
2572                                             const struct sk_buff *skb,
2573                                             const struct in6_addr *gateway)
2574 {
2575         int flags = RT6_LOOKUP_F_HAS_SADDR;
2576         struct ip6rd_flowi rdfl;
2577
2578         rdfl.fl6 = *fl6;
2579         rdfl.gateway = *gateway;
2580
2581         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2582                                 flags, __ip6_route_redirect);
2583 }
2584
2585 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2586                   kuid_t uid)
2587 {
2588         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2589         struct dst_entry *dst;
2590         struct flowi6 fl6 = {
2591                 .flowi6_iif = LOOPBACK_IFINDEX,
2592                 .flowi6_oif = oif,
2593                 .flowi6_mark = mark,
2594                 .daddr = iph->daddr,
2595                 .saddr = iph->saddr,
2596                 .flowlabel = ip6_flowinfo(iph),
2597                 .flowi6_uid = uid,
2598         };
2599
2600         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2601         rt6_do_redirect(dst, NULL, skb);
2602         dst_release(dst);
2603 }
2604 EXPORT_SYMBOL_GPL(ip6_redirect);
2605
2606 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2607 {
2608         const struct ipv6hdr *iph = ipv6_hdr(skb);
2609         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2610         struct dst_entry *dst;
2611         struct flowi6 fl6 = {
2612                 .flowi6_iif = LOOPBACK_IFINDEX,
2613                 .flowi6_oif = oif,
2614                 .daddr = msg->dest,
2615                 .saddr = iph->daddr,
2616                 .flowi6_uid = sock_net_uid(net, NULL),
2617         };
2618
2619         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2620         rt6_do_redirect(dst, NULL, skb);
2621         dst_release(dst);
2622 }
2623
2624 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2625 {
2626         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2627                      sk->sk_uid);
2628 }
2629 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2630
2631 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2632 {
2633         struct net_device *dev = dst->dev;
2634         unsigned int mtu = dst_mtu(dst);
2635         struct net *net = dev_net(dev);
2636
2637         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2638
2639         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2640                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2641
2642         /*
2643          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2644          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2645          * IPV6_MAXPLEN is also valid and means: "any MSS,
2646          * rely only on pmtu discovery"
2647          */
2648         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2649                 mtu = IPV6_MAXPLEN;
2650         return mtu;
2651 }
2652
2653 static unsigned int ip6_mtu(const struct dst_entry *dst)
2654 {
2655         struct inet6_dev *idev;
2656         unsigned int mtu;
2657
2658         mtu = dst_metric_raw(dst, RTAX_MTU);
2659         if (mtu)
2660                 goto out;
2661
2662         mtu = IPV6_MIN_MTU;
2663
2664         rcu_read_lock();
2665         idev = __in6_dev_get(dst->dev);
2666         if (idev)
2667                 mtu = idev->cnf.mtu6;
2668         rcu_read_unlock();
2669
2670 out:
2671         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2672
2673         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2674 }
2675
2676 /* MTU selection:
2677  * 1. mtu on route is locked - use it
2678  * 2. mtu from nexthop exception
2679  * 3. mtu from egress device
2680  *
2681  * based on ip6_dst_mtu_forward and exception logic of
2682  * rt6_find_cached_rt; called with rcu_read_lock
2683  */
2684 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2685                       const struct in6_addr *daddr,
2686                       const struct in6_addr *saddr)
2687 {
2688         const struct fib6_nh *nh = res->nh;
2689         struct fib6_info *f6i = res->f6i;
2690         struct inet6_dev *idev;
2691         struct rt6_info *rt;
2692         u32 mtu = 0;
2693
2694         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2695                 mtu = f6i->fib6_pmtu;
2696                 if (mtu)
2697                         goto out;
2698         }
2699
2700         rt = rt6_find_cached_rt(res, daddr, saddr);
2701         if (unlikely(rt)) {
2702                 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2703         } else {
2704                 struct net_device *dev = nh->fib_nh_dev;
2705
2706                 mtu = IPV6_MIN_MTU;
2707                 idev = __in6_dev_get(dev);
2708                 if (idev && idev->cnf.mtu6 > mtu)
2709                         mtu = idev->cnf.mtu6;
2710         }
2711
2712         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2713 out:
2714         return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2715 }
2716
2717 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2718                                   struct flowi6 *fl6)
2719 {
2720         struct dst_entry *dst;
2721         struct rt6_info *rt;
2722         struct inet6_dev *idev = in6_dev_get(dev);
2723         struct net *net = dev_net(dev);
2724
2725         if (unlikely(!idev))
2726                 return ERR_PTR(-ENODEV);
2727
2728         rt = ip6_dst_alloc(net, dev, 0);
2729         if (unlikely(!rt)) {
2730                 in6_dev_put(idev);
2731                 dst = ERR_PTR(-ENOMEM);
2732                 goto out;
2733         }
2734
2735         rt->dst.flags |= DST_HOST;
2736         rt->dst.input = ip6_input;
2737         rt->dst.output  = ip6_output;
2738         rt->rt6i_gateway  = fl6->daddr;
2739         rt->rt6i_dst.addr = fl6->daddr;
2740         rt->rt6i_dst.plen = 128;
2741         rt->rt6i_idev     = idev;
2742         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2743
2744         /* Add this dst into uncached_list so that rt6_disable_ip() can
2745          * do proper release of the net_device
2746          */
2747         rt6_uncached_list_add(rt);
2748         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2749
2750         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2751
2752 out:
2753         return dst;
2754 }
2755
2756 static int ip6_dst_gc(struct dst_ops *ops)
2757 {
2758         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2759         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2760         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2761         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2762         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2763         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2764         int entries;
2765
2766         entries = dst_entries_get_fast(ops);
2767         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2768             entries <= rt_max_size)
2769                 goto out;
2770
2771         net->ipv6.ip6_rt_gc_expire++;
2772         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2773         entries = dst_entries_get_slow(ops);
2774         if (entries < ops->gc_thresh)
2775                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2776 out:
2777         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2778         return entries > rt_max_size;
2779 }
2780
2781 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2782                                             struct fib6_config *cfg,
2783                                             const struct in6_addr *gw_addr,
2784                                             u32 tbid, int flags)
2785 {
2786         struct flowi6 fl6 = {
2787                 .flowi6_oif = cfg->fc_ifindex,
2788                 .daddr = *gw_addr,
2789                 .saddr = cfg->fc_prefsrc,
2790         };
2791         struct fib6_table *table;
2792         struct rt6_info *rt;
2793
2794         table = fib6_get_table(net, tbid);
2795         if (!table)
2796                 return NULL;
2797
2798         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2799                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2800
2801         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2802         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2803
2804         /* if table lookup failed, fall back to full lookup */
2805         if (rt == net->ipv6.ip6_null_entry) {
2806                 ip6_rt_put(rt);
2807                 rt = NULL;
2808         }
2809
2810         return rt;
2811 }
2812
2813 static int ip6_route_check_nh_onlink(struct net *net,
2814                                      struct fib6_config *cfg,
2815                                      const struct net_device *dev,
2816                                      struct netlink_ext_ack *extack)
2817 {
2818         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2819         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2820         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2821         struct fib6_info *from;
2822         struct rt6_info *grt;
2823         int err;
2824
2825         err = 0;
2826         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2827         if (grt) {
2828                 rcu_read_lock();
2829                 from = rcu_dereference(grt->from);
2830                 if (!grt->dst.error &&
2831                     /* ignore match if it is the default route */
2832                     from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2833                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2834                         NL_SET_ERR_MSG(extack,
2835                                        "Nexthop has invalid gateway or device mismatch");
2836                         err = -EINVAL;
2837                 }
2838                 rcu_read_unlock();
2839
2840                 ip6_rt_put(grt);
2841         }
2842
2843         return err;
2844 }
2845
2846 static int ip6_route_check_nh(struct net *net,
2847                               struct fib6_config *cfg,
2848                               struct net_device **_dev,
2849                               struct inet6_dev **idev)
2850 {
2851         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2852         struct net_device *dev = _dev ? *_dev : NULL;
2853         struct rt6_info *grt = NULL;
2854         int err = -EHOSTUNREACH;
2855
2856         if (cfg->fc_table) {
2857                 int flags = RT6_LOOKUP_F_IFACE;
2858
2859                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2860                                           cfg->fc_table, flags);
2861                 if (grt) {
2862                         if (grt->rt6i_flags & RTF_GATEWAY ||
2863                             (dev && dev != grt->dst.dev)) {
2864                                 ip6_rt_put(grt);
2865                                 grt = NULL;
2866                         }
2867                 }
2868         }
2869
2870         if (!grt)
2871                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2872
2873         if (!grt)
2874                 goto out;
2875
2876         if (dev) {
2877                 if (dev != grt->dst.dev) {
2878                         ip6_rt_put(grt);
2879                         goto out;
2880                 }
2881         } else {
2882                 *_dev = dev = grt->dst.dev;
2883                 *idev = grt->rt6i_idev;
2884                 dev_hold(dev);
2885                 in6_dev_hold(grt->rt6i_idev);
2886         }
2887
2888         if (!(grt->rt6i_flags & RTF_GATEWAY))
2889                 err = 0;
2890
2891         ip6_rt_put(grt);
2892
2893 out:
2894         return err;
2895 }
2896
2897 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2898                            struct net_device **_dev, struct inet6_dev **idev,
2899                            struct netlink_ext_ack *extack)
2900 {
2901         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2902         int gwa_type = ipv6_addr_type(gw_addr);
2903         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2904         const struct net_device *dev = *_dev;
2905         bool need_addr_check = !dev;
2906         int err = -EINVAL;
2907
2908         /* if gw_addr is local we will fail to detect this in case
2909          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2910          * will return already-added prefix route via interface that
2911          * prefix route was assigned to, which might be non-loopback.
2912          */
2913         if (dev &&
2914             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2915                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2916                 goto out;
2917         }
2918
2919         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2920                 /* IPv6 strictly inhibits using not link-local
2921                  * addresses as nexthop address.
2922                  * Otherwise, router will not able to send redirects.
2923                  * It is very good, but in some (rare!) circumstances
2924                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2925                  * some exceptions. --ANK
2926                  * We allow IPv4-mapped nexthops to support RFC4798-type
2927                  * addressing
2928                  */
2929                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2930                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2931                         goto out;
2932                 }
2933
2934                 if (cfg->fc_flags & RTNH_F_ONLINK)
2935                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2936                 else
2937                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2938
2939                 if (err)
2940                         goto out;
2941         }
2942
2943         /* reload in case device was changed */
2944         dev = *_dev;
2945
2946         err = -EINVAL;
2947         if (!dev) {
2948                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2949                 goto out;
2950         } else if (dev->flags & IFF_LOOPBACK) {
2951                 NL_SET_ERR_MSG(extack,
2952                                "Egress device can not be loopback device for this route");
2953                 goto out;
2954         }
2955
2956         /* if we did not check gw_addr above, do so now that the
2957          * egress device has been resolved.
2958          */
2959         if (need_addr_check &&
2960             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2961                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2962                 goto out;
2963         }
2964
2965         err = 0;
2966 out:
2967         return err;
2968 }
2969
2970 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2971 {
2972         if ((flags & RTF_REJECT) ||
2973             (dev && (dev->flags & IFF_LOOPBACK) &&
2974              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2975              !(flags & RTF_LOCAL)))
2976                 return true;
2977
2978         return false;
2979 }
2980
2981 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2982                  struct fib6_config *cfg, gfp_t gfp_flags,
2983                  struct netlink_ext_ack *extack)
2984 {
2985         struct net_device *dev = NULL;
2986         struct inet6_dev *idev = NULL;
2987         int addr_type;
2988         int err;
2989
2990         fib6_nh->fib_nh_family = AF_INET6;
2991
2992         err = -ENODEV;
2993         if (cfg->fc_ifindex) {
2994                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2995                 if (!dev)
2996                         goto out;
2997                 idev = in6_dev_get(dev);
2998                 if (!idev)
2999                         goto out;
3000         }
3001
3002         if (cfg->fc_flags & RTNH_F_ONLINK) {
3003                 if (!dev) {
3004                         NL_SET_ERR_MSG(extack,
3005                                        "Nexthop device required for onlink");
3006                         goto out;
3007                 }
3008
3009                 if (!(dev->flags & IFF_UP)) {
3010                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3011                         err = -ENETDOWN;
3012                         goto out;
3013                 }
3014
3015                 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3016         }
3017
3018         fib6_nh->fib_nh_weight = 1;
3019
3020         /* We cannot add true routes via loopback here,
3021          * they would result in kernel looping; promote them to reject routes
3022          */
3023         addr_type = ipv6_addr_type(&cfg->fc_dst);
3024         if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3025                 /* hold loopback dev/idev if we haven't done so. */
3026                 if (dev != net->loopback_dev) {
3027                         if (dev) {
3028                                 dev_put(dev);
3029                                 in6_dev_put(idev);
3030                         }
3031                         dev = net->loopback_dev;
3032                         dev_hold(dev);
3033                         idev = in6_dev_get(dev);
3034                         if (!idev) {
3035                                 err = -ENODEV;
3036                                 goto out;
3037                         }
3038                 }
3039                 goto set_dev;
3040         }
3041
3042         if (cfg->fc_flags & RTF_GATEWAY) {
3043                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3044                 if (err)
3045                         goto out;
3046
3047                 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3048                 fib6_nh->fib_nh_gw_family = AF_INET6;
3049         }
3050
3051         err = -ENODEV;
3052         if (!dev)
3053                 goto out;
3054
3055         if (idev->cnf.disable_ipv6) {
3056                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3057                 err = -EACCES;
3058                 goto out;
3059         }
3060
3061         if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3062                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3063                 err = -ENETDOWN;
3064                 goto out;
3065         }
3066
3067         if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3068             !netif_carrier_ok(dev))
3069                 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3070
3071         fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
3072         if (!fib6_nh->rt6i_pcpu) {
3073                 err = -ENOMEM;
3074                 goto out;
3075         }
3076
3077         err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3078                                  cfg->fc_encap_type, cfg, gfp_flags, extack);
3079         if (err)
3080                 goto out;
3081 set_dev:
3082         fib6_nh->fib_nh_dev = dev;
3083         fib6_nh->fib_nh_oif = dev->ifindex;
3084         err = 0;
3085 out:
3086         if (idev)
3087                 in6_dev_put(idev);
3088
3089         if (err) {
3090                 lwtstate_put(fib6_nh->fib_nh_lws);
3091                 fib6_nh->fib_nh_lws = NULL;
3092                 if (dev)
3093                         dev_put(dev);
3094         }
3095
3096         return err;
3097 }
3098
3099 void fib6_nh_release(struct fib6_nh *fib6_nh)
3100 {
3101         if (fib6_nh->rt6i_pcpu) {
3102                 int cpu;
3103
3104                 for_each_possible_cpu(cpu) {
3105                         struct rt6_info **ppcpu_rt;
3106                         struct rt6_info *pcpu_rt;
3107
3108                         ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3109                         pcpu_rt = *ppcpu_rt;
3110                         if (pcpu_rt) {
3111                                 dst_dev_put(&pcpu_rt->dst);
3112                                 dst_release(&pcpu_rt->dst);
3113                                 *ppcpu_rt = NULL;
3114                         }
3115                 }
3116
3117                 free_percpu(fib6_nh->rt6i_pcpu);
3118         }
3119
3120         fib_nh_common_release(&fib6_nh->nh_common);
3121 }
3122
3123 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3124                                               gfp_t gfp_flags,
3125                                               struct netlink_ext_ack *extack)
3126 {
3127         struct net *net = cfg->fc_nlinfo.nl_net;
3128         struct fib6_info *rt = NULL;
3129         struct fib6_table *table;
3130         int err = -EINVAL;
3131         int addr_type;
3132
3133         /* RTF_PCPU is an internal flag; can not be set by userspace */
3134         if (cfg->fc_flags & RTF_PCPU) {
3135                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3136                 goto out;
3137         }
3138
3139         /* RTF_CACHE is an internal flag; can not be set by userspace */
3140         if (cfg->fc_flags & RTF_CACHE) {
3141                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3142                 goto out;
3143         }
3144
3145         if (cfg->fc_type > RTN_MAX) {
3146                 NL_SET_ERR_MSG(extack, "Invalid route type");
3147                 goto out;
3148         }
3149
3150         if (cfg->fc_dst_len > 128) {
3151                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3152                 goto out;
3153         }
3154         if (cfg->fc_src_len > 128) {
3155                 NL_SET_ERR_MSG(extack, "Invalid source address length");
3156                 goto out;
3157         }
3158 #ifndef CONFIG_IPV6_SUBTREES
3159         if (cfg->fc_src_len) {
3160                 NL_SET_ERR_MSG(extack,
3161                                "Specifying source address requires IPV6_SUBTREES to be enabled");
3162                 goto out;
3163         }
3164 #endif
3165
3166         err = -ENOBUFS;
3167         if (cfg->fc_nlinfo.nlh &&
3168             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3169                 table = fib6_get_table(net, cfg->fc_table);
3170                 if (!table) {
3171                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3172                         table = fib6_new_table(net, cfg->fc_table);
3173                 }
3174         } else {
3175                 table = fib6_new_table(net, cfg->fc_table);
3176         }
3177
3178         if (!table)
3179                 goto out;
3180
3181         err = -ENOMEM;
3182         rt = fib6_info_alloc(gfp_flags);
3183         if (!rt)
3184                 goto out;
3185
3186         rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3187                                                extack);
3188         if (IS_ERR(rt->fib6_metrics)) {
3189                 err = PTR_ERR(rt->fib6_metrics);
3190                 /* Do not leave garbage there. */
3191                 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3192                 goto out;
3193         }
3194
3195         if (cfg->fc_flags & RTF_ADDRCONF)
3196                 rt->dst_nocount = true;
3197
3198         if (cfg->fc_flags & RTF_EXPIRES)
3199                 fib6_set_expires(rt, jiffies +
3200                                 clock_t_to_jiffies(cfg->fc_expires));
3201         else
3202                 fib6_clean_expires(rt);
3203
3204         if (cfg->fc_protocol == RTPROT_UNSPEC)
3205                 cfg->fc_protocol = RTPROT_BOOT;
3206         rt->fib6_protocol = cfg->fc_protocol;
3207
3208         rt->fib6_table = table;
3209         rt->fib6_metric = cfg->fc_metric;
3210         rt->fib6_type = cfg->fc_type;
3211         rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3212
3213         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3214         rt->fib6_dst.plen = cfg->fc_dst_len;
3215         if (rt->fib6_dst.plen == 128)
3216                 rt->dst_host = true;
3217
3218 #ifdef CONFIG_IPV6_SUBTREES
3219         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3220         rt->fib6_src.plen = cfg->fc_src_len;
3221 #endif
3222         err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3223         if (err)
3224                 goto out;
3225
3226         /* We cannot add true routes via loopback here,
3227          * they would result in kernel looping; promote them to reject routes
3228          */
3229         addr_type = ipv6_addr_type(&cfg->fc_dst);
3230         if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3231                 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3232
3233         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3234                 struct net_device *dev = fib6_info_nh_dev(rt);
3235
3236                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3237                         NL_SET_ERR_MSG(extack, "Invalid source address");
3238                         err = -EINVAL;
3239                         goto out;
3240                 }
3241                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3242                 rt->fib6_prefsrc.plen = 128;
3243         } else
3244                 rt->fib6_prefsrc.plen = 0;
3245
3246         return rt;
3247 out:
3248         fib6_info_release(rt);
3249         return ERR_PTR(err);
3250 }
3251
3252 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3253                   struct netlink_ext_ack *extack)
3254 {
3255         struct fib6_info *rt;
3256         int err;
3257
3258         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3259         if (IS_ERR(rt))
3260                 return PTR_ERR(rt);
3261
3262         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3263         fib6_info_release(rt);
3264
3265         return err;
3266 }
3267
3268 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3269 {
3270         struct net *net = info->nl_net;
3271         struct fib6_table *table;
3272         int err;
3273
3274         if (rt == net->ipv6.fib6_null_entry) {
3275                 err = -ENOENT;
3276                 goto out;
3277         }
3278
3279         table = rt->fib6_table;
3280         spin_lock_bh(&table->tb6_lock);
3281         err = fib6_del(rt, info);
3282         spin_unlock_bh(&table->tb6_lock);
3283
3284 out:
3285         fib6_info_release(rt);
3286         return err;
3287 }
3288
3289 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3290 {
3291         struct nl_info info = { .nl_net = net };
3292
3293         return __ip6_del_rt(rt, &info);
3294 }
3295
3296 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3297 {
3298         struct nl_info *info = &cfg->fc_nlinfo;
3299         struct net *net = info->nl_net;
3300         struct sk_buff *skb = NULL;
3301         struct fib6_table *table;
3302         int err = -ENOENT;
3303
3304         if (rt == net->ipv6.fib6_null_entry)
3305                 goto out_put;
3306         table = rt->fib6_table;
3307         spin_lock_bh(&table->tb6_lock);
3308
3309         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3310                 struct fib6_info *sibling, *next_sibling;
3311
3312                 /* prefer to send a single notification with all hops */
3313                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3314                 if (skb) {
3315                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3316
3317                         if (rt6_fill_node(net, skb, rt, NULL,
3318                                           NULL, NULL, 0, RTM_DELROUTE,
3319                                           info->portid, seq, 0) < 0) {
3320                                 kfree_skb(skb);
3321                                 skb = NULL;
3322                         } else
3323                                 info->skip_notify = 1;
3324                 }
3325
3326                 list_for_each_entry_safe(sibling, next_sibling,
3327                                          &rt->fib6_siblings,
3328                                          fib6_siblings) {
3329                         err = fib6_del(sibling, info);
3330                         if (err)
3331                                 goto out_unlock;
3332                 }
3333         }
3334
3335         err = fib6_del(rt, info);
3336 out_unlock:
3337         spin_unlock_bh(&table->tb6_lock);
3338 out_put:
3339         fib6_info_release(rt);
3340
3341         if (skb) {
3342                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3343                             info->nlh, gfp_any());
3344         }
3345         return err;
3346 }
3347
3348 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3349 {
3350         int rc = -ESRCH;
3351
3352         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3353                 goto out;
3354
3355         if (cfg->fc_flags & RTF_GATEWAY &&
3356             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3357                 goto out;
3358
3359         rc = rt6_remove_exception_rt(rt);
3360 out:
3361         return rc;
3362 }
3363
3364 static int ip6_route_del(struct fib6_config *cfg,
3365                          struct netlink_ext_ack *extack)
3366 {
3367         struct rt6_info *rt_cache;
3368         struct fib6_table *table;
3369         struct fib6_info *rt;
3370         struct fib6_node *fn;
3371         int err = -ESRCH;
3372
3373         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3374         if (!table) {
3375                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3376                 return err;
3377         }
3378
3379         rcu_read_lock();
3380
3381         fn = fib6_locate(&table->tb6_root,
3382                          &cfg->fc_dst, cfg->fc_dst_len,
3383                          &cfg->fc_src, cfg->fc_src_len,
3384                          !(cfg->fc_flags & RTF_CACHE));
3385
3386         if (fn) {
3387                 for_each_fib6_node_rt_rcu(fn) {
3388                         struct fib6_nh *nh;
3389
3390                         if (cfg->fc_flags & RTF_CACHE) {
3391                                 struct fib6_result res = {
3392                                         .f6i = rt,
3393                                 };
3394                                 int rc;
3395
3396                                 rt_cache = rt6_find_cached_rt(&res,
3397                                                               &cfg->fc_dst,
3398                                                               &cfg->fc_src);
3399                                 if (rt_cache) {
3400                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3401                                         if (rc != -ESRCH) {
3402                                                 rcu_read_unlock();
3403                                                 return rc;
3404                                         }
3405                                 }
3406                                 continue;
3407                         }
3408
3409                         nh = &rt->fib6_nh;
3410                         if (cfg->fc_ifindex &&
3411                             (!nh->fib_nh_dev ||
3412                              nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3413                                 continue;
3414                         if (cfg->fc_flags & RTF_GATEWAY &&
3415                             !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3416                                 continue;
3417                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3418                                 continue;
3419                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3420                                 continue;
3421                         if (!fib6_info_hold_safe(rt))
3422                                 continue;
3423                         rcu_read_unlock();
3424
3425                         /* if gateway was specified only delete the one hop */
3426                         if (cfg->fc_flags & RTF_GATEWAY)
3427                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3428
3429                         return __ip6_del_rt_siblings(rt, cfg);
3430                 }
3431         }
3432         rcu_read_unlock();
3433
3434         return err;
3435 }
3436
3437 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3438 {
3439         struct netevent_redirect netevent;
3440         struct rt6_info *rt, *nrt = NULL;
3441         struct fib6_result res = {};
3442         struct ndisc_options ndopts;
3443         struct inet6_dev *in6_dev;
3444         struct neighbour *neigh;
3445         struct rd_msg *msg;
3446         int optlen, on_link;
3447         u8 *lladdr;
3448
3449         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3450         optlen -= sizeof(*msg);
3451
3452         if (optlen < 0) {
3453                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3454                 return;
3455         }
3456
3457         msg = (struct rd_msg *)icmp6_hdr(skb);
3458
3459         if (ipv6_addr_is_multicast(&msg->dest)) {
3460                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3461                 return;
3462         }
3463
3464         on_link = 0;
3465         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3466                 on_link = 1;
3467         } else if (ipv6_addr_type(&msg->target) !=
3468                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3469                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3470                 return;
3471         }
3472
3473         in6_dev = __in6_dev_get(skb->dev);
3474         if (!in6_dev)
3475                 return;
3476         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3477                 return;
3478
3479         /* RFC2461 8.1:
3480          *      The IP source address of the Redirect MUST be the same as the current
3481          *      first-hop router for the specified ICMP Destination Address.
3482          */
3483
3484         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3485                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3486                 return;
3487         }
3488
3489         lladdr = NULL;
3490         if (ndopts.nd_opts_tgt_lladdr) {
3491                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3492                                              skb->dev);
3493                 if (!lladdr) {
3494                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3495                         return;
3496                 }
3497         }
3498
3499         rt = (struct rt6_info *) dst;
3500         if (rt->rt6i_flags & RTF_REJECT) {
3501                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3502                 return;
3503         }
3504
3505         /* Redirect received -> path was valid.
3506          * Look, redirects are sent only in response to data packets,
3507          * so that this nexthop apparently is reachable. --ANK
3508          */
3509         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3510
3511         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3512         if (!neigh)
3513                 return;
3514
3515         /*
3516          *      We have finally decided to accept it.
3517          */
3518
3519         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3520                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3521                      NEIGH_UPDATE_F_OVERRIDE|
3522                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3523                                      NEIGH_UPDATE_F_ISROUTER)),
3524                      NDISC_REDIRECT, &ndopts);
3525
3526         rcu_read_lock();
3527         res.f6i = rcu_dereference(rt->from);
3528         if (!res.f6i)
3529                 goto out;
3530
3531         res.nh = &res.f6i->fib6_nh;
3532         res.fib6_flags = res.f6i->fib6_flags;
3533         res.fib6_type = res.f6i->fib6_type;
3534         nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3535         if (!nrt)
3536                 goto out;
3537
3538         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3539         if (on_link)
3540                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3541
3542         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3543
3544         /* rt6_insert_exception() will take care of duplicated exceptions */
3545         if (rt6_insert_exception(nrt, &res)) {
3546                 dst_release_immediate(&nrt->dst);
3547                 goto out;
3548         }
3549
3550         netevent.old = &rt->dst;
3551         netevent.new = &nrt->dst;
3552         netevent.daddr = &msg->dest;
3553         netevent.neigh = neigh;
3554         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3555
3556 out:
3557         rcu_read_unlock();
3558         neigh_release(neigh);
3559 }
3560
3561 #ifdef CONFIG_IPV6_ROUTE_INFO
3562 static struct fib6_info *rt6_get_route_info(struct net *net,
3563                                            const struct in6_addr *prefix, int prefixlen,
3564                                            const struct in6_addr *gwaddr,
3565                                            struct net_device *dev)
3566 {
3567         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3568         int ifindex = dev->ifindex;
3569         struct fib6_node *fn;
3570         struct fib6_info *rt = NULL;
3571         struct fib6_table *table;
3572
3573         table = fib6_get_table(net, tb_id);
3574         if (!table)
3575                 return NULL;
3576
3577         rcu_read_lock();
3578         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3579         if (!fn)
3580                 goto out;
3581
3582         for_each_fib6_node_rt_rcu(fn) {
3583                 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3584                         continue;
3585                 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3586                     !rt->fib6_nh.fib_nh_gw_family)
3587                         continue;
3588                 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3589                         continue;
3590                 if (!fib6_info_hold_safe(rt))
3591                         continue;
3592                 break;
3593         }
3594 out:
3595         rcu_read_unlock();
3596         return rt;
3597 }
3598
3599 static struct fib6_info *rt6_add_route_info(struct net *net,
3600                                            const struct in6_addr *prefix, int prefixlen,
3601                                            const struct in6_addr *gwaddr,
3602                                            struct net_device *dev,
3603                                            unsigned int pref)
3604 {
3605         struct fib6_config cfg = {
3606                 .fc_metric      = IP6_RT_PRIO_USER,
3607                 .fc_ifindex     = dev->ifindex,
3608                 .fc_dst_len     = prefixlen,
3609                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3610                                   RTF_UP | RTF_PREF(pref),
3611                 .fc_protocol = RTPROT_RA,
3612                 .fc_type = RTN_UNICAST,
3613                 .fc_nlinfo.portid = 0,
3614                 .fc_nlinfo.nlh = NULL,
3615                 .fc_nlinfo.nl_net = net,
3616         };
3617
3618         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3619         cfg.fc_dst = *prefix;
3620         cfg.fc_gateway = *gwaddr;
3621
3622         /* We should treat it as a default route if prefix length is 0. */
3623         if (!prefixlen)
3624                 cfg.fc_flags |= RTF_DEFAULT;
3625
3626         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3627
3628         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3629 }
3630 #endif
3631
3632 struct fib6_info *rt6_get_dflt_router(struct net *net,
3633                                      const struct in6_addr *addr,
3634                                      struct net_device *dev)
3635 {
3636         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3637         struct fib6_info *rt;
3638         struct fib6_table *table;
3639
3640         table = fib6_get_table(net, tb_id);
3641         if (!table)
3642                 return NULL;
3643
3644         rcu_read_lock();
3645         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3646                 struct fib6_nh *nh = &rt->fib6_nh;
3647
3648                 if (dev == nh->fib_nh_dev &&
3649                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3650                     ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3651                         break;
3652         }
3653         if (rt && !fib6_info_hold_safe(rt))
3654                 rt = NULL;
3655         rcu_read_unlock();
3656         return rt;
3657 }
3658
3659 struct fib6_info *rt6_add_dflt_router(struct net *net,
3660                                      const struct in6_addr *gwaddr,
3661                                      struct net_device *dev,
3662                                      unsigned int pref)
3663 {
3664         struct fib6_config cfg = {
3665                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3666                 .fc_metric      = IP6_RT_PRIO_USER,
3667                 .fc_ifindex     = dev->ifindex,
3668                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3669                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3670                 .fc_protocol = RTPROT_RA,
3671                 .fc_type = RTN_UNICAST,
3672                 .fc_nlinfo.portid = 0,
3673                 .fc_nlinfo.nlh = NULL,
3674                 .fc_nlinfo.nl_net = net,
3675         };
3676
3677         cfg.fc_gateway = *gwaddr;
3678
3679         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3680                 struct fib6_table *table;
3681
3682                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3683                 if (table)
3684                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3685         }
3686
3687         return rt6_get_dflt_router(net, gwaddr, dev);
3688 }
3689
3690 static void __rt6_purge_dflt_routers(struct net *net,
3691                                      struct fib6_table *table)
3692 {
3693         struct fib6_info *rt;
3694
3695 restart:
3696         rcu_read_lock();
3697         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3698                 struct net_device *dev = fib6_info_nh_dev(rt);
3699                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3700
3701                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3702                     (!idev || idev->cnf.accept_ra != 2) &&
3703                     fib6_info_hold_safe(rt)) {
3704                         rcu_read_unlock();
3705                         ip6_del_rt(net, rt);
3706                         goto restart;
3707                 }
3708         }
3709         rcu_read_unlock();
3710
3711         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3712 }
3713
3714 void rt6_purge_dflt_routers(struct net *net)
3715 {
3716         struct fib6_table *table;
3717         struct hlist_head *head;
3718         unsigned int h;
3719
3720         rcu_read_lock();
3721
3722         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3723                 head = &net->ipv6.fib_table_hash[h];
3724                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3725                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3726                                 __rt6_purge_dflt_routers(net, table);
3727                 }
3728         }
3729
3730         rcu_read_unlock();
3731 }
3732
3733 static void rtmsg_to_fib6_config(struct net *net,
3734                                  struct in6_rtmsg *rtmsg,
3735                                  struct fib6_config *cfg)
3736 {
3737         *cfg = (struct fib6_config){
3738                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3739                          : RT6_TABLE_MAIN,
3740                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3741                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3742                 .fc_expires = rtmsg->rtmsg_info,
3743                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3744                 .fc_src_len = rtmsg->rtmsg_src_len,
3745                 .fc_flags = rtmsg->rtmsg_flags,
3746                 .fc_type = rtmsg->rtmsg_type,
3747
3748                 .fc_nlinfo.nl_net = net,
3749
3750                 .fc_dst = rtmsg->rtmsg_dst,
3751                 .fc_src = rtmsg->rtmsg_src,
3752                 .fc_gateway = rtmsg->rtmsg_gateway,
3753         };
3754 }
3755
3756 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3757 {
3758         struct fib6_config cfg;
3759         struct in6_rtmsg rtmsg;
3760         int err;
3761
3762         switch (cmd) {
3763         case SIOCADDRT:         /* Add a route */
3764         case SIOCDELRT:         /* Delete a route */
3765                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3766                         return -EPERM;
3767                 err = copy_from_user(&rtmsg, arg,
3768                                      sizeof(struct in6_rtmsg));
3769                 if (err)
3770                         return -EFAULT;
3771
3772                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3773
3774                 rtnl_lock();
3775                 switch (cmd) {
3776                 case SIOCADDRT:
3777                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3778                         break;
3779                 case SIOCDELRT:
3780                         err = ip6_route_del(&cfg, NULL);
3781                         break;
3782                 default:
3783                         err = -EINVAL;
3784                 }
3785                 rtnl_unlock();
3786
3787                 return err;
3788         }
3789
3790         return -EINVAL;
3791 }
3792
3793 /*
3794  *      Drop the packet on the floor
3795  */
3796
3797 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3798 {
3799         struct dst_entry *dst = skb_dst(skb);
3800         struct net *net = dev_net(dst->dev);
3801         struct inet6_dev *idev;
3802         int type;
3803
3804         if (netif_is_l3_master(skb->dev) &&
3805             dst->dev == net->loopback_dev)
3806                 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3807         else
3808                 idev = ip6_dst_idev(dst);
3809
3810         switch (ipstats_mib_noroutes) {
3811         case IPSTATS_MIB_INNOROUTES:
3812                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3813                 if (type == IPV6_ADDR_ANY) {
3814                         IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3815                         break;
3816                 }
3817                 /* FALLTHROUGH */
3818         case IPSTATS_MIB_OUTNOROUTES:
3819                 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3820                 break;
3821         }
3822
3823         /* Start over by dropping the dst for l3mdev case */
3824         if (netif_is_l3_master(skb->dev))
3825                 skb_dst_drop(skb);
3826
3827         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3828         kfree_skb(skb);
3829         return 0;
3830 }
3831
/* Input-path drop: ICMPv6 "no route" plus the INNOROUTES counter. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3836
3837 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3838 {
3839         skb->dev = skb_dst(skb)->dev;
3840         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3841 }
3842
/* Input-path drop: ICMPv6 "administratively prohibited" plus the
 * INNOROUTES counter.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3847
3848 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3849 {
3850         skb->dev = skb_dst(skb)->dev;
3851         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3852 }
3853
3854 /*
3855  *      Allocate a dst for local (unicast / anycast) address.
3856  */
3857
3858 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3859                                      struct inet6_dev *idev,
3860                                      const struct in6_addr *addr,
3861                                      bool anycast, gfp_t gfp_flags)
3862 {
3863         struct fib6_config cfg = {
3864                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3865                 .fc_ifindex = idev->dev->ifindex,
3866                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3867                 .fc_dst = *addr,
3868                 .fc_dst_len = 128,
3869                 .fc_protocol = RTPROT_KERNEL,
3870                 .fc_nlinfo.nl_net = net,
3871                 .fc_ignore_dev_down = true,
3872         };
3873
3874         if (anycast) {
3875                 cfg.fc_type = RTN_ANYCAST;
3876                 cfg.fc_flags |= RTF_ANYCAST;
3877         } else {
3878                 cfg.fc_type = RTN_LOCAL;
3879                 cfg.fc_flags |= RTF_LOCAL;
3880         }
3881
3882         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3883 }
3884
/* remove deleted ip from prefsrc entries */

/* Argument bundle handed to fib6_remove_prefsrc() via fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace, used to recognise the null entry */
	struct in6_addr *addr;		/* address compared against fib6_prefsrc.addr */
};
3891
3892 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3893 {
3894         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3895         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3896         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3897
3898         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3899             rt != net->ipv6.fib6_null_entry &&
3900             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3901                 spin_lock_bh(&rt6_exception_lock);
3902                 /* remove prefsrc entry */
3903                 rt->fib6_prefsrc.plen = 0;
3904                 spin_unlock_bh(&rt6_exception_lock);
3905         }
3906         return 0;
3907 }
3908
3909 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3910 {
3911         struct net *net = dev_net(ifp->idev->dev);
3912         struct arg_dev_net_ip adni = {
3913                 .dev = ifp->idev->dev,
3914                 .net = net,
3915                 .addr = &ifp->addr,
3916         };
3917         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3918 }
3919
3920 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
3921
3922 /* Remove routers and update dst entries when gateway turn into host. */
3923 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3924 {
3925         struct in6_addr *gateway = (struct in6_addr *)arg;
3926
3927         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3928             rt->fib6_nh.fib_nh_gw_family &&
3929             ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3930                 return -1;
3931         }
3932
3933         /* Further clean up cached routes in exception table.
3934          * This is needed because cached route may have a different
3935          * gateway than its 'parent' in the case of an ip redirect.
3936          */
3937         rt6_exceptions_clean_tohost(rt, gateway);
3938
3939         return 0;
3940 }
3941
/* Run fib6_clean_tohost() over every route of @net for @gateway. */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3946
/* Argument passed to the fib6_ifup() / fib6_ifdown() tree walkers. */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned char nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event */
	};
};
3954
3955 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3956 {
3957         struct fib6_info *iter;
3958         struct fib6_node *fn;
3959
3960         fn = rcu_dereference_protected(rt->fib6_node,
3961                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3962         iter = rcu_dereference_protected(fn->leaf,
3963                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3964         while (iter) {
3965                 if (iter->fib6_metric == rt->fib6_metric &&
3966                     rt6_qualify_for_ecmp(iter))
3967                         return iter;
3968                 iter = rcu_dereference_protected(iter->fib6_next,
3969                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3970         }
3971
3972         return NULL;
3973 }
3974
3975 static bool rt6_is_dead(const struct fib6_info *rt)
3976 {
3977         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3978             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3979              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3980                 return true;
3981
3982         return false;
3983 }
3984
3985 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3986 {
3987         struct fib6_info *iter;
3988         int total = 0;
3989
3990         if (!rt6_is_dead(rt))
3991                 total += rt->fib6_nh.fib_nh_weight;
3992
3993         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3994                 if (!rt6_is_dead(iter))
3995                         total += iter->fib6_nh.fib_nh_weight;
3996         }
3997
3998         return total;
3999 }
4000
4001 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
4002 {
4003         int upper_bound = -1;
4004
4005         if (!rt6_is_dead(rt)) {
4006                 *weight += rt->fib6_nh.fib_nh_weight;
4007                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
4008                                                     total) - 1;
4009         }
4010         atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
4011 }
4012
4013 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
4014 {
4015         struct fib6_info *iter;
4016         int weight = 0;
4017
4018         rt6_upper_bound_set(rt, &weight, total);
4019
4020         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4021                 rt6_upper_bound_set(iter, &weight, total);
4022 }
4023
4024 void rt6_multipath_rebalance(struct fib6_info *rt)
4025 {
4026         struct fib6_info *first;
4027         int total;
4028
4029         /* In case the entire multipath route was marked for flushing,
4030          * then there is no need to rebalance upon the removal of every
4031          * sibling route.
4032          */
4033         if (!rt->fib6_nsiblings || rt->should_flush)
4034                 return;
4035
4036         /* During lookup routes are evaluated in order, so we need to
4037          * make sure upper bounds are assigned from the first sibling
4038          * onwards.
4039          */
4040         first = rt6_multipath_first_sibling(rt);
4041         if (WARN_ON_ONCE(!first))
4042                 return;
4043
4044         total = rt6_multipath_total_weight(first);
4045         rt6_multipath_upper_bound_set(first, total);
4046 }
4047
4048 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4049 {
4050         const struct arg_netdev_event *arg = p_arg;
4051         struct net *net = dev_net(arg->dev);
4052
4053         if (rt != net->ipv6.fib6_null_entry &&
4054             rt->fib6_nh.fib_nh_dev == arg->dev) {
4055                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4056                 fib6_update_sernum_upto_root(net, rt);
4057                 rt6_multipath_rebalance(rt);
4058         }
4059
4060         return 0;
4061 }
4062
4063 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4064 {
4065         struct arg_netdev_event arg = {
4066                 .dev = dev,
4067                 {
4068                         .nh_flags = nh_flags,
4069                 },
4070         };
4071
4072         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4073                 arg.nh_flags |= RTNH_F_LINKDOWN;
4074
4075         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4076 }
4077
4078 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4079                                    const struct net_device *dev)
4080 {
4081         struct fib6_info *iter;
4082
4083         if (rt->fib6_nh.fib_nh_dev == dev)
4084                 return true;
4085         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4086                 if (iter->fib6_nh.fib_nh_dev == dev)
4087                         return true;
4088
4089         return false;
4090 }
4091
4092 static void rt6_multipath_flush(struct fib6_info *rt)
4093 {
4094         struct fib6_info *iter;
4095
4096         rt->should_flush = 1;
4097         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4098                 iter->should_flush = 1;
4099 }
4100
4101 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4102                                              const struct net_device *down_dev)
4103 {
4104         struct fib6_info *iter;
4105         unsigned int dead = 0;
4106
4107         if (rt->fib6_nh.fib_nh_dev == down_dev ||
4108             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4109                 dead++;
4110         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4111                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4112                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4113                         dead++;
4114
4115         return dead;
4116 }
4117
4118 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4119                                        const struct net_device *dev,
4120                                        unsigned char nh_flags)
4121 {
4122         struct fib6_info *iter;
4123
4124         if (rt->fib6_nh.fib_nh_dev == dev)
4125                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4126         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4127                 if (iter->fib6_nh.fib_nh_dev == dev)
4128                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4129 }
4130
4131 /* called with write lock held for table with rt */
4132 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4133 {
4134         const struct arg_netdev_event *arg = p_arg;
4135         const struct net_device *dev = arg->dev;
4136         struct net *net = dev_net(dev);
4137
4138         if (rt == net->ipv6.fib6_null_entry)
4139                 return 0;
4140
4141         switch (arg->event) {
4142         case NETDEV_UNREGISTER:
4143                 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4144         case NETDEV_DOWN:
4145                 if (rt->should_flush)
4146                         return -1;
4147                 if (!rt->fib6_nsiblings)
4148                         return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4149                 if (rt6_multipath_uses_dev(rt, dev)) {
4150                         unsigned int count;
4151
4152                         count = rt6_multipath_dead_count(rt, dev);
4153                         if (rt->fib6_nsiblings + 1 == count) {
4154                                 rt6_multipath_flush(rt);
4155                                 return -1;
4156                         }
4157                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4158                                                    RTNH_F_LINKDOWN);
4159                         fib6_update_sernum(net, rt);
4160                         rt6_multipath_rebalance(rt);
4161                 }
4162                 return -2;
4163         case NETDEV_CHANGE:
4164                 if (rt->fib6_nh.fib_nh_dev != dev ||
4165                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4166                         break;
4167                 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4168                 rt6_multipath_rebalance(rt);
4169                 break;
4170         }
4171
4172         return 0;
4173 }
4174
4175 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4176 {
4177         struct arg_netdev_event arg = {
4178                 .dev = dev,
4179                 {
4180                         .event = event,
4181                 },
4182         };
4183         struct net *net = dev_net(dev);
4184
4185         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4186                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4187         else
4188                 fib6_clean_all(net, fib6_ifdown, &arg);
4189 }
4190
4191 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4192 {
4193         rt6_sync_down_dev(dev, event);
4194         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4195         neigh_ifdown(&nd_tbl, dev);
4196 }
4197
/* Argument passed to rt6_mtu_change_route() via fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4202
4203 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4204 {
4205         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4206         struct inet6_dev *idev;
4207
4208         /* In IPv6 pmtu discovery is not optional,
4209            so that RTAX_MTU lock cannot disable it.
4210            We still use this lock to block changes
4211            caused by addrconf/ndisc.
4212         */
4213
4214         idev = __in6_dev_get(arg->dev);
4215         if (!idev)
4216                 return 0;
4217
4218         /* For administrative MTU increase, there is no way to discover
4219            IPv6 PMTU increase, so PMTU increase should be updated here.
4220            Since RFC 1981 doesn't include administrative MTU increase
4221            update PMTU increase is a MUST. (i.e. jumbo frame)
4222          */
4223         if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4224             !fib6_metric_locked(rt, RTAX_MTU)) {
4225                 u32 mtu = rt->fib6_pmtu;
4226
4227                 if (mtu >= arg->mtu ||
4228                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4229                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4230
4231                 spin_lock_bh(&rt6_exception_lock);
4232                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4233                 spin_unlock_bh(&rt6_exception_lock);
4234         }
4235         return 0;
4236 }
4237
4238 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4239 {
4240         struct rt6_mtu_change_arg arg = {
4241                 .dev = dev,
4242                 .mtu = mtu,
4243         };
4244
4245         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4246 }
4247
/* Netlink attribute policy used when parsing IPv6 route requests (see
 * rtm_to_fib6_config() and inet6_rtm_valid_getroute_req()).
 *
 * RTA_DST and RTA_SRC have no entry here; rtm_to_fib6_config() checks their
 * length by hand against the prefix length from the rtmsg header.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	/* attribute types from RTA_DPORT + 1 onwards are flagged via
	 * strict_start_type (see struct nla_policy for the exact semantics)
	 */
	[RTA_UNSPEC]            = { .strict_start_type = RTA_DPORT + 1 },
	/* addresses: sized as a full IPv6 address */
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	/* sized as one struct rtnexthop header */
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
	[RTA_TABLE]             = { .type = NLA_U32 },
	[RTA_IP_PROTO]          = { .type = NLA_U8 },
	[RTA_SPORT]             = { .type = NLA_U16 },
	[RTA_DPORT]             = { .type = NLA_U16 },
};
4268
4269 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4270                               struct fib6_config *cfg,
4271                               struct netlink_ext_ack *extack)
4272 {
4273         struct rtmsg *rtm;
4274         struct nlattr *tb[RTA_MAX+1];
4275         unsigned int pref;
4276         int err;
4277
4278         err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4279                                      rtm_ipv6_policy, extack);
4280         if (err < 0)
4281                 goto errout;
4282
4283         err = -EINVAL;
4284         rtm = nlmsg_data(nlh);
4285
4286         *cfg = (struct fib6_config){
4287                 .fc_table = rtm->rtm_table,
4288                 .fc_dst_len = rtm->rtm_dst_len,
4289                 .fc_src_len = rtm->rtm_src_len,
4290                 .fc_flags = RTF_UP,
4291                 .fc_protocol = rtm->rtm_protocol,
4292                 .fc_type = rtm->rtm_type,
4293
4294                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4295                 .fc_nlinfo.nlh = nlh,
4296                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4297         };
4298
4299         if (rtm->rtm_type == RTN_UNREACHABLE ||
4300             rtm->rtm_type == RTN_BLACKHOLE ||
4301             rtm->rtm_type == RTN_PROHIBIT ||
4302             rtm->rtm_type == RTN_THROW)
4303                 cfg->fc_flags |= RTF_REJECT;
4304
4305         if (rtm->rtm_type == RTN_LOCAL)
4306                 cfg->fc_flags |= RTF_LOCAL;
4307
4308         if (rtm->rtm_flags & RTM_F_CLONED)
4309                 cfg->fc_flags |= RTF_CACHE;
4310
4311         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4312
4313         if (tb[RTA_GATEWAY]) {
4314                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4315                 cfg->fc_flags |= RTF_GATEWAY;
4316         }
4317         if (tb[RTA_VIA]) {
4318                 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4319                 goto errout;
4320         }
4321
4322         if (tb[RTA_DST]) {
4323                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4324
4325                 if (nla_len(tb[RTA_DST]) < plen)
4326                         goto errout;
4327
4328                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4329         }
4330
4331         if (tb[RTA_SRC]) {
4332                 int plen = (rtm->rtm_src_len + 7) >> 3;
4333
4334                 if (nla_len(tb[RTA_SRC]) < plen)
4335                         goto errout;
4336
4337                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4338         }
4339
4340         if (tb[RTA_PREFSRC])
4341                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4342
4343         if (tb[RTA_OIF])
4344                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4345
4346         if (tb[RTA_PRIORITY])
4347                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4348
4349         if (tb[RTA_METRICS]) {
4350                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4351                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4352         }
4353
4354         if (tb[RTA_TABLE])
4355                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4356
4357         if (tb[RTA_MULTIPATH]) {
4358                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4359                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4360
4361                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4362                                                      cfg->fc_mp_len, extack);
4363                 if (err < 0)
4364                         goto errout;
4365         }
4366
4367         if (tb[RTA_PREF]) {
4368                 pref = nla_get_u8(tb[RTA_PREF]);
4369                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4370                     pref != ICMPV6_ROUTER_PREF_HIGH)
4371                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4372                 cfg->fc_flags |= RTF_PREF(pref);
4373         }
4374
4375         if (tb[RTA_ENCAP])
4376                 cfg->fc_encap = tb[RTA_ENCAP];
4377
4378         if (tb[RTA_ENCAP_TYPE]) {
4379                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4380
4381                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4382                 if (err < 0)
4383                         goto errout;
4384         }
4385
4386         if (tb[RTA_EXPIRES]) {
4387                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4388
4389                 if (addrconf_finite_timeout(timeout)) {
4390                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4391                         cfg->fc_flags |= RTF_EXPIRES;
4392                 }
4393         }
4394
4395         err = 0;
4396 errout:
4397         return err;
4398 }
4399
/* One parsed nexthop of a multipath request, queued on the local list built
 * by ip6_route_multipath_add().
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* config it was created from; reused to
					 * delete the route if the add is rolled back
					 */
	struct list_head next;		/* linkage on the nexthop list */
};
4405
4406 static int ip6_route_info_append(struct net *net,
4407                                  struct list_head *rt6_nh_list,
4408                                  struct fib6_info *rt,
4409                                  struct fib6_config *r_cfg)
4410 {
4411         struct rt6_nh *nh;
4412         int err = -EEXIST;
4413
4414         list_for_each_entry(nh, rt6_nh_list, next) {
4415                 /* check if fib6_info already exists */
4416                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4417                         return err;
4418         }
4419
4420         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4421         if (!nh)
4422                 return -ENOMEM;
4423         nh->fib6_info = rt;
4424         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4425         list_add_tail(&nh->next, rt6_nh_list);
4426
4427         return 0;
4428 }
4429
4430 static void ip6_route_mpath_notify(struct fib6_info *rt,
4431                                    struct fib6_info *rt_last,
4432                                    struct nl_info *info,
4433                                    __u16 nlflags)
4434 {
4435         /* if this is an APPEND route, then rt points to the first route
4436          * inserted and rt_last points to last route inserted. Userspace
4437          * wants a consistent dump of the route which starts at the first
4438          * nexthop. Since sibling routes are always added at the end of
4439          * the list, find the first sibling of the last route appended
4440          */
4441         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4442                 rt = list_first_entry(&rt_last->fib6_siblings,
4443                                       struct fib6_info,
4444                                       fib6_siblings);
4445         }
4446
4447         if (rt)
4448                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4449 }
4450
4451 static int ip6_route_multipath_add(struct fib6_config *cfg,
4452                                    struct netlink_ext_ack *extack)
4453 {
4454         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4455         struct nl_info *info = &cfg->fc_nlinfo;
4456         struct fib6_config r_cfg;
4457         struct rtnexthop *rtnh;
4458         struct fib6_info *rt;
4459         struct rt6_nh *err_nh;
4460         struct rt6_nh *nh, *nh_safe;
4461         __u16 nlflags;
4462         int remaining;
4463         int attrlen;
4464         int err = 1;
4465         int nhn = 0;
4466         int replace = (cfg->fc_nlinfo.nlh &&
4467                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4468         LIST_HEAD(rt6_nh_list);
4469
4470         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4471         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4472                 nlflags |= NLM_F_APPEND;
4473
4474         remaining = cfg->fc_mp_len;
4475         rtnh = (struct rtnexthop *)cfg->fc_mp;
4476
4477         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4478          * fib6_info structs per nexthop
4479          */
4480         while (rtnh_ok(rtnh, remaining)) {
4481                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4482                 if (rtnh->rtnh_ifindex)
4483                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4484
4485                 attrlen = rtnh_attrlen(rtnh);
4486                 if (attrlen > 0) {
4487                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4488
4489                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4490                         if (nla) {
4491                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4492                                 r_cfg.fc_flags |= RTF_GATEWAY;
4493                         }
4494                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4495                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4496                         if (nla)
4497                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4498                 }
4499
4500                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4501                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4502                 if (IS_ERR(rt)) {
4503                         err = PTR_ERR(rt);
4504                         rt = NULL;
4505                         goto cleanup;
4506                 }
4507                 if (!rt6_qualify_for_ecmp(rt)) {
4508                         err = -EINVAL;
4509                         NL_SET_ERR_MSG(extack,
4510                                        "Device only routes can not be added for IPv6 using the multipath API.");
4511                         fib6_info_release(rt);
4512                         goto cleanup;
4513                 }
4514
4515                 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4516
4517                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4518                                             rt, &r_cfg);
4519                 if (err) {
4520                         fib6_info_release(rt);
4521                         goto cleanup;
4522                 }
4523
4524                 rtnh = rtnh_next(rtnh, &remaining);
4525         }
4526
4527         /* for add and replace send one notification with all nexthops.
4528          * Skip the notification in fib6_add_rt2node and send one with
4529          * the full route when done
4530          */
4531         info->skip_notify = 1;
4532
4533         err_nh = NULL;
4534         list_for_each_entry(nh, &rt6_nh_list, next) {
4535                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4536                 fib6_info_release(nh->fib6_info);
4537
4538                 if (!err) {
4539                         /* save reference to last route successfully inserted */
4540                         rt_last = nh->fib6_info;
4541
4542                         /* save reference to first route for notification */
4543                         if (!rt_notif)
4544                                 rt_notif = nh->fib6_info;
4545                 }
4546
4547                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4548                 nh->fib6_info = NULL;
4549                 if (err) {
4550                         if (replace && nhn)
4551                                 NL_SET_ERR_MSG_MOD(extack,
4552                                                    "multipath route replace failed (check consistency of installed routes)");
4553                         err_nh = nh;
4554                         goto add_errout;
4555                 }
4556
4557                 /* Because each route is added like a single route we remove
4558                  * these flags after the first nexthop: if there is a collision,
4559                  * we have already failed to add the first nexthop:
4560                  * fib6_add_rt2node() has rejected it; when replacing, old
4561                  * nexthops have been replaced by first new, the rest should
4562                  * be added to it.
4563                  */
4564                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4565                                                      NLM_F_REPLACE);
4566                 nhn++;
4567         }
4568
4569         /* success ... tell user about new route */
4570         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4571         goto cleanup;
4572
4573 add_errout:
4574         /* send notification for routes that were added so that
4575          * the delete notifications sent by ip6_route_del are
4576          * coherent
4577          */
4578         if (rt_notif)
4579                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4580
4581         /* Delete routes that were already added */
4582         list_for_each_entry(nh, &rt6_nh_list, next) {
4583                 if (err_nh == nh)
4584                         break;
4585                 ip6_route_del(&nh->r_cfg, extack);
4586         }
4587
4588 cleanup:
4589         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4590                 if (nh->fib6_info)
4591                         fib6_info_release(nh->fib6_info);
4592                 list_del(&nh->next);
4593                 kfree(nh);
4594         }
4595
4596         return err;
4597 }
4598
4599 static int ip6_route_multipath_del(struct fib6_config *cfg,
4600                                    struct netlink_ext_ack *extack)
4601 {
4602         struct fib6_config r_cfg;
4603         struct rtnexthop *rtnh;
4604         int remaining;
4605         int attrlen;
4606         int err = 1, last_err = 0;
4607
4608         remaining = cfg->fc_mp_len;
4609         rtnh = (struct rtnexthop *)cfg->fc_mp;
4610
4611         /* Parse a Multipath Entry */
4612         while (rtnh_ok(rtnh, remaining)) {
4613                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4614                 if (rtnh->rtnh_ifindex)
4615                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4616
4617                 attrlen = rtnh_attrlen(rtnh);
4618                 if (attrlen > 0) {
4619                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4620
4621                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4622                         if (nla) {
4623                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4624                                 r_cfg.fc_flags |= RTF_GATEWAY;
4625                         }
4626                 }
4627                 err = ip6_route_del(&r_cfg, extack);
4628                 if (err)
4629                         last_err = err;
4630
4631                 rtnh = rtnh_next(rtnh, &remaining);
4632         }
4633
4634         return last_err;
4635 }
4636
4637 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4638                               struct netlink_ext_ack *extack)
4639 {
4640         struct fib6_config cfg;
4641         int err;
4642
4643         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4644         if (err < 0)
4645                 return err;
4646
4647         if (cfg.fc_mp)
4648                 return ip6_route_multipath_del(&cfg, extack);
4649         else {
4650                 cfg.fc_delete_all_nh = 1;
4651                 return ip6_route_del(&cfg, extack);
4652         }
4653 }
4654
4655 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4656                               struct netlink_ext_ack *extack)
4657 {
4658         struct fib6_config cfg;
4659         int err;
4660
4661         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4662         if (err < 0)
4663                 return err;
4664
4665         if (cfg.fc_metric == 0)
4666                 cfg.fc_metric = IP6_RT_PRIO_USER;
4667
4668         if (cfg.fc_mp)
4669                 return ip6_route_multipath_add(&cfg, extack);
4670         else
4671                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4672 }
4673
4674 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4675 {
4676         int nexthop_len = 0;
4677
4678         if (rt->fib6_nsiblings) {
4679                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4680                             + NLA_ALIGN(sizeof(struct rtnexthop))
4681                             + nla_total_size(16) /* RTA_GATEWAY */
4682                             + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4683
4684                 nexthop_len *= rt->fib6_nsiblings;
4685         }
4686
4687         return NLMSG_ALIGN(sizeof(struct rtmsg))
4688                + nla_total_size(16) /* RTA_SRC */
4689                + nla_total_size(16) /* RTA_DST */
4690                + nla_total_size(16) /* RTA_GATEWAY */
4691                + nla_total_size(16) /* RTA_PREFSRC */
4692                + nla_total_size(4) /* RTA_TABLE */
4693                + nla_total_size(4) /* RTA_IIF */
4694                + nla_total_size(4) /* RTA_OIF */
4695                + nla_total_size(4) /* RTA_PRIORITY */
4696                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4697                + nla_total_size(sizeof(struct rta_cacheinfo))
4698                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4699                + nla_total_size(1) /* RTA_PREF */
4700                + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4701                + nexthop_len;
4702 }
4703
/* Append one route message describing @rt to @skb.
 *
 * @net:    network namespace, passed on to the mroute and source-address helpers
 * @skb:    message buffer being filled
 * @rt:     FIB entry to describe
 * @dst:    optional dst entry; when non-NULL the destination/source keys,
 *          flags, metrics, gateway, output device, expiry and error are
 *          taken from it instead of from @rt
 * @dest:   optional destination address; when set it is reported as RTA_DST
 *          with a prefix length of 128
 * @src:    optional source address, same treatment as @dest (only with
 *          CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index, 0 if none
 * @type:   netlink message type
 * @portid: netlink port id for the message header
 * @seq:    netlink sequence number for the message header
 * @flags:  netlink message flags
 *
 * Returns 0 on success.  Returns -EMSGSIZE if the header or an attribute
 * does not fit; in the attribute case the partially built message is
 * cancelled first.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* keys and flags come from the dst entry when one was supplied,
	 * otherwise from the FIB entry
	 */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* the header field only carries ids below 256; the full id always
	 * goes into RTA_TABLE
	 */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit @dest overrides the route's own prefix */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			/* 0: return right away; negative: fail; a positive
			 * value falls through without adding RTA_IIF
			 */
			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			/* NOTE: this statement is the body of the "else"
			 * above when CONFIG_IPV6_MROUTE is enabled
			 */
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		/* no input interface: report the source address selected for
		 * @dest, if ip6_route_get_saddr() succeeds
		 */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* the route's configured preferred source, if any */
	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* Nexthop information, one of three forms:
	 *  - a dst entry was given: its gateway (if RTF_GATEWAY) and device;
	 *  - multipath route: walk the siblings list and add each as a
	 *    nexthop within RTA_MULTIPATH, starting with @rt itself;
	 *  - single nexthop: nexthop attributes directly in the message.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		unsigned char nh_flags = 0;

		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	/* expiry is reported relative to the current jiffies */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4861
4862 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4863                                const struct net_device *dev)
4864 {
4865         if (f6i->fib6_nh.fib_nh_dev == dev)
4866                 return true;
4867
4868         if (f6i->fib6_nsiblings) {
4869                 struct fib6_info *sibling, *next_sibling;
4870
4871                 list_for_each_entry_safe(sibling, next_sibling,
4872                                          &f6i->fib6_siblings, fib6_siblings) {
4873                         if (sibling->fib6_nh.fib_nh_dev == dev)
4874                                 return true;
4875                 }
4876         }
4877
4878         return false;
4879 }
4880
4881 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4882 {
4883         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4884         struct fib_dump_filter *filter = &arg->filter;
4885         unsigned int flags = NLM_F_MULTI;
4886         struct net *net = arg->net;
4887
4888         if (rt == net->ipv6.fib6_null_entry)
4889                 return 0;
4890
4891         if ((filter->flags & RTM_F_PREFIX) &&
4892             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4893                 /* success since this is not a prefix route */
4894                 return 1;
4895         }
4896         if (filter->filter_set) {
4897                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4898                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4899                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4900                         return 1;
4901                 }
4902                 flags |= NLM_F_DUMP_FILTERED;
4903         }
4904
4905         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4906                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4907                              arg->cb->nlh->nlmsg_seq, flags);
4908 }
4909
4910 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4911                                         const struct nlmsghdr *nlh,
4912                                         struct nlattr **tb,
4913                                         struct netlink_ext_ack *extack)
4914 {
4915         struct rtmsg *rtm;
4916         int i, err;
4917
4918         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4919                 NL_SET_ERR_MSG_MOD(extack,
4920                                    "Invalid header for get route request");
4921                 return -EINVAL;
4922         }
4923
4924         if (!netlink_strict_get_check(skb))
4925                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4926                                               rtm_ipv6_policy, extack);
4927
4928         rtm = nlmsg_data(nlh);
4929         if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4930             (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4931             rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4932             rtm->rtm_type) {
4933                 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4934                 return -EINVAL;
4935         }
4936         if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4937                 NL_SET_ERR_MSG_MOD(extack,
4938                                    "Invalid flags for get route request");
4939                 return -EINVAL;
4940         }
4941
4942         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4943                                             rtm_ipv6_policy, extack);
4944         if (err)
4945                 return err;
4946
4947         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4948             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4949                 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4950                 return -EINVAL;
4951         }
4952
4953         for (i = 0; i <= RTA_MAX; i++) {
4954                 if (!tb[i])
4955                         continue;
4956
4957                 switch (i) {
4958                 case RTA_SRC:
4959                 case RTA_DST:
4960                 case RTA_IIF:
4961                 case RTA_OIF:
4962                 case RTA_MARK:
4963                 case RTA_UID:
4964                 case RTA_SPORT:
4965                 case RTA_DPORT:
4966                 case RTA_IP_PROTO:
4967                         break;
4968                 default:
4969                         NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4970                         return -EINVAL;
4971                 }
4972         }
4973
4974         return 0;
4975 }
4976
4977 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4978                               struct netlink_ext_ack *extack)
4979 {
4980         struct net *net = sock_net(in_skb->sk);
4981         struct nlattr *tb[RTA_MAX+1];
4982         int err, iif = 0, oif = 0;
4983         struct fib6_info *from;
4984         struct dst_entry *dst;
4985         struct rt6_info *rt;
4986         struct sk_buff *skb;
4987         struct rtmsg *rtm;
4988         struct flowi6 fl6 = {};
4989         bool fibmatch;
4990
4991         err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4992         if (err < 0)
4993                 goto errout;
4994
4995         err = -EINVAL;
4996         rtm = nlmsg_data(nlh);
4997         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4998         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4999
5000         if (tb[RTA_SRC]) {
5001                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
5002                         goto errout;
5003
5004                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
5005         }
5006
5007         if (tb[RTA_DST]) {
5008                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
5009                         goto errout;
5010
5011                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
5012         }
5013
5014         if (tb[RTA_IIF])
5015                 iif = nla_get_u32(tb[RTA_IIF]);
5016
5017         if (tb[RTA_OIF])
5018                 oif = nla_get_u32(tb[RTA_OIF]);
5019
5020         if (tb[RTA_MARK])
5021                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
5022
5023         if (tb[RTA_UID])
5024                 fl6.flowi6_uid = make_kuid(current_user_ns(),
5025                                            nla_get_u32(tb[RTA_UID]));
5026         else
5027                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5028
5029         if (tb[RTA_SPORT])
5030                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5031
5032         if (tb[RTA_DPORT])
5033                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5034
5035         if (tb[RTA_IP_PROTO]) {
5036                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5037                                                   &fl6.flowi6_proto, AF_INET6,
5038                                                   extack);
5039                 if (err)
5040                         goto errout;
5041         }
5042
5043         if (iif) {
5044                 struct net_device *dev;
5045                 int flags = 0;
5046
5047                 rcu_read_lock();
5048
5049                 dev = dev_get_by_index_rcu(net, iif);
5050                 if (!dev) {
5051                         rcu_read_unlock();
5052                         err = -ENODEV;
5053                         goto errout;
5054                 }
5055
5056                 fl6.flowi6_iif = iif;
5057
5058                 if (!ipv6_addr_any(&fl6.saddr))
5059                         flags |= RT6_LOOKUP_F_HAS_SADDR;
5060
5061                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5062
5063                 rcu_read_unlock();
5064         } else {
5065                 fl6.flowi6_oif = oif;
5066
5067                 dst = ip6_route_output(net, NULL, &fl6);
5068         }
5069
5070
5071         rt = container_of(dst, struct rt6_info, dst);
5072         if (rt->dst.error) {
5073                 err = rt->dst.error;
5074                 ip6_rt_put(rt);
5075                 goto errout;
5076         }
5077
5078         if (rt == net->ipv6.ip6_null_entry) {
5079                 err = rt->dst.error;
5080                 ip6_rt_put(rt);
5081                 goto errout;
5082         }
5083
5084         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5085         if (!skb) {
5086                 ip6_rt_put(rt);
5087                 err = -ENOBUFS;
5088                 goto errout;
5089         }
5090
5091         skb_dst_set(skb, &rt->dst);
5092
5093         rcu_read_lock();
5094         from = rcu_dereference(rt->from);
5095         if (from) {
5096                 if (fibmatch)
5097                         err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5098                                             iif, RTM_NEWROUTE,
5099                                             NETLINK_CB(in_skb).portid,
5100                                             nlh->nlmsg_seq, 0);
5101                 else
5102                         err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5103                                             &fl6.saddr, iif, RTM_NEWROUTE,
5104                                             NETLINK_CB(in_skb).portid,
5105                                             nlh->nlmsg_seq, 0);
5106         } else {
5107                 err = -ENETUNREACH;
5108         }
5109         rcu_read_unlock();
5110
5111         if (err < 0) {
5112                 kfree_skb(skb);
5113                 goto errout;
5114         }
5115
5116         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5117 errout:
5118         return err;
5119 }
5120
5121 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5122                      unsigned int nlm_flags)
5123 {
5124         struct sk_buff *skb;
5125         struct net *net = info->nl_net;
5126         u32 seq;
5127         int err;
5128
5129         err = -ENOBUFS;
5130         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5131
5132         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5133         if (!skb)
5134                 goto errout;
5135
5136         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5137                             event, info->portid, seq, nlm_flags);
5138         if (err < 0) {
5139                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5140                 WARN_ON(err == -EMSGSIZE);
5141                 kfree_skb(skb);
5142                 goto errout;
5143         }
5144         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5145                     info->nlh, gfp_any());
5146         return;
5147 errout:
5148         if (err < 0)
5149                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5150 }
5151
5152 void fib6_rt_update(struct net *net, struct fib6_info *rt,
5153                     struct nl_info *info)
5154 {
5155         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5156         struct sk_buff *skb;
5157         int err = -ENOBUFS;
5158
5159         /* call_fib6_entry_notifiers will be removed when in-kernel notifier
5160          * is implemented and supported for nexthop objects
5161          */
5162         call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
5163
5164         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5165         if (!skb)
5166                 goto errout;
5167
5168         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5169                             RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
5170         if (err < 0) {
5171                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5172                 WARN_ON(err == -EMSGSIZE);
5173                 kfree_skb(skb);
5174                 goto errout;
5175         }
5176         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5177                     info->nlh, gfp_any());
5178         return;
5179 errout:
5180         if (err < 0)
5181                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5182 }
5183
5184 static int ip6_route_dev_notify(struct notifier_block *this,
5185                                 unsigned long event, void *ptr)
5186 {
5187         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5188         struct net *net = dev_net(dev);
5189
5190         if (!(dev->flags & IFF_LOOPBACK))
5191                 return NOTIFY_OK;
5192
5193         if (event == NETDEV_REGISTER) {
5194                 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5195                 net->ipv6.ip6_null_entry->dst.dev = dev;
5196                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5197 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5198                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5199                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5200                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5201                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5202 #endif
5203          } else if (event == NETDEV_UNREGISTER &&
5204                     dev->reg_state != NETREG_UNREGISTERED) {
5205                 /* NETDEV_UNREGISTER could be fired for multiple times by
5206                  * netdev_wait_allrefs(). Make sure we only call this once.
5207                  */
5208                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5209 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5210                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5211                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5212 #endif
5213         }
5214
5215         return NOTIFY_OK;
5216 }
5217
5218 /*
5219  *      /proc
5220  */
5221
5222 #ifdef CONFIG_PROC_FS
5223 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5224 {
5225         struct net *net = (struct net *)seq->private;
5226         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5227                    net->ipv6.rt6_stats->fib_nodes,
5228                    net->ipv6.rt6_stats->fib_route_nodes,
5229                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5230                    net->ipv6.rt6_stats->fib_rt_entries,
5231                    net->ipv6.rt6_stats->fib_rt_cache,
5232                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5233                    net->ipv6.rt6_stats->fib_discarded_routes);
5234
5235         return 0;
5236 }
5237 #endif  /* CONFIG_PROC_FS */
5238
5239 #ifdef CONFIG_SYSCTL
5240
5241 static
5242 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5243                               void __user *buffer, size_t *lenp, loff_t *ppos)
5244 {
5245         struct net *net;
5246         int delay;
5247         int ret;
5248         if (!write)
5249                 return -EINVAL;
5250
5251         net = (struct net *)ctl->extra1;
5252         delay = net->ipv6.sysctl.flush_delay;
5253         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5254         if (ret)
5255                 return ret;
5256
5257         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5258         return 0;
5259 }
5260
5261 static int zero;
5262 static int one = 1;
5263
5264 static struct ctl_table ipv6_route_table_template[] = {
5265         {
5266                 .procname       =       "flush",
5267                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5268                 .maxlen         =       sizeof(int),
5269                 .mode           =       0200,
5270                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5271         },
5272         {
5273                 .procname       =       "gc_thresh",
5274                 .data           =       &ip6_dst_ops_template.gc_thresh,
5275                 .maxlen         =       sizeof(int),
5276                 .mode           =       0644,
5277                 .proc_handler   =       proc_dointvec,
5278         },
5279         {
5280                 .procname       =       "max_size",
5281                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5282                 .maxlen         =       sizeof(int),
5283                 .mode           =       0644,
5284                 .proc_handler   =       proc_dointvec,
5285         },
5286         {
5287                 .procname       =       "gc_min_interval",
5288                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5289                 .maxlen         =       sizeof(int),
5290                 .mode           =       0644,
5291                 .proc_handler   =       proc_dointvec_jiffies,
5292         },
5293         {
5294                 .procname       =       "gc_timeout",
5295                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5296                 .maxlen         =       sizeof(int),
5297                 .mode           =       0644,
5298                 .proc_handler   =       proc_dointvec_jiffies,
5299         },
5300         {
5301                 .procname       =       "gc_interval",
5302                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5303                 .maxlen         =       sizeof(int),
5304                 .mode           =       0644,
5305                 .proc_handler   =       proc_dointvec_jiffies,
5306         },
5307         {
5308                 .procname       =       "gc_elasticity",
5309                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5310                 .maxlen         =       sizeof(int),
5311                 .mode           =       0644,
5312                 .proc_handler   =       proc_dointvec,
5313         },
5314         {
5315                 .procname       =       "mtu_expires",
5316                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5317                 .maxlen         =       sizeof(int),
5318                 .mode           =       0644,
5319                 .proc_handler   =       proc_dointvec_jiffies,
5320         },
5321         {
5322                 .procname       =       "min_adv_mss",
5323                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5324                 .maxlen         =       sizeof(int),
5325                 .mode           =       0644,
5326                 .proc_handler   =       proc_dointvec,
5327         },
5328         {
5329                 .procname       =       "gc_min_interval_ms",
5330                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5331                 .maxlen         =       sizeof(int),
5332                 .mode           =       0644,
5333                 .proc_handler   =       proc_dointvec_ms_jiffies,
5334         },
5335         {
5336                 .procname       =       "skip_notify_on_dev_down",
5337                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5338                 .maxlen         =       sizeof(int),
5339                 .mode           =       0644,
5340                 .proc_handler   =       proc_dointvec,
5341                 .extra1         =       &zero,
5342                 .extra2         =       &one,
5343         },
5344         { }
5345 };
5346
5347 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5348 {
5349         struct ctl_table *table;
5350
5351         table = kmemdup(ipv6_route_table_template,
5352                         sizeof(ipv6_route_table_template),
5353                         GFP_KERNEL);
5354
5355         if (table) {
5356                 table[0].data = &net->ipv6.sysctl.flush_delay;
5357                 table[0].extra1 = net;
5358                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5359                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5360                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5361                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5362                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5363                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5364                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5365                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5366                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5367                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5368
5369                 /* Don't export sysctls to unprivileged users */
5370                 if (net->user_ns != &init_user_ns)
5371                         table[0].procname = NULL;
5372         }
5373
5374         return table;
5375 }
5376 #endif
5377
5378 static int __net_init ip6_route_net_init(struct net *net)
5379 {
5380         int ret = -ENOMEM;
5381
5382         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5383                sizeof(net->ipv6.ip6_dst_ops));
5384
5385         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5386                 goto out_ip6_dst_ops;
5387
5388         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5389                                             sizeof(*net->ipv6.fib6_null_entry),
5390                                             GFP_KERNEL);
5391         if (!net->ipv6.fib6_null_entry)
5392                 goto out_ip6_dst_entries;
5393
5394         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5395                                            sizeof(*net->ipv6.ip6_null_entry),
5396                                            GFP_KERNEL);
5397         if (!net->ipv6.ip6_null_entry)
5398                 goto out_fib6_null_entry;
5399         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5400         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5401                          ip6_template_metrics, true);
5402
5403 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5404         net->ipv6.fib6_has_custom_rules = false;
5405         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5406                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5407                                                GFP_KERNEL);
5408         if (!net->ipv6.ip6_prohibit_entry)
5409                 goto out_ip6_null_entry;
5410         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5411         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5412                          ip6_template_metrics, true);
5413
5414         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5415                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5416                                                GFP_KERNEL);
5417         if (!net->ipv6.ip6_blk_hole_entry)
5418                 goto out_ip6_prohibit_entry;
5419         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5420         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5421                          ip6_template_metrics, true);
5422 #endif
5423
5424         net->ipv6.sysctl.flush_delay = 0;
5425         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5426         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5427         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5428         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5429         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5430         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5431         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5432         net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5433
5434         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5435
5436         ret = 0;
5437 out:
5438         return ret;
5439
5440 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5441 out_ip6_prohibit_entry:
5442         kfree(net->ipv6.ip6_prohibit_entry);
5443 out_ip6_null_entry:
5444         kfree(net->ipv6.ip6_null_entry);
5445 #endif
5446 out_fib6_null_entry:
5447         kfree(net->ipv6.fib6_null_entry);
5448 out_ip6_dst_entries:
5449         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5450 out_ip6_dst_ops:
5451         goto out;
5452 }
5453
5454 static void __net_exit ip6_route_net_exit(struct net *net)
5455 {
5456         kfree(net->ipv6.fib6_null_entry);
5457         kfree(net->ipv6.ip6_null_entry);
5458 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5459         kfree(net->ipv6.ip6_prohibit_entry);
5460         kfree(net->ipv6.ip6_blk_hole_entry);
5461 #endif
5462         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5463 }
5464
5465 static int __net_init ip6_route_net_init_late(struct net *net)
5466 {
5467 #ifdef CONFIG_PROC_FS
5468         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5469                         sizeof(struct ipv6_route_iter));
5470         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5471                         rt6_stats_seq_show, NULL);
5472 #endif
5473         return 0;
5474 }
5475
5476 static void __net_exit ip6_route_net_exit_late(struct net *net)
5477 {
5478 #ifdef CONFIG_PROC_FS
5479         remove_proc_entry("ipv6_route", net->proc_net);
5480         remove_proc_entry("rt6_stats", net->proc_net);
5481 #endif
5482 }
5483
5484 static struct pernet_operations ip6_route_net_ops = {
5485         .init = ip6_route_net_init,
5486         .exit = ip6_route_net_exit,
5487 };
5488
5489 static int __net_init ipv6_inetpeer_init(struct net *net)
5490 {
5491         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5492
5493         if (!bp)
5494                 return -ENOMEM;
5495         inet_peer_base_init(bp);
5496         net->ipv6.peers = bp;
5497         return 0;
5498 }
5499
5500 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5501 {
5502         struct inet_peer_base *bp = net->ipv6.peers;
5503
5504         net->ipv6.peers = NULL;
5505         inetpeer_invalidate_tree(bp);
5506         kfree(bp);
5507 }
5508
5509 static struct pernet_operations ipv6_inetpeer_ops = {
5510         .init   =       ipv6_inetpeer_init,
5511         .exit   =       ipv6_inetpeer_exit,
5512 };
5513
5514 static struct pernet_operations ip6_route_net_late_ops = {
5515         .init = ip6_route_net_init_late,
5516         .exit = ip6_route_net_exit_late,
5517 };
5518
5519 static struct notifier_block ip6_route_dev_notifier = {
5520         .notifier_call = ip6_route_dev_notify,
5521         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5522 };
5523
5524 void __init ip6_route_init_special_entries(void)
5525 {
5526         /* Registering of the loopback is done before this portion of code,
5527          * the loopback reference in rt6_info will not be taken, do it
5528          * manually for init_net */
5529         init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5530         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5531         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5532   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5533         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5534         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5535         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5536         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5537   #endif
5538 }
5539
5540 int __init ip6_route_init(void)
5541 {
5542         int ret;
5543         int cpu;
5544
5545         ret = -ENOMEM;
5546         ip6_dst_ops_template.kmem_cachep =
5547                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5548                                   SLAB_HWCACHE_ALIGN, NULL);
5549         if (!ip6_dst_ops_template.kmem_cachep)
5550                 goto out;
5551
5552         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5553         if (ret)
5554                 goto out_kmem_cache;
5555
5556         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5557         if (ret)
5558                 goto out_dst_entries;
5559
5560         ret = register_pernet_subsys(&ip6_route_net_ops);
5561         if (ret)
5562                 goto out_register_inetpeer;
5563
5564         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5565
5566         ret = fib6_init();
5567         if (ret)
5568                 goto out_register_subsys;
5569
5570         ret = xfrm6_init();
5571         if (ret)
5572                 goto out_fib6_init;
5573
5574         ret = fib6_rules_init();
5575         if (ret)
5576                 goto xfrm6_init;
5577
5578         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5579         if (ret)
5580                 goto fib6_rules_init;
5581
5582         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5583                                    inet6_rtm_newroute, NULL, 0);
5584         if (ret < 0)
5585                 goto out_register_late_subsys;
5586
5587         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5588                                    inet6_rtm_delroute, NULL, 0);
5589         if (ret < 0)
5590                 goto out_register_late_subsys;
5591
5592         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5593                                    inet6_rtm_getroute, NULL,
5594                                    RTNL_FLAG_DOIT_UNLOCKED);
5595         if (ret < 0)
5596                 goto out_register_late_subsys;
5597
5598         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5599         if (ret)
5600                 goto out_register_late_subsys;
5601
5602         for_each_possible_cpu(cpu) {
5603                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5604
5605                 INIT_LIST_HEAD(&ul->head);
5606                 spin_lock_init(&ul->lock);
5607         }
5608
5609 out:
5610         return ret;
5611
5612 out_register_late_subsys:
5613         rtnl_unregister_all(PF_INET6);
5614         unregister_pernet_subsys(&ip6_route_net_late_ops);
5615 fib6_rules_init:
5616         fib6_rules_cleanup();
5617 xfrm6_init:
5618         xfrm6_fini();
5619 out_fib6_init:
5620         fib6_gc_cleanup();
5621 out_register_subsys:
5622         unregister_pernet_subsys(&ip6_route_net_ops);
5623 out_register_inetpeer:
5624         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5625 out_dst_entries:
5626         dst_entries_destroy(&ip6_dst_blackhole_ops);
5627 out_kmem_cache:
5628         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5629         goto out;
5630 }
5631
5632 void ip6_route_cleanup(void)
5633 {
5634         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5635         unregister_pernet_subsys(&ip6_route_net_late_ops);
5636         fib6_rules_cleanup();
5637         xfrm6_fini();
5638         fib6_gc_cleanup();
5639         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5640         unregister_pernet_subsys(&ip6_route_net_ops);
5641         dst_entries_destroy(&ip6_dst_blackhole_ops);
5642         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5643 }