clk: renesas: r8a77965: Replace DU2 clock
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Outcome of the neighbour-reachability check used while scoring routes.
 *
 * Negative values are failures; rt6_score_route() hands them back to its
 * caller unchanged, and find_match() treats them as follows:
 *   RT6_NUD_FAIL_HARD  - the route is skipped outright.
 *   RT6_NUD_FAIL_PROBE - neighbour is in NUD_FAILED state (only produced
 *                        when CONFIG_IPV6_ROUTER_PREF is set).
 *   RT6_NUD_FAIL_DO_RR - no neighbour entry; score is clamped to 0 and
 *                        round-robin is requested.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,
	RT6_NUD_FAIL_PROBE	= -2,
	RT6_NUD_FAIL_DO_RR	= -1,
	RT6_NUD_SUCCEED		=  1,
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv(); defined later in this file. */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
122 #endif
123
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(&rt->from->dst);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* dst_ops->redirect for blackhole dsts: redirects are ignored. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* intentionally empty */
}
281
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325         .rt6i_protocol  = RTPROT_KERNEL,
326         .rt6i_metric    = ~(u32) 0,
327         .rt6i_ref       = ATOMIC_INIT(1),
328 };
329
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340         .rt6i_protocol  = RTPROT_KERNEL,
341         .rt6i_metric    = ~(u32) 0,
342         .rt6i_ref       = ATOMIC_INIT(1),
343 };
344
345 #endif
346
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct rt6_info *from = rt->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         rt->from = NULL;
413         dst_release(&from->dst);
414 }
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                         rt6_check_expired(rt->from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458
459         /* We might have already computed the hash for ICMPv6 errors. In such
460          * case it will always be non-zero. Otherwise now is the time to do it.
461          */
462         if (!fl6->mp_hash)
463                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
464
465         if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
466                 return match;
467
468         list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
469                                  rt6i_siblings) {
470                 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
471                         continue;
472                 if (rt6_score_route(sibling, oif, strict) < 0)
473                         break;
474                 match = sibling;
475                 break;
476         }
477
478         return match;
479 }
480
481 /*
482  *      Route lookup. rcu_read_lock() should be held.
483  */
484
485 static inline struct rt6_info *rt6_device_match(struct net *net,
486                                                     struct rt6_info *rt,
487                                                     const struct in6_addr *saddr,
488                                                     int oif,
489                                                     int flags)
490 {
491         struct rt6_info *local = NULL;
492         struct rt6_info *sprt;
493
494         if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
495                 return rt;
496
497         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
498                 struct net_device *dev = sprt->dst.dev;
499
500                 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
501                         continue;
502
503                 if (oif) {
504                         if (dev->ifindex == oif)
505                                 return sprt;
506                         if (dev->flags & IFF_LOOPBACK) {
507                                 if (!sprt->rt6i_idev ||
508                                     sprt->rt6i_idev->dev->ifindex != oif) {
509                                         if (flags & RT6_LOOKUP_F_IFACE)
510                                                 continue;
511                                         if (local &&
512                                             local->rt6i_idev->dev->ifindex == oif)
513                                                 continue;
514                                 }
515                                 local = sprt;
516                         }
517                 } else {
518                         if (ipv6_chk_addr(net, saddr, dev,
519                                           flags & RT6_LOOKUP_F_IFACE))
520                                 return sprt;
521                 }
522         }
523
524         if (oif) {
525                 if (local)
526                         return local;
527
528                 if (flags & RT6_LOOKUP_F_IFACE)
529                         return net->ipv6.ip6_null_entry;
530         }
531
532         return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
533 }
534
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537         struct work_struct work;
538         struct in6_addr target;
539         struct net_device *dev;
540 };
541
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544         struct in6_addr mcaddr;
545         struct __rt6_probe_work *work =
546                 container_of(w, struct __rt6_probe_work, work);
547
548         addrconf_addr_solict_mult(&work->target, &mcaddr);
549         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550         dev_put(work->dev);
551         kfree(work);
552 }
553
554 static void rt6_probe(struct rt6_info *rt)
555 {
556         struct __rt6_probe_work *work;
557         struct neighbour *neigh;
558         /*
559          * Okay, this does not seem to be appropriate
560          * for now, however, we need to check if it
561          * is really so; aka Router Reachability Probing.
562          *
563          * Router Reachability Probe MUST be rate-limited
564          * to no more than one per minute.
565          */
566         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567                 return;
568         rcu_read_lock_bh();
569         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570         if (neigh) {
571                 if (neigh->nud_state & NUD_VALID)
572                         goto out;
573
574                 work = NULL;
575                 write_lock(&neigh->lock);
576                 if (!(neigh->nud_state & NUD_VALID) &&
577                     time_after(jiffies,
578                                neigh->updated +
579                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else {
586                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
587         }
588
589         if (work) {
590                 INIT_WORK(&work->work, rt6_probe_deferred);
591                 work->target = rt->rt6i_gateway;
592                 dev_hold(rt->dst.dev);
593                 work->dev = rt->dst.dev;
594                 schedule_work(&work->work);
595         }
596
597 out:
598         rcu_read_unlock_bh();
599 }
600 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* nothing to do */
}
604 #endif
605
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611         struct net_device *dev = rt->dst.dev;
612         if (!oif || dev->ifindex == oif)
613                 return 2;
614         if ((dev->flags & IFF_LOOPBACK) &&
615             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616                 return 1;
617         return 0;
618 }
619
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622         struct neighbour *neigh;
623         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624
625         if (rt->rt6i_flags & RTF_NONEXTHOP ||
626             !(rt->rt6i_flags & RTF_GATEWAY))
627                 return RT6_NUD_SUCCEED;
628
629         rcu_read_lock_bh();
630         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631         if (neigh) {
632                 read_lock(&neigh->lock);
633                 if (neigh->nud_state & NUD_VALID)
634                         ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636                 else if (!(neigh->nud_state & NUD_FAILED))
637                         ret = RT6_NUD_SUCCEED;
638                 else
639                         ret = RT6_NUD_FAIL_PROBE;
640 #endif
641                 read_unlock(&neigh->lock);
642         } else {
643                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645         }
646         rcu_read_unlock_bh();
647
648         return ret;
649 }
650
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652                            int strict)
653 {
654         int m;
655
656         m = rt6_check_dev(rt, oif);
657         if (!m && (strict & RT6_LOOKUP_F_IFACE))
658                 return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662         if (strict & RT6_LOOKUP_F_REACHABLE) {
663                 int n = rt6_check_neigh(rt);
664                 if (n < 0)
665                         return n;
666         }
667         return m;
668 }
669
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671                                    int *mpri, struct rt6_info *match,
672                                    bool *do_rr)
673 {
674         int m;
675         bool match_do_rr = false;
676         struct inet6_dev *idev = rt->rt6i_idev;
677
678         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
679                 goto out;
680
681         if (idev->cnf.ignore_routes_with_linkdown &&
682             rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
683             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
684                 goto out;
685
686         if (rt6_check_expired(rt))
687                 goto out;
688
689         m = rt6_score_route(rt, oif, strict);
690         if (m == RT6_NUD_FAIL_DO_RR) {
691                 match_do_rr = true;
692                 m = 0; /* lowest valid score */
693         } else if (m == RT6_NUD_FAIL_HARD) {
694                 goto out;
695         }
696
697         if (strict & RT6_LOOKUP_F_REACHABLE)
698                 rt6_probe(rt);
699
700         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
701         if (m > *mpri) {
702                 *do_rr = match_do_rr;
703                 *mpri = m;
704                 match = rt;
705         }
706 out:
707         return match;
708 }
709
710 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
711                                      struct rt6_info *leaf,
712                                      struct rt6_info *rr_head,
713                                      u32 metric, int oif, int strict,
714                                      bool *do_rr)
715 {
716         struct rt6_info *rt, *match, *cont;
717         int mpri = -1;
718
719         match = NULL;
720         cont = NULL;
721         for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
722                 if (rt->rt6i_metric != metric) {
723                         cont = rt;
724                         break;
725                 }
726
727                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728         }
729
730         for (rt = leaf; rt && rt != rr_head;
731              rt = rcu_dereference(rt->rt6_next)) {
732                 if (rt->rt6i_metric != metric) {
733                         cont = rt;
734                         break;
735                 }
736
737                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
738         }
739
740         if (match || !cont)
741                 return match;
742
743         for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
744                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
745
746         return match;
747 }
748
749 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
750                                    int oif, int strict)
751 {
752         struct rt6_info *leaf = rcu_dereference(fn->leaf);
753         struct rt6_info *match, *rt0;
754         bool do_rr = false;
755         int key_plen;
756
757         if (!leaf || leaf == net->ipv6.ip6_null_entry)
758                 return net->ipv6.ip6_null_entry;
759
760         rt0 = rcu_dereference(fn->rr_ptr);
761         if (!rt0)
762                 rt0 = leaf;
763
764         /* Double check to make sure fn is not an intermediate node
765          * and fn->leaf does not points to its child's leaf
766          * (This might happen if all routes under fn are deleted from
767          * the tree and fib6_repair_tree() is called on the node.)
768          */
769         key_plen = rt0->rt6i_dst.plen;
770 #ifdef CONFIG_IPV6_SUBTREES
771         if (rt0->rt6i_src.plen)
772                 key_plen = rt0->rt6i_src.plen;
773 #endif
774         if (fn->fn_bit != key_plen)
775                 return net->ipv6.ip6_null_entry;
776
777         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
778                              &do_rr);
779
780         if (do_rr) {
781                 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
782
783                 /* no entries matched; do round-robin */
784                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
785                         next = leaf;
786
787                 if (next != rt0) {
788                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
789                         /* make sure next is not being deleted from the tree */
790                         if (next->rt6i_node)
791                                 rcu_assign_pointer(fn->rr_ptr, next);
792                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
793                 }
794         }
795
796         return match ? match : net->ipv6.ip6_null_entry;
797 }
798
799 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
800 {
801         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
802 }
803
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle one route information option (struct route_info).
 *
 * @dev:    device the option was received on
 * @opt:    start of the option, interpreted as a struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address handed to the route lookup/creation helpers
 *
 * The matching route is looked up (the default-router entry when the
 * prefix length is zero).  An existing route is deleted when the option
 * carries a zero lifetime; a missing route is created when the lifetime
 * is non-zero; otherwise the preference bits of the existing route are
 * refreshed.  Finally the route's expiry is set from the lifetime.
 *
 * Returns 0, or -EINVAL when the option fails validation.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	unsigned long lifetime;
	unsigned int pref;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Cross-check the option length against the prefix length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length != 3) {
		/* Work on a local copy built by ipv6_addr_prefix() instead
		 * of pointing straight into the option.
		 */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	} else {
		prefix = (struct in6_addr *)rinfo->prefix;
	}

	if (rinfo->prefix_len)
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);
	else
		rt = rt6_get_dflt_router(gwaddr, dev);

	/* A zero lifetime withdraws a route we already know about */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime))
		rt6_set_expires(rt, jiffies + HZ * lifetime);
	else
		rt6_clean_expires(rt);

	ip6_rt_put(rt);
	return 0;
}
#endif
879
880 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
881                                         struct in6_addr *saddr)
882 {
883         struct fib6_node *pn, *sn;
884         while (1) {
885                 if (fn->fn_flags & RTN_TL_ROOT)
886                         return NULL;
887                 pn = rcu_dereference(fn->parent);
888                 sn = FIB6_SUBTREE(pn);
889                 if (sn && sn != fn)
890                         fn = fib6_lookup(sn, NULL, saddr);
891                 else
892                         fn = pn;
893                 if (fn->fn_flags & RTN_RTINFO)
894                         return fn;
895         }
896 }
897
898 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
899                           bool null_fallback)
900 {
901         struct rt6_info *rt = *prt;
902
903         if (dst_hold_safe(&rt->dst))
904                 return true;
905         if (null_fallback) {
906                 rt = net->ipv6.ip6_null_entry;
907                 dst_hold(&rt->dst);
908         } else {
909                 rt = NULL;
910         }
911         *prt = rt;
912         return false;
913 }
914
915 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
916                                              struct fib6_table *table,
917                                              struct flowi6 *fl6, int flags)
918 {
919         struct rt6_info *rt, *rt_cache;
920         struct fib6_node *fn;
921
922         rcu_read_lock();
923         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
924 restart:
925         rt = rcu_dereference(fn->leaf);
926         if (!rt) {
927                 rt = net->ipv6.ip6_null_entry;
928         } else {
929                 rt = rt6_device_match(net, rt, &fl6->saddr,
930                                       fl6->flowi6_oif, flags);
931                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
932                         rt = rt6_multipath_select(rt, fl6,
933                                                   fl6->flowi6_oif, flags);
934         }
935         if (rt == net->ipv6.ip6_null_entry) {
936                 fn = fib6_backtrack(fn, &fl6->saddr);
937                 if (fn)
938                         goto restart;
939         }
940         /* Search through exception table */
941         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
942         if (rt_cache)
943                 rt = rt_cache;
944
945         if (ip6_hold_safe(net, &rt, true))
946                 dst_use_noref(&rt->dst, jiffies);
947
948         rcu_read_unlock();
949
950         trace_fib6_table_lookup(net, rt, table, fl6);
951
952         return rt;
953
954 }
955
956 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
957                                     int flags)
958 {
959         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
960 }
961 EXPORT_SYMBOL_GPL(ip6_route_lookup);
962
963 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
964                             const struct in6_addr *saddr, int oif, int strict)
965 {
966         struct flowi6 fl6 = {
967                 .flowi6_oif = oif,
968                 .daddr = *daddr,
969         };
970         struct dst_entry *dst;
971         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
972
973         if (saddr) {
974                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
975                 flags |= RT6_LOOKUP_F_HAS_SADDR;
976         }
977
978         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
979         if (dst->error == 0)
980                 return (struct rt6_info *) dst;
981
982         dst_release(dst);
983
984         return NULL;
985 }
986 EXPORT_SYMBOL(rt6_lookup);
987
988 /* ip6_ins_rt is called with FREE table->tb6_lock.
989  * It takes new route entry, the addition fails by any reason the
990  * route is released.
991  * Caller must hold dst before calling it.
992  */
993
994 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
995                         struct mx6_config *mxc,
996                         struct netlink_ext_ack *extack)
997 {
998         int err;
999         struct fib6_table *table;
1000
1001         table = rt->rt6i_table;
1002         spin_lock_bh(&table->tb6_lock);
1003         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1004         spin_unlock_bh(&table->tb6_lock);
1005
1006         return err;
1007 }
1008
1009 int ip6_ins_rt(struct rt6_info *rt)
1010 {
1011         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1012         struct mx6_config mxc = { .mx = NULL, };
1013
1014         /* Hold dst to account for the reference from the fib6 tree */
1015         dst_hold(&rt->dst);
1016         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1017 }
1018
1019 /* called with rcu_lock held */
1020 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1021 {
1022         struct net_device *dev = rt->dst.dev;
1023
1024         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1025                 /* for copies of local routes, dst->dev needs to be the
1026                  * device if it is a master device, the master device if
1027                  * device is enslaved, and the loopback as the default
1028                  */
1029                 if (netif_is_l3_slave(dev) &&
1030                     !rt6_need_strict(&rt->rt6i_dst.addr))
1031                         dev = l3mdev_master_dev_rcu(dev);
1032                 else if (!netif_is_l3_master(dev))
1033                         dev = dev_net(dev)->loopback_dev;
1034                 /* last case is netif_is_l3_master(dev) is true in which
1035                  * case we want dev returned to be dev
1036                  */
1037         }
1038
1039         return dev;
1040 }
1041
1042 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1043                                            const struct in6_addr *daddr,
1044                                            const struct in6_addr *saddr)
1045 {
1046         struct net_device *dev;
1047         struct rt6_info *rt;
1048
1049         /*
1050          *      Clone the route.
1051          */
1052
1053         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1054                 ort = ort->from;
1055
1056         rcu_read_lock();
1057         dev = ip6_rt_get_dev_rcu(ort);
1058         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1059         rcu_read_unlock();
1060         if (!rt)
1061                 return NULL;
1062
1063         ip6_rt_copy_init(rt, ort);
1064         rt->rt6i_flags |= RTF_CACHE;
1065         rt->rt6i_metric = 0;
1066         rt->dst.flags |= DST_HOST;
1067         rt->rt6i_dst.addr = *daddr;
1068         rt->rt6i_dst.plen = 128;
1069
1070         if (!rt6_is_gw_or_nonexthop(ort)) {
1071                 if (ort->rt6i_dst.plen != 128 &&
1072                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1073                         rt->rt6i_flags |= RTF_ANYCAST;
1074 #ifdef CONFIG_IPV6_SUBTREES
1075                 if (rt->rt6i_src.plen && saddr) {
1076                         rt->rt6i_src.addr = *saddr;
1077                         rt->rt6i_src.plen = 128;
1078                 }
1079 #endif
1080         }
1081
1082         return rt;
1083 }
1084
1085 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1086 {
1087         struct net_device *dev;
1088         struct rt6_info *pcpu_rt;
1089
1090         rcu_read_lock();
1091         dev = ip6_rt_get_dev_rcu(rt);
1092         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1093         rcu_read_unlock();
1094         if (!pcpu_rt)
1095                 return NULL;
1096         ip6_rt_copy_init(pcpu_rt, rt);
1097         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1098         pcpu_rt->rt6i_flags |= RTF_PCPU;
1099         return pcpu_rt;
1100 }
1101
1102 /* It should be called with rcu_read_lock() acquired */
1103 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1104 {
1105         struct rt6_info *pcpu_rt, **p;
1106
1107         p = this_cpu_ptr(rt->rt6i_pcpu);
1108         pcpu_rt = *p;
1109
1110         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1111                 rt6_dst_from_metrics_check(pcpu_rt);
1112
1113         return pcpu_rt;
1114 }
1115
1116 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1117 {
1118         struct rt6_info *pcpu_rt, *prev, **p;
1119
1120         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1121         if (!pcpu_rt) {
1122                 struct net *net = dev_net(rt->dst.dev);
1123
1124                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1125                 return net->ipv6.ip6_null_entry;
1126         }
1127
1128         dst_hold(&pcpu_rt->dst);
1129         p = this_cpu_ptr(rt->rt6i_pcpu);
1130         prev = cmpxchg(p, NULL, pcpu_rt);
1131         BUG_ON(prev);
1132
1133         rt6_dst_from_metrics_check(pcpu_rt);
1134         return pcpu_rt;
1135 }
1136
1137 /* exception hash table implementation
1138  */
1139 static DEFINE_SPINLOCK(rt6_exception_lock);
1140
1141 /* Remove rt6_ex from hash table and free the memory
1142  * Caller must hold rt6_exception_lock
1143  */
1144 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1145                                  struct rt6_exception *rt6_ex)
1146 {
1147         struct net *net;
1148
1149         if (!bucket || !rt6_ex)
1150                 return;
1151
1152         net = dev_net(rt6_ex->rt6i->dst.dev);
1153         rt6_ex->rt6i->rt6i_node = NULL;
1154         hlist_del_rcu(&rt6_ex->hlist);
1155         rt6_release(rt6_ex->rt6i);
1156         kfree_rcu(rt6_ex, rcu);
1157         WARN_ON_ONCE(!bucket->depth);
1158         bucket->depth--;
1159         net->ipv6.rt6_stats->fib_rt_cache--;
1160 }
1161
1162 /* Remove oldest rt6_ex in bucket and free the memory
1163  * Caller must hold rt6_exception_lock
1164  */
1165 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1166 {
1167         struct rt6_exception *rt6_ex, *oldest = NULL;
1168
1169         if (!bucket)
1170                 return;
1171
1172         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1173                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1174                         oldest = rt6_ex;
1175         }
1176         rt6_remove_exception(bucket, oldest);
1177 }
1178
1179 static u32 rt6_exception_hash(const struct in6_addr *dst,
1180                               const struct in6_addr *src)
1181 {
1182         static u32 seed __read_mostly;
1183         u32 val;
1184
1185         net_get_random_once(&seed, sizeof(seed));
1186         val = jhash(dst, sizeof(*dst), seed);
1187
1188 #ifdef CONFIG_IPV6_SUBTREES
1189         if (src)
1190                 val = jhash(src, sizeof(*src), val);
1191 #endif
1192         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1193 }
1194
1195 /* Helper function to find the cached rt in the hash table
1196  * and update bucket pointer to point to the bucket for this
1197  * (daddr, saddr) pair
1198  * Caller must hold rt6_exception_lock
1199  */
1200 static struct rt6_exception *
1201 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1202                               const struct in6_addr *daddr,
1203                               const struct in6_addr *saddr)
1204 {
1205         struct rt6_exception *rt6_ex;
1206         u32 hval;
1207
1208         if (!(*bucket) || !daddr)
1209                 return NULL;
1210
1211         hval = rt6_exception_hash(daddr, saddr);
1212         *bucket += hval;
1213
1214         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1215                 struct rt6_info *rt6 = rt6_ex->rt6i;
1216                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1217
1218 #ifdef CONFIG_IPV6_SUBTREES
1219                 if (matched && saddr)
1220                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1221 #endif
1222                 if (matched)
1223                         return rt6_ex;
1224         }
1225         return NULL;
1226 }
1227
1228 /* Helper function to find the cached rt in the hash table
1229  * and update bucket pointer to point to the bucket for this
1230  * (daddr, saddr) pair
1231  * Caller must hold rcu_read_lock()
1232  */
1233 static struct rt6_exception *
1234 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1235                          const struct in6_addr *daddr,
1236                          const struct in6_addr *saddr)
1237 {
1238         struct rt6_exception *rt6_ex;
1239         u32 hval;
1240
1241         WARN_ON_ONCE(!rcu_read_lock_held());
1242
1243         if (!(*bucket) || !daddr)
1244                 return NULL;
1245
1246         hval = rt6_exception_hash(daddr, saddr);
1247         *bucket += hval;
1248
1249         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1250                 struct rt6_info *rt6 = rt6_ex->rt6i;
1251                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1252
1253 #ifdef CONFIG_IPV6_SUBTREES
1254                 if (matched && saddr)
1255                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1256 #endif
1257                 if (matched)
1258                         return rt6_ex;
1259         }
1260         return NULL;
1261 }
1262
1263 static int rt6_insert_exception(struct rt6_info *nrt,
1264                                 struct rt6_info *ort)
1265 {
1266         struct net *net = dev_net(ort->dst.dev);
1267         struct rt6_exception_bucket *bucket;
1268         struct in6_addr *src_key = NULL;
1269         struct rt6_exception *rt6_ex;
1270         int err = 0;
1271
1272         /* ort can't be a cache or pcpu route */
1273         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1274                 ort = ort->from;
1275         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1276
1277         spin_lock_bh(&rt6_exception_lock);
1278
1279         if (ort->exception_bucket_flushed) {
1280                 err = -EINVAL;
1281                 goto out;
1282         }
1283
1284         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1285                                         lockdep_is_held(&rt6_exception_lock));
1286         if (!bucket) {
1287                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1288                                  GFP_ATOMIC);
1289                 if (!bucket) {
1290                         err = -ENOMEM;
1291                         goto out;
1292                 }
1293                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1294         }
1295
1296 #ifdef CONFIG_IPV6_SUBTREES
1297         /* rt6i_src.plen != 0 indicates ort is in subtree
1298          * and exception table is indexed by a hash of
1299          * both rt6i_dst and rt6i_src.
1300          * Otherwise, the exception table is indexed by
1301          * a hash of only rt6i_dst.
1302          */
1303         if (ort->rt6i_src.plen)
1304                 src_key = &nrt->rt6i_src.addr;
1305 #endif
1306
1307         /* Update rt6i_prefsrc as it could be changed
1308          * in rt6_remove_prefsrc()
1309          */
1310         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1311         /* rt6_mtu_change() might lower mtu on ort.
1312          * Only insert this exception route if its mtu
1313          * is less than ort's mtu value.
1314          */
1315         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1316                 err = -EINVAL;
1317                 goto out;
1318         }
1319
1320         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1321                                                src_key);
1322         if (rt6_ex)
1323                 rt6_remove_exception(bucket, rt6_ex);
1324
1325         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1326         if (!rt6_ex) {
1327                 err = -ENOMEM;
1328                 goto out;
1329         }
1330         rt6_ex->rt6i = nrt;
1331         rt6_ex->stamp = jiffies;
1332         atomic_inc(&nrt->rt6i_ref);
1333         nrt->rt6i_node = ort->rt6i_node;
1334         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1335         bucket->depth++;
1336         net->ipv6.rt6_stats->fib_rt_cache++;
1337
1338         if (bucket->depth > FIB6_MAX_DEPTH)
1339                 rt6_exception_remove_oldest(bucket);
1340
1341 out:
1342         spin_unlock_bh(&rt6_exception_lock);
1343
1344         /* Update fn->fn_sernum to invalidate all cached dst */
1345         if (!err) {
1346                 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1347                 fib6_update_sernum(ort);
1348                 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1349                 fib6_force_start_gc(net);
1350         }
1351
1352         return err;
1353 }
1354
1355 void rt6_flush_exceptions(struct rt6_info *rt)
1356 {
1357         struct rt6_exception_bucket *bucket;
1358         struct rt6_exception *rt6_ex;
1359         struct hlist_node *tmp;
1360         int i;
1361
1362         spin_lock_bh(&rt6_exception_lock);
1363         /* Prevent rt6_insert_exception() to recreate the bucket list */
1364         rt->exception_bucket_flushed = 1;
1365
1366         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1367                                     lockdep_is_held(&rt6_exception_lock));
1368         if (!bucket)
1369                 goto out;
1370
1371         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1372                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1373                         rt6_remove_exception(bucket, rt6_ex);
1374                 WARN_ON_ONCE(bucket->depth);
1375                 bucket++;
1376         }
1377
1378 out:
1379         spin_unlock_bh(&rt6_exception_lock);
1380 }
1381
1382 /* Find cached rt in the hash table inside passed in rt
1383  * Caller has to hold rcu_read_lock()
1384  */
1385 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1386                                            struct in6_addr *daddr,
1387                                            struct in6_addr *saddr)
1388 {
1389         struct rt6_exception_bucket *bucket;
1390         struct in6_addr *src_key = NULL;
1391         struct rt6_exception *rt6_ex;
1392         struct rt6_info *res = NULL;
1393
1394         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1395
1396 #ifdef CONFIG_IPV6_SUBTREES
1397         /* rt6i_src.plen != 0 indicates rt is in subtree
1398          * and exception table is indexed by a hash of
1399          * both rt6i_dst and rt6i_src.
1400          * Otherwise, the exception table is indexed by
1401          * a hash of only rt6i_dst.
1402          */
1403         if (rt->rt6i_src.plen)
1404                 src_key = saddr;
1405 #endif
1406         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1407
1408         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1409                 res = rt6_ex->rt6i;
1410
1411         return res;
1412 }
1413
1414 /* Remove the passed in cached rt from the hash table that contains it */
1415 int rt6_remove_exception_rt(struct rt6_info *rt)
1416 {
1417         struct rt6_exception_bucket *bucket;
1418         struct rt6_info *from = rt->from;
1419         struct in6_addr *src_key = NULL;
1420         struct rt6_exception *rt6_ex;
1421         int err;
1422
1423         if (!from ||
1424             !(rt->rt6i_flags & RTF_CACHE))
1425                 return -EINVAL;
1426
1427         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1428                 return -ENOENT;
1429
1430         spin_lock_bh(&rt6_exception_lock);
1431         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1432                                     lockdep_is_held(&rt6_exception_lock));
1433 #ifdef CONFIG_IPV6_SUBTREES
1434         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1435          * and exception table is indexed by a hash of
1436          * both rt6i_dst and rt6i_src.
1437          * Otherwise, the exception table is indexed by
1438          * a hash of only rt6i_dst.
1439          */
1440         if (from->rt6i_src.plen)
1441                 src_key = &rt->rt6i_src.addr;
1442 #endif
1443         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1444                                                &rt->rt6i_dst.addr,
1445                                                src_key);
1446         if (rt6_ex) {
1447                 rt6_remove_exception(bucket, rt6_ex);
1448                 err = 0;
1449         } else {
1450                 err = -ENOENT;
1451         }
1452
1453         spin_unlock_bh(&rt6_exception_lock);
1454         return err;
1455 }
1456
1457 /* Find rt6_ex which contains the passed in rt cache and
1458  * refresh its stamp
1459  */
1460 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1461 {
1462         struct rt6_exception_bucket *bucket;
1463         struct rt6_info *from = rt->from;
1464         struct in6_addr *src_key = NULL;
1465         struct rt6_exception *rt6_ex;
1466
1467         if (!from ||
1468             !(rt->rt6i_flags & RTF_CACHE))
1469                 return;
1470
1471         rcu_read_lock();
1472         bucket = rcu_dereference(from->rt6i_exception_bucket);
1473
1474 #ifdef CONFIG_IPV6_SUBTREES
1475         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1476          * and exception table is indexed by a hash of
1477          * both rt6i_dst and rt6i_src.
1478          * Otherwise, the exception table is indexed by
1479          * a hash of only rt6i_dst.
1480          */
1481         if (from->rt6i_src.plen)
1482                 src_key = &rt->rt6i_src.addr;
1483 #endif
1484         rt6_ex = __rt6_find_exception_rcu(&bucket,
1485                                           &rt->rt6i_dst.addr,
1486                                           src_key);
1487         if (rt6_ex)
1488                 rt6_ex->stamp = jiffies;
1489
1490         rcu_read_unlock();
1491 }
1492
1493 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1494 {
1495         struct rt6_exception_bucket *bucket;
1496         struct rt6_exception *rt6_ex;
1497         int i;
1498
1499         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1500                                         lockdep_is_held(&rt6_exception_lock));
1501
1502         if (bucket) {
1503                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1504                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1505                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1506                         }
1507                         bucket++;
1508                 }
1509         }
1510 }
1511
1512 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1513 {
1514         struct rt6_exception_bucket *bucket;
1515         struct rt6_exception *rt6_ex;
1516         int i;
1517
1518         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1519                                         lockdep_is_held(&rt6_exception_lock));
1520
1521         if (bucket) {
1522                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1523                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1524                                 struct rt6_info *entry = rt6_ex->rt6i;
1525                                 /* For RTF_CACHE with rt6i_pmtu == 0
1526                                  * (i.e. a redirected route),
1527                                  * the metrics of its rt->dst.from has already
1528                                  * been updated.
1529                                  */
1530                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1531                                         entry->rt6i_pmtu = mtu;
1532                         }
1533                         bucket++;
1534                 }
1535         }
1536 }
1537
1538 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1539
1540 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1541                                         struct in6_addr *gateway)
1542 {
1543         struct rt6_exception_bucket *bucket;
1544         struct rt6_exception *rt6_ex;
1545         struct hlist_node *tmp;
1546         int i;
1547
1548         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1549                 return;
1550
1551         spin_lock_bh(&rt6_exception_lock);
1552         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1553                                      lockdep_is_held(&rt6_exception_lock));
1554
1555         if (bucket) {
1556                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1557                         hlist_for_each_entry_safe(rt6_ex, tmp,
1558                                                   &bucket->chain, hlist) {
1559                                 struct rt6_info *entry = rt6_ex->rt6i;
1560
1561                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1562                                     RTF_CACHE_GATEWAY &&
1563                                     ipv6_addr_equal(gateway,
1564                                                     &entry->rt6i_gateway)) {
1565                                         rt6_remove_exception(bucket, rt6_ex);
1566                                 }
1567                         }
1568                         bucket++;
1569                 }
1570         }
1571
1572         spin_unlock_bh(&rt6_exception_lock);
1573 }
1574
1575 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1576                                       struct rt6_exception *rt6_ex,
1577                                       struct fib6_gc_args *gc_args,
1578                                       unsigned long now)
1579 {
1580         struct rt6_info *rt = rt6_ex->rt6i;
1581
1582         /* we are pruning and obsoleting aged-out and non gateway exceptions
1583          * even if others have still references to them, so that on next
1584          * dst_check() such references can be dropped.
1585          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1586          * expired, independently from their aging, as per RFC 8201 section 4
1587          */
1588         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1589                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1590                         RT6_TRACE("aging clone %p\n", rt);
1591                         rt6_remove_exception(bucket, rt6_ex);
1592                         return;
1593                 }
1594         } else if (time_after(jiffies, rt->dst.expires)) {
1595                 RT6_TRACE("purging expired route %p\n", rt);
1596                 rt6_remove_exception(bucket, rt6_ex);
1597                 return;
1598         }
1599
1600         if (rt->rt6i_flags & RTF_GATEWAY) {
1601                 struct neighbour *neigh;
1602                 __u8 neigh_flags = 0;
1603
1604                 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1605                 if (neigh) {
1606                         neigh_flags = neigh->flags;
1607                         neigh_release(neigh);
1608                 }
1609                 if (!(neigh_flags & NTF_ROUTER)) {
1610                         RT6_TRACE("purging route %p via non-router but gateway\n",
1611                                   rt);
1612                         rt6_remove_exception(bucket, rt6_ex);
1613                         return;
1614                 }
1615         }
1616
1617         gc_args->more++;
1618 }
1619
1620 void rt6_age_exceptions(struct rt6_info *rt,
1621                         struct fib6_gc_args *gc_args,
1622                         unsigned long now)
1623 {
1624         struct rt6_exception_bucket *bucket;
1625         struct rt6_exception *rt6_ex;
1626         struct hlist_node *tmp;
1627         int i;
1628
1629         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1630                 return;
1631
1632         spin_lock_bh(&rt6_exception_lock);
1633         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1634                                     lockdep_is_held(&rt6_exception_lock));
1635
1636         if (bucket) {
1637                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1638                         hlist_for_each_entry_safe(rt6_ex, tmp,
1639                                                   &bucket->chain, hlist) {
1640                                 rt6_age_examine_exception(bucket, rt6_ex,
1641                                                           gc_args, now);
1642                         }
1643                         bucket++;
1644                 }
1645         }
1646         spin_unlock_bh(&rt6_exception_lock);
1647 }
1648
1649 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1650                                int oif, struct flowi6 *fl6, int flags)
1651 {
1652         struct fib6_node *fn, *saved_fn;
1653         struct rt6_info *rt, *rt_cache;
1654         int strict = 0;
1655
1656         strict |= flags & RT6_LOOKUP_F_IFACE;
1657         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1658         if (net->ipv6.devconf_all->forwarding == 0)
1659                 strict |= RT6_LOOKUP_F_REACHABLE;
1660
1661         rcu_read_lock();
1662
1663         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1664         saved_fn = fn;
1665
1666         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1667                 oif = 0;
1668
1669 redo_rt6_select:
1670         rt = rt6_select(net, fn, oif, strict);
1671         if (rt->rt6i_nsiblings)
1672                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1673         if (rt == net->ipv6.ip6_null_entry) {
1674                 fn = fib6_backtrack(fn, &fl6->saddr);
1675                 if (fn)
1676                         goto redo_rt6_select;
1677                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1678                         /* also consider unreachable route */
1679                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1680                         fn = saved_fn;
1681                         goto redo_rt6_select;
1682                 }
1683         }
1684
1685         /*Search through exception table */
1686         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1687         if (rt_cache)
1688                 rt = rt_cache;
1689
1690         if (rt == net->ipv6.ip6_null_entry) {
1691                 rcu_read_unlock();
1692                 dst_hold(&rt->dst);
1693                 trace_fib6_table_lookup(net, rt, table, fl6);
1694                 return rt;
1695         } else if (rt->rt6i_flags & RTF_CACHE) {
1696                 if (ip6_hold_safe(net, &rt, true)) {
1697                         dst_use_noref(&rt->dst, jiffies);
1698                         rt6_dst_from_metrics_check(rt);
1699                 }
1700                 rcu_read_unlock();
1701                 trace_fib6_table_lookup(net, rt, table, fl6);
1702                 return rt;
1703         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1704                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1705                 /* Create a RTF_CACHE clone which will not be
1706                  * owned by the fib6 tree.  It is for the special case where
1707                  * the daddr in the skb during the neighbor look-up is different
1708                  * from the fl6->daddr used to look-up route here.
1709                  */
1710
1711                 struct rt6_info *uncached_rt;
1712
1713                 if (ip6_hold_safe(net, &rt, true)) {
1714                         dst_use_noref(&rt->dst, jiffies);
1715                 } else {
1716                         rcu_read_unlock();
1717                         uncached_rt = rt;
1718                         goto uncached_rt_out;
1719                 }
1720                 rcu_read_unlock();
1721
1722                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1723                 dst_release(&rt->dst);
1724
1725                 if (uncached_rt) {
1726                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1727                          * No need for another dst_hold()
1728                          */
1729                         rt6_uncached_list_add(uncached_rt);
1730                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1731                 } else {
1732                         uncached_rt = net->ipv6.ip6_null_entry;
1733                         dst_hold(&uncached_rt->dst);
1734                 }
1735
1736 uncached_rt_out:
1737                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1738                 return uncached_rt;
1739
1740         } else {
1741                 /* Get a percpu copy */
1742
1743                 struct rt6_info *pcpu_rt;
1744
1745                 dst_use_noref(&rt->dst, jiffies);
1746                 local_bh_disable();
1747                 pcpu_rt = rt6_get_pcpu_route(rt);
1748
1749                 if (!pcpu_rt) {
1750                         /* atomic_inc_not_zero() is needed when using rcu */
1751                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1752                                 /* No dst_hold() on rt is needed because grabbing
1753                                  * rt->rt6i_ref makes sure rt can't be released.
1754                                  */
1755                                 pcpu_rt = rt6_make_pcpu_route(rt);
1756                                 rt6_release(rt);
1757                         } else {
1758                                 /* rt is already removed from tree */
1759                                 pcpu_rt = net->ipv6.ip6_null_entry;
1760                                 dst_hold(&pcpu_rt->dst);
1761                         }
1762                 }
1763                 local_bh_enable();
1764                 rcu_read_unlock();
1765                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1766                 return pcpu_rt;
1767         }
1768 }
1769 EXPORT_SYMBOL_GPL(ip6_pol_route);
1770
1771 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1772                                             struct flowi6 *fl6, int flags)
1773 {
1774         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1775 }
1776
1777 struct dst_entry *ip6_route_input_lookup(struct net *net,
1778                                          struct net_device *dev,
1779                                          struct flowi6 *fl6, int flags)
1780 {
1781         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1782                 flags |= RT6_LOOKUP_F_IFACE;
1783
1784         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1785 }
1786 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1787
1788 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1789                                   struct flow_keys *keys)
1790 {
1791         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1792         const struct ipv6hdr *key_iph = outer_iph;
1793         const struct ipv6hdr *inner_iph;
1794         const struct icmp6hdr *icmph;
1795         struct ipv6hdr _inner_iph;
1796
1797         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1798                 goto out;
1799
1800         icmph = icmp6_hdr(skb);
1801         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1802             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1803             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1804             icmph->icmp6_type != ICMPV6_PARAMPROB)
1805                 goto out;
1806
1807         inner_iph = skb_header_pointer(skb,
1808                                        skb_transport_offset(skb) + sizeof(*icmph),
1809                                        sizeof(_inner_iph), &_inner_iph);
1810         if (!inner_iph)
1811                 goto out;
1812
1813         key_iph = inner_iph;
1814 out:
1815         memset(keys, 0, sizeof(*keys));
1816         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1817         keys->addrs.v6addrs.src = key_iph->saddr;
1818         keys->addrs.v6addrs.dst = key_iph->daddr;
1819         keys->tags.flow_label = ip6_flowinfo(key_iph);
1820         keys->basic.ip_proto = key_iph->nexthdr;
1821 }
1822
1823 /* if skb is set it will be used and fl6 can be NULL */
1824 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1825 {
1826         struct flow_keys hash_keys;
1827
1828         if (skb) {
1829                 ip6_multipath_l3_keys(skb, &hash_keys);
1830                 return flow_hash_from_keys(&hash_keys) >> 1;
1831         }
1832
1833         return get_hash_from_flowi6(fl6) >> 1;
1834 }
1835
1836 void ip6_route_input(struct sk_buff *skb)
1837 {
1838         const struct ipv6hdr *iph = ipv6_hdr(skb);
1839         struct net *net = dev_net(skb->dev);
1840         int flags = RT6_LOOKUP_F_HAS_SADDR;
1841         struct ip_tunnel_info *tun_info;
1842         struct flowi6 fl6 = {
1843                 .flowi6_iif = skb->dev->ifindex,
1844                 .daddr = iph->daddr,
1845                 .saddr = iph->saddr,
1846                 .flowlabel = ip6_flowinfo(iph),
1847                 .flowi6_mark = skb->mark,
1848                 .flowi6_proto = iph->nexthdr,
1849         };
1850
1851         tun_info = skb_tunnel_info(skb);
1852         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1853                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1854         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1855                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1856         skb_dst_drop(skb);
1857         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1858 }
1859
1860 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1861                                              struct flowi6 *fl6, int flags)
1862 {
1863         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1864 }
1865
1866 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1867                                          struct flowi6 *fl6, int flags)
1868 {
1869         bool any_src;
1870
1871         if (rt6_need_strict(&fl6->daddr)) {
1872                 struct dst_entry *dst;
1873
1874                 dst = l3mdev_link_scope_lookup(net, fl6);
1875                 if (dst)
1876                         return dst;
1877         }
1878
1879         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1880
1881         any_src = ipv6_addr_any(&fl6->saddr);
1882         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1883             (fl6->flowi6_oif && any_src))
1884                 flags |= RT6_LOOKUP_F_IFACE;
1885
1886         if (!any_src)
1887                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1888         else if (sk)
1889                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1890
1891         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1892 }
1893 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1894
1895 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1896 {
1897         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1898         struct net_device *loopback_dev = net->loopback_dev;
1899         struct dst_entry *new = NULL;
1900
1901         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1902                        DST_OBSOLETE_DEAD, 0);
1903         if (rt) {
1904                 rt6_info_init(rt);
1905                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1906
1907                 new = &rt->dst;
1908                 new->__use = 1;
1909                 new->input = dst_discard;
1910                 new->output = dst_discard_out;
1911
1912                 dst_copy_metrics(new, &ort->dst);
1913
1914                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1915                 rt->rt6i_gateway = ort->rt6i_gateway;
1916                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1917                 rt->rt6i_metric = 0;
1918
1919                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1920 #ifdef CONFIG_IPV6_SUBTREES
1921                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1922 #endif
1923         }
1924
1925         dst_release(dst_orig);
1926         return new ? new : ERR_PTR(-ENOMEM);
1927 }
1928
1929 /*
1930  *      Destination cache support functions
1931  */
1932
1933 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1934 {
1935         if (rt->from &&
1936             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1937                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1938 }
1939
1940 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1941 {
1942         u32 rt_cookie = 0;
1943
1944         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1945                 return NULL;
1946
1947         if (rt6_check_expired(rt))
1948                 return NULL;
1949
1950         return &rt->dst;
1951 }
1952
1953 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1954 {
1955         if (!__rt6_check_expired(rt) &&
1956             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1957             rt6_check(rt->from, cookie))
1958                 return &rt->dst;
1959         else
1960                 return NULL;
1961 }
1962
1963 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1964 {
1965         struct rt6_info *rt;
1966
1967         rt = (struct rt6_info *) dst;
1968
1969         /* All IPV6 dsts are created with ->obsolete set to the value
1970          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1971          * into this function always.
1972          */
1973
1974         rt6_dst_from_metrics_check(rt);
1975
1976         if (rt->rt6i_flags & RTF_PCPU ||
1977             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
1978                 return rt6_dst_from_check(rt, cookie);
1979         else
1980                 return rt6_check(rt, cookie);
1981 }
1982
1983 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1984 {
1985         struct rt6_info *rt = (struct rt6_info *) dst;
1986
1987         if (rt) {
1988                 if (rt->rt6i_flags & RTF_CACHE) {
1989                         if (rt6_check_expired(rt)) {
1990                                 ip6_del_rt(rt);
1991                                 dst = NULL;
1992                         }
1993                 } else {
1994                         dst_release(dst);
1995                         dst = NULL;
1996                 }
1997         }
1998         return dst;
1999 }
2000
2001 static void ip6_link_failure(struct sk_buff *skb)
2002 {
2003         struct rt6_info *rt;
2004
2005         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2006
2007         rt = (struct rt6_info *) skb_dst(skb);
2008         if (rt) {
2009                 if (rt->rt6i_flags & RTF_CACHE) {
2010                         if (dst_hold_safe(&rt->dst))
2011                                 ip6_del_rt(rt);
2012                 } else {
2013                         struct fib6_node *fn;
2014
2015                         rcu_read_lock();
2016                         fn = rcu_dereference(rt->rt6i_node);
2017                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2018                                 fn->fn_sernum = -1;
2019                         rcu_read_unlock();
2020                 }
2021         }
2022 }
2023
2024 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2025 {
2026         struct net *net = dev_net(rt->dst.dev);
2027
2028         rt->rt6i_flags |= RTF_MODIFIED;
2029         rt->rt6i_pmtu = mtu;
2030         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2031 }
2032
2033 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2034 {
2035         return !(rt->rt6i_flags & RTF_CACHE) &&
2036                 (rt->rt6i_flags & RTF_PCPU ||
2037                  rcu_access_pointer(rt->rt6i_node));
2038 }
2039
2040 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2041                                  const struct ipv6hdr *iph, u32 mtu)
2042 {
2043         const struct in6_addr *daddr, *saddr;
2044         struct rt6_info *rt6 = (struct rt6_info *)dst;
2045
2046         if (rt6->rt6i_flags & RTF_LOCAL)
2047                 return;
2048
2049         if (dst_metric_locked(dst, RTAX_MTU))
2050                 return;
2051
2052         if (iph) {
2053                 daddr = &iph->daddr;
2054                 saddr = &iph->saddr;
2055         } else if (sk) {
2056                 daddr = &sk->sk_v6_daddr;
2057                 saddr = &inet6_sk(sk)->saddr;
2058         } else {
2059                 daddr = NULL;
2060                 saddr = NULL;
2061         }
2062         dst_confirm_neigh(dst, daddr);
2063         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2064         if (mtu >= dst_mtu(dst))
2065                 return;
2066
2067         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2068                 rt6_do_update_pmtu(rt6, mtu);
2069                 /* update rt6_ex->stamp for cache */
2070                 if (rt6->rt6i_flags & RTF_CACHE)
2071                         rt6_update_exception_stamp_rt(rt6);
2072         } else if (daddr) {
2073                 struct rt6_info *nrt6;
2074
2075                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2076                 if (nrt6) {
2077                         rt6_do_update_pmtu(nrt6, mtu);
2078                         if (rt6_insert_exception(nrt6, rt6))
2079                                 dst_release_immediate(&nrt6->dst);
2080                 }
2081         }
2082 }
2083
2084 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2085                                struct sk_buff *skb, u32 mtu)
2086 {
2087         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2088 }
2089
2090 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2091                      int oif, u32 mark, kuid_t uid)
2092 {
2093         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2094         struct dst_entry *dst;
2095         struct flowi6 fl6;
2096
2097         memset(&fl6, 0, sizeof(fl6));
2098         fl6.flowi6_oif = oif;
2099         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2100         fl6.daddr = iph->daddr;
2101         fl6.saddr = iph->saddr;
2102         fl6.flowlabel = ip6_flowinfo(iph);
2103         fl6.flowi6_uid = uid;
2104
2105         dst = ip6_route_output(net, NULL, &fl6);
2106         if (!dst->error)
2107                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2108         dst_release(dst);
2109 }
2110 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2111
2112 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2113 {
2114         struct dst_entry *dst;
2115
2116         ip6_update_pmtu(skb, sock_net(sk), mtu,
2117                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2118
2119         dst = __sk_dst_get(sk);
2120         if (!dst || !dst->obsolete ||
2121             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2122                 return;
2123
2124         bh_lock_sock(sk);
2125         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2126                 ip6_datagram_dst_update(sk, false);
2127         bh_unlock_sock(sk);
2128 }
2129 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2130
2131 /* Handle redirects */
2132 struct ip6rd_flowi {
2133         struct flowi6 fl6;
2134         struct in6_addr gateway;
2135 };
2136
2137 static struct rt6_info *__ip6_route_redirect(struct net *net,
2138                                              struct fib6_table *table,
2139                                              struct flowi6 *fl6,
2140                                              int flags)
2141 {
2142         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2143         struct rt6_info *rt, *rt_cache;
2144         struct fib6_node *fn;
2145
2146         /* Get the "current" route for this destination and
2147          * check if the redirect has come from appropriate router.
2148          *
2149          * RFC 4861 specifies that redirects should only be
2150          * accepted if they come from the nexthop to the target.
2151          * Due to the way the routes are chosen, this notion
2152          * is a bit fuzzy and one might need to check all possible
2153          * routes.
2154          */
2155
2156         rcu_read_lock();
2157         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2158 restart:
2159         for_each_fib6_node_rt_rcu(fn) {
2160                 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2161                         continue;
2162                 if (rt6_check_expired(rt))
2163                         continue;
2164                 if (rt->dst.error)
2165                         break;
2166                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2167                         continue;
2168                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2169                         continue;
2170                 /* rt_cache's gateway might be different from its 'parent'
2171                  * in the case of an ip redirect.
2172                  * So we keep searching in the exception table if the gateway
2173                  * is different.
2174                  */
2175                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2176                         rt_cache = rt6_find_cached_rt(rt,
2177                                                       &fl6->daddr,
2178                                                       &fl6->saddr);
2179                         if (rt_cache &&
2180                             ipv6_addr_equal(&rdfl->gateway,
2181                                             &rt_cache->rt6i_gateway)) {
2182                                 rt = rt_cache;
2183                                 break;
2184                         }
2185                         continue;
2186                 }
2187                 break;
2188         }
2189
2190         if (!rt)
2191                 rt = net->ipv6.ip6_null_entry;
2192         else if (rt->dst.error) {
2193                 rt = net->ipv6.ip6_null_entry;
2194                 goto out;
2195         }
2196
2197         if (rt == net->ipv6.ip6_null_entry) {
2198                 fn = fib6_backtrack(fn, &fl6->saddr);
2199                 if (fn)
2200                         goto restart;
2201         }
2202
2203 out:
2204         ip6_hold_safe(net, &rt, true);
2205
2206         rcu_read_unlock();
2207
2208         trace_fib6_table_lookup(net, rt, table, fl6);
2209         return rt;
2210 };
2211
2212 static struct dst_entry *ip6_route_redirect(struct net *net,
2213                                         const struct flowi6 *fl6,
2214                                         const struct in6_addr *gateway)
2215 {
2216         int flags = RT6_LOOKUP_F_HAS_SADDR;
2217         struct ip6rd_flowi rdfl;
2218
2219         rdfl.fl6 = *fl6;
2220         rdfl.gateway = *gateway;
2221
2222         return fib6_rule_lookup(net, &rdfl.fl6,
2223                                 flags, __ip6_route_redirect);
2224 }
2225
2226 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2227                   kuid_t uid)
2228 {
2229         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2230         struct dst_entry *dst;
2231         struct flowi6 fl6;
2232
2233         memset(&fl6, 0, sizeof(fl6));
2234         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2235         fl6.flowi6_oif = oif;
2236         fl6.flowi6_mark = mark;
2237         fl6.daddr = iph->daddr;
2238         fl6.saddr = iph->saddr;
2239         fl6.flowlabel = ip6_flowinfo(iph);
2240         fl6.flowi6_uid = uid;
2241
2242         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2243         rt6_do_redirect(dst, NULL, skb);
2244         dst_release(dst);
2245 }
2246 EXPORT_SYMBOL_GPL(ip6_redirect);
2247
2248 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2249                             u32 mark)
2250 {
2251         const struct ipv6hdr *iph = ipv6_hdr(skb);
2252         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2253         struct dst_entry *dst;
2254         struct flowi6 fl6;
2255
2256         memset(&fl6, 0, sizeof(fl6));
2257         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2258         fl6.flowi6_oif = oif;
2259         fl6.flowi6_mark = mark;
2260         fl6.daddr = msg->dest;
2261         fl6.saddr = iph->daddr;
2262         fl6.flowi6_uid = sock_net_uid(net, NULL);
2263
2264         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2265         rt6_do_redirect(dst, NULL, skb);
2266         dst_release(dst);
2267 }
2268
2269 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2270 {
2271         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2272                      sk->sk_uid);
2273 }
2274 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2275
2276 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2277 {
2278         struct net_device *dev = dst->dev;
2279         unsigned int mtu = dst_mtu(dst);
2280         struct net *net = dev_net(dev);
2281
2282         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2283
2284         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2285                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2286
2287         /*
2288          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2289          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2290          * IPV6_MAXPLEN is also valid and means: "any MSS,
2291          * rely only on pmtu discovery"
2292          */
2293         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2294                 mtu = IPV6_MAXPLEN;
2295         return mtu;
2296 }
2297
2298 static unsigned int ip6_mtu(const struct dst_entry *dst)
2299 {
2300         const struct rt6_info *rt = (const struct rt6_info *)dst;
2301         unsigned int mtu = rt->rt6i_pmtu;
2302         struct inet6_dev *idev;
2303
2304         if (mtu)
2305                 goto out;
2306
2307         mtu = dst_metric_raw(dst, RTAX_MTU);
2308         if (mtu)
2309                 goto out;
2310
2311         mtu = IPV6_MIN_MTU;
2312
2313         rcu_read_lock();
2314         idev = __in6_dev_get(dst->dev);
2315         if (idev)
2316                 mtu = idev->cnf.mtu6;
2317         rcu_read_unlock();
2318
2319 out:
2320         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2321
2322         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2323 }
2324
2325 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2326                                   struct flowi6 *fl6)
2327 {
2328         struct dst_entry *dst;
2329         struct rt6_info *rt;
2330         struct inet6_dev *idev = in6_dev_get(dev);
2331         struct net *net = dev_net(dev);
2332
2333         if (unlikely(!idev))
2334                 return ERR_PTR(-ENODEV);
2335
2336         rt = ip6_dst_alloc(net, dev, 0);
2337         if (unlikely(!rt)) {
2338                 in6_dev_put(idev);
2339                 dst = ERR_PTR(-ENOMEM);
2340                 goto out;
2341         }
2342
2343         rt->dst.flags |= DST_HOST;
2344         rt->dst.input = ip6_input;
2345         rt->dst.output  = ip6_output;
2346         rt->rt6i_gateway  = fl6->daddr;
2347         rt->rt6i_dst.addr = fl6->daddr;
2348         rt->rt6i_dst.plen = 128;
2349         rt->rt6i_idev     = idev;
2350         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2351
2352         /* Add this dst into uncached_list so that rt6_disable_ip() can
2353          * do proper release of the net_device
2354          */
2355         rt6_uncached_list_add(rt);
2356         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2357
2358         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2359
2360 out:
2361         return dst;
2362 }
2363
2364 static int ip6_dst_gc(struct dst_ops *ops)
2365 {
2366         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2367         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2368         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2369         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2370         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2371         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2372         int entries;
2373
2374         entries = dst_entries_get_fast(ops);
2375         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2376             entries <= rt_max_size)
2377                 goto out;
2378
2379         net->ipv6.ip6_rt_gc_expire++;
2380         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2381         entries = dst_entries_get_slow(ops);
2382         if (entries < ops->gc_thresh)
2383                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2384 out:
2385         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2386         return entries > rt_max_size;
2387 }
2388
2389 static int ip6_convert_metrics(struct mx6_config *mxc,
2390                                const struct fib6_config *cfg)
2391 {
2392         struct net *net = cfg->fc_nlinfo.nl_net;
2393         bool ecn_ca = false;
2394         struct nlattr *nla;
2395         int remaining;
2396         u32 *mp;
2397
2398         if (!cfg->fc_mx)
2399                 return 0;
2400
2401         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2402         if (unlikely(!mp))
2403                 return -ENOMEM;
2404
2405         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2406                 int type = nla_type(nla);
2407                 u32 val;
2408
2409                 if (!type)
2410                         continue;
2411                 if (unlikely(type > RTAX_MAX))
2412                         goto err;
2413
2414                 if (type == RTAX_CC_ALGO) {
2415                         char tmp[TCP_CA_NAME_MAX];
2416
2417                         nla_strlcpy(tmp, nla, sizeof(tmp));
2418                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2419                         if (val == TCP_CA_UNSPEC)
2420                                 goto err;
2421                 } else {
2422                         val = nla_get_u32(nla);
2423                 }
2424                 if (type == RTAX_HOPLIMIT && val > 255)
2425                         val = 255;
2426                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2427                         goto err;
2428
2429                 mp[type - 1] = val;
2430                 __set_bit(type - 1, mxc->mx_valid);
2431         }
2432
2433         if (ecn_ca) {
2434                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2435                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2436         }
2437
2438         mxc->mx = mp;
2439         return 0;
2440  err:
2441         kfree(mp);
2442         return -EINVAL;
2443 }
2444
2445 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2446                                             struct fib6_config *cfg,
2447                                             const struct in6_addr *gw_addr,
2448                                             u32 tbid, int flags)
2449 {
2450         struct flowi6 fl6 = {
2451                 .flowi6_oif = cfg->fc_ifindex,
2452                 .daddr = *gw_addr,
2453                 .saddr = cfg->fc_prefsrc,
2454         };
2455         struct fib6_table *table;
2456         struct rt6_info *rt;
2457
2458         table = fib6_get_table(net, tbid);
2459         if (!table)
2460                 return NULL;
2461
2462         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2463                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2464
2465         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2466         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2467
2468         /* if table lookup failed, fall back to full lookup */
2469         if (rt == net->ipv6.ip6_null_entry) {
2470                 ip6_rt_put(rt);
2471                 rt = NULL;
2472         }
2473
2474         return rt;
2475 }
2476
2477 static int ip6_route_check_nh_onlink(struct net *net,
2478                                      struct fib6_config *cfg,
2479                                      struct net_device *dev,
2480                                      struct netlink_ext_ack *extack)
2481 {
2482         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2483         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2484         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2485         struct rt6_info *grt;
2486         int err;
2487
2488         err = 0;
2489         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2490         if (grt) {
2491                 if (!grt->dst.error &&
2492                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2493                         NL_SET_ERR_MSG(extack,
2494                                        "Nexthop has invalid gateway or device mismatch");
2495                         err = -EINVAL;
2496                 }
2497
2498                 ip6_rt_put(grt);
2499         }
2500
2501         return err;
2502 }
2503
2504 static int ip6_route_check_nh(struct net *net,
2505                               struct fib6_config *cfg,
2506                               struct net_device **_dev,
2507                               struct inet6_dev **idev)
2508 {
2509         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2510         struct net_device *dev = _dev ? *_dev : NULL;
2511         struct rt6_info *grt = NULL;
2512         int err = -EHOSTUNREACH;
2513
2514         if (cfg->fc_table) {
2515                 int flags = RT6_LOOKUP_F_IFACE;
2516
2517                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2518                                           cfg->fc_table, flags);
2519                 if (grt) {
2520                         if (grt->rt6i_flags & RTF_GATEWAY ||
2521                             (dev && dev != grt->dst.dev)) {
2522                                 ip6_rt_put(grt);
2523                                 grt = NULL;
2524                         }
2525                 }
2526         }
2527
2528         if (!grt)
2529                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
2530
2531         if (!grt)
2532                 goto out;
2533
2534         if (dev) {
2535                 if (dev != grt->dst.dev) {
2536                         ip6_rt_put(grt);
2537                         goto out;
2538                 }
2539         } else {
2540                 *_dev = dev = grt->dst.dev;
2541                 *idev = grt->rt6i_idev;
2542                 dev_hold(dev);
2543                 in6_dev_hold(grt->rt6i_idev);
2544         }
2545
2546         if (!(grt->rt6i_flags & RTF_GATEWAY))
2547                 err = 0;
2548
2549         ip6_rt_put(grt);
2550
2551 out:
2552         return err;
2553 }
2554
2555 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2556                                               struct netlink_ext_ack *extack)
2557 {
2558         struct net *net = cfg->fc_nlinfo.nl_net;
2559         struct rt6_info *rt = NULL;
2560         struct net_device *dev = NULL;
2561         struct inet6_dev *idev = NULL;
2562         struct fib6_table *table;
2563         int addr_type;
2564         int err = -EINVAL;
2565
2566         /* RTF_PCPU is an internal flag; can not be set by userspace */
2567         if (cfg->fc_flags & RTF_PCPU) {
2568                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2569                 goto out;
2570         }
2571
2572         /* RTF_CACHE is an internal flag; can not be set by userspace */
2573         if (cfg->fc_flags & RTF_CACHE) {
2574                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2575                 goto out;
2576         }
2577
2578         if (cfg->fc_dst_len > 128) {
2579                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2580                 goto out;
2581         }
2582         if (cfg->fc_src_len > 128) {
2583                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2584                 goto out;
2585         }
2586 #ifndef CONFIG_IPV6_SUBTREES
2587         if (cfg->fc_src_len) {
2588                 NL_SET_ERR_MSG(extack,
2589                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2590                 goto out;
2591         }
2592 #endif
2593         if (cfg->fc_ifindex) {
2594                 err = -ENODEV;
2595                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2596                 if (!dev)
2597                         goto out;
2598                 idev = in6_dev_get(dev);
2599                 if (!idev)
2600                         goto out;
2601         }
2602
2603         if (cfg->fc_metric == 0)
2604                 cfg->fc_metric = IP6_RT_PRIO_USER;
2605
2606         if (cfg->fc_flags & RTNH_F_ONLINK) {
2607                 if (!dev) {
2608                         NL_SET_ERR_MSG(extack,
2609                                        "Nexthop device required for onlink");
2610                         err = -ENODEV;
2611                         goto out;
2612                 }
2613
2614                 if (!(dev->flags & IFF_UP)) {
2615                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2616                         err = -ENETDOWN;
2617                         goto out;
2618                 }
2619         }
2620
2621         err = -ENOBUFS;
2622         if (cfg->fc_nlinfo.nlh &&
2623             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2624                 table = fib6_get_table(net, cfg->fc_table);
2625                 if (!table) {
2626                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2627                         table = fib6_new_table(net, cfg->fc_table);
2628                 }
2629         } else {
2630                 table = fib6_new_table(net, cfg->fc_table);
2631         }
2632
2633         if (!table)
2634                 goto out;
2635
2636         rt = ip6_dst_alloc(net, NULL,
2637                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2638
2639         if (!rt) {
2640                 err = -ENOMEM;
2641                 goto out;
2642         }
2643
2644         if (cfg->fc_flags & RTF_EXPIRES)
2645                 rt6_set_expires(rt, jiffies +
2646                                 clock_t_to_jiffies(cfg->fc_expires));
2647         else
2648                 rt6_clean_expires(rt);
2649
2650         if (cfg->fc_protocol == RTPROT_UNSPEC)
2651                 cfg->fc_protocol = RTPROT_BOOT;
2652         rt->rt6i_protocol = cfg->fc_protocol;
2653
2654         addr_type = ipv6_addr_type(&cfg->fc_dst);
2655
2656         if (addr_type & IPV6_ADDR_MULTICAST)
2657                 rt->dst.input = ip6_mc_input;
2658         else if (cfg->fc_flags & RTF_LOCAL)
2659                 rt->dst.input = ip6_input;
2660         else
2661                 rt->dst.input = ip6_forward;
2662
2663         rt->dst.output = ip6_output;
2664
2665         if (cfg->fc_encap) {
2666                 struct lwtunnel_state *lwtstate;
2667
2668                 err = lwtunnel_build_state(cfg->fc_encap_type,
2669                                            cfg->fc_encap, AF_INET6, cfg,
2670                                            &lwtstate, extack);
2671                 if (err)
2672                         goto out;
2673                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2674                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2675                         rt->dst.lwtstate->orig_output = rt->dst.output;
2676                         rt->dst.output = lwtunnel_output;
2677                 }
2678                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2679                         rt->dst.lwtstate->orig_input = rt->dst.input;
2680                         rt->dst.input = lwtunnel_input;
2681                 }
2682         }
2683
2684         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2685         rt->rt6i_dst.plen = cfg->fc_dst_len;
2686         if (rt->rt6i_dst.plen == 128)
2687                 rt->dst.flags |= DST_HOST;
2688
2689 #ifdef CONFIG_IPV6_SUBTREES
2690         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2691         rt->rt6i_src.plen = cfg->fc_src_len;
2692 #endif
2693
2694         rt->rt6i_metric = cfg->fc_metric;
2695         rt->rt6i_nh_weight = 1;
2696
2697         /* We cannot add true routes via loopback here,
2698            they would result in kernel looping; promote them to reject routes
2699          */
2700         if ((cfg->fc_flags & RTF_REJECT) ||
2701             (dev && (dev->flags & IFF_LOOPBACK) &&
2702              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2703              !(cfg->fc_flags & RTF_LOCAL))) {
2704                 /* hold loopback dev/idev if we haven't done so. */
2705                 if (dev != net->loopback_dev) {
2706                         if (dev) {
2707                                 dev_put(dev);
2708                                 in6_dev_put(idev);
2709                         }
2710                         dev = net->loopback_dev;
2711                         dev_hold(dev);
2712                         idev = in6_dev_get(dev);
2713                         if (!idev) {
2714                                 err = -ENODEV;
2715                                 goto out;
2716                         }
2717                 }
2718                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2719                 switch (cfg->fc_type) {
2720                 case RTN_BLACKHOLE:
2721                         rt->dst.error = -EINVAL;
2722                         rt->dst.output = dst_discard_out;
2723                         rt->dst.input = dst_discard;
2724                         break;
2725                 case RTN_PROHIBIT:
2726                         rt->dst.error = -EACCES;
2727                         rt->dst.output = ip6_pkt_prohibit_out;
2728                         rt->dst.input = ip6_pkt_prohibit;
2729                         break;
2730                 case RTN_THROW:
2731                 case RTN_UNREACHABLE:
2732                 default:
2733                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2734                                         : (cfg->fc_type == RTN_UNREACHABLE)
2735                                         ? -EHOSTUNREACH : -ENETUNREACH;
2736                         rt->dst.output = ip6_pkt_discard_out;
2737                         rt->dst.input = ip6_pkt_discard;
2738                         break;
2739                 }
2740                 goto install_route;
2741         }
2742
2743         if (cfg->fc_flags & RTF_GATEWAY) {
2744                 const struct in6_addr *gw_addr;
2745                 int gwa_type;
2746
2747                 gw_addr = &cfg->fc_gateway;
2748                 gwa_type = ipv6_addr_type(gw_addr);
2749
2750                 /* if gw_addr is local we will fail to detect this in case
2751                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2752                  * will return already-added prefix route via interface that
2753                  * prefix route was assigned to, which might be non-loopback.
2754                  */
2755                 err = -EINVAL;
2756                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2757                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2758                                             dev : NULL, 0, 0)) {
2759                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2760                         goto out;
2761                 }
2762                 rt->rt6i_gateway = *gw_addr;
2763
2764                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2765                         /* IPv6 strictly inhibits using not link-local
2766                            addresses as nexthop address.
2767                            Otherwise, router will not able to send redirects.
2768                            It is very good, but in some (rare!) circumstances
2769                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2770                            some exceptions. --ANK
2771                            We allow IPv4-mapped nexthops to support RFC4798-type
2772                            addressing
2773                          */
2774                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2775                                           IPV6_ADDR_MAPPED))) {
2776                                 NL_SET_ERR_MSG(extack,
2777                                                "Invalid gateway address");
2778                                 goto out;
2779                         }
2780
2781                         if (cfg->fc_flags & RTNH_F_ONLINK) {
2782                                 err = ip6_route_check_nh_onlink(net, cfg, dev,
2783                                                                 extack);
2784                         } else {
2785                                 err = ip6_route_check_nh(net, cfg, &dev, &idev);
2786                         }
2787                         if (err)
2788                                 goto out;
2789                 }
2790                 err = -EINVAL;
2791                 if (!dev) {
2792                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2793                         goto out;
2794                 } else if (dev->flags & IFF_LOOPBACK) {
2795                         NL_SET_ERR_MSG(extack,
2796                                        "Egress device can not be loopback device for this route");
2797                         goto out;
2798                 }
2799         }
2800
2801         err = -ENODEV;
2802         if (!dev)
2803                 goto out;
2804
2805         if (!(dev->flags & IFF_UP)) {
2806                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2807                 err = -ENETDOWN;
2808                 goto out;
2809         }
2810
2811         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2812                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2813                         NL_SET_ERR_MSG(extack, "Invalid source address");
2814                         err = -EINVAL;
2815                         goto out;
2816                 }
2817                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2818                 rt->rt6i_prefsrc.plen = 128;
2819         } else
2820                 rt->rt6i_prefsrc.plen = 0;
2821
2822         rt->rt6i_flags = cfg->fc_flags;
2823
2824 install_route:
2825         if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2826             !netif_carrier_ok(dev))
2827                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2828         rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2829         rt->dst.dev = dev;
2830         rt->rt6i_idev = idev;
2831         rt->rt6i_table = table;
2832
2833         cfg->fc_nlinfo.nl_net = dev_net(dev);
2834
2835         return rt;
2836 out:
2837         if (dev)
2838                 dev_put(dev);
2839         if (idev)
2840                 in6_dev_put(idev);
2841         if (rt)
2842                 dst_release_immediate(&rt->dst);
2843
2844         return ERR_PTR(err);
2845 }
2846
2847 int ip6_route_add(struct fib6_config *cfg,
2848                   struct netlink_ext_ack *extack)
2849 {
2850         struct mx6_config mxc = { .mx = NULL, };
2851         struct rt6_info *rt;
2852         int err;
2853
2854         rt = ip6_route_info_create(cfg, extack);
2855         if (IS_ERR(rt)) {
2856                 err = PTR_ERR(rt);
2857                 rt = NULL;
2858                 goto out;
2859         }
2860
2861         err = ip6_convert_metrics(&mxc, cfg);
2862         if (err)
2863                 goto out;
2864
2865         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2866
2867         kfree(mxc.mx);
2868
2869         return err;
2870 out:
2871         if (rt)
2872                 dst_release_immediate(&rt->dst);
2873
2874         return err;
2875 }
2876
2877 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2878 {
2879         int err;
2880         struct fib6_table *table;
2881         struct net *net = dev_net(rt->dst.dev);
2882
2883         if (rt == net->ipv6.ip6_null_entry) {
2884                 err = -ENOENT;
2885                 goto out;
2886         }
2887
2888         table = rt->rt6i_table;
2889         spin_lock_bh(&table->tb6_lock);
2890         err = fib6_del(rt, info);
2891         spin_unlock_bh(&table->tb6_lock);
2892
2893 out:
2894         ip6_rt_put(rt);
2895         return err;
2896 }
2897
2898 int ip6_del_rt(struct rt6_info *rt)
2899 {
2900         struct nl_info info = {
2901                 .nl_net = dev_net(rt->dst.dev),
2902         };
2903         return __ip6_del_rt(rt, &info);
2904 }
2905
2906 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2907 {
2908         struct nl_info *info = &cfg->fc_nlinfo;
2909         struct net *net = info->nl_net;
2910         struct sk_buff *skb = NULL;
2911         struct fib6_table *table;
2912         int err = -ENOENT;
2913
2914         if (rt == net->ipv6.ip6_null_entry)
2915                 goto out_put;
2916         table = rt->rt6i_table;
2917         spin_lock_bh(&table->tb6_lock);
2918
2919         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2920                 struct rt6_info *sibling, *next_sibling;
2921
2922                 /* prefer to send a single notification with all hops */
2923                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2924                 if (skb) {
2925                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2926
2927                         if (rt6_fill_node(net, skb, rt,
2928                                           NULL, NULL, 0, RTM_DELROUTE,
2929                                           info->portid, seq, 0) < 0) {
2930                                 kfree_skb(skb);
2931                                 skb = NULL;
2932                         } else
2933                                 info->skip_notify = 1;
2934                 }
2935
2936                 list_for_each_entry_safe(sibling, next_sibling,
2937                                          &rt->rt6i_siblings,
2938                                          rt6i_siblings) {
2939                         err = fib6_del(sibling, info);
2940                         if (err)
2941                                 goto out_unlock;
2942                 }
2943         }
2944
2945         err = fib6_del(rt, info);
2946 out_unlock:
2947         spin_unlock_bh(&table->tb6_lock);
2948 out_put:
2949         ip6_rt_put(rt);
2950
2951         if (skb) {
2952                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2953                             info->nlh, gfp_any());
2954         }
2955         return err;
2956 }
2957
2958 static int ip6_route_del(struct fib6_config *cfg,
2959                          struct netlink_ext_ack *extack)
2960 {
2961         struct rt6_info *rt, *rt_cache;
2962         struct fib6_table *table;
2963         struct fib6_node *fn;
2964         int err = -ESRCH;
2965
2966         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2967         if (!table) {
2968                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2969                 return err;
2970         }
2971
2972         rcu_read_lock();
2973
2974         fn = fib6_locate(&table->tb6_root,
2975                          &cfg->fc_dst, cfg->fc_dst_len,
2976                          &cfg->fc_src, cfg->fc_src_len,
2977                          !(cfg->fc_flags & RTF_CACHE));
2978
2979         if (fn) {
2980                 for_each_fib6_node_rt_rcu(fn) {
2981                         if (cfg->fc_flags & RTF_CACHE) {
2982                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2983                                                               &cfg->fc_src);
2984                                 if (!rt_cache)
2985                                         continue;
2986                                 rt = rt_cache;
2987                         }
2988                         if (cfg->fc_ifindex &&
2989                             (!rt->dst.dev ||
2990                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2991                                 continue;
2992                         if (cfg->fc_flags & RTF_GATEWAY &&
2993                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2994                                 continue;
2995                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2996                                 continue;
2997                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2998                                 continue;
2999                         if (!dst_hold_safe(&rt->dst))
3000                                 break;
3001                         rcu_read_unlock();
3002
3003                         /* if gateway was specified only delete the one hop */
3004                         if (cfg->fc_flags & RTF_GATEWAY)
3005                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3006
3007                         return __ip6_del_rt_siblings(rt, cfg);
3008                 }
3009         }
3010         rcu_read_unlock();
3011
3012         return err;
3013 }
3014
3015 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3016 {
3017         struct netevent_redirect netevent;
3018         struct rt6_info *rt, *nrt = NULL;
3019         struct ndisc_options ndopts;
3020         struct inet6_dev *in6_dev;
3021         struct neighbour *neigh;
3022         struct rd_msg *msg;
3023         int optlen, on_link;
3024         u8 *lladdr;
3025
3026         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3027         optlen -= sizeof(*msg);
3028
3029         if (optlen < 0) {
3030                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3031                 return;
3032         }
3033
3034         msg = (struct rd_msg *)icmp6_hdr(skb);
3035
3036         if (ipv6_addr_is_multicast(&msg->dest)) {
3037                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3038                 return;
3039         }
3040
3041         on_link = 0;
3042         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3043                 on_link = 1;
3044         } else if (ipv6_addr_type(&msg->target) !=
3045                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3046                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3047                 return;
3048         }
3049
3050         in6_dev = __in6_dev_get(skb->dev);
3051         if (!in6_dev)
3052                 return;
3053         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3054                 return;
3055
3056         /* RFC2461 8.1:
3057          *      The IP source address of the Redirect MUST be the same as the current
3058          *      first-hop router for the specified ICMP Destination Address.
3059          */
3060
3061         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3062                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3063                 return;
3064         }
3065
3066         lladdr = NULL;
3067         if (ndopts.nd_opts_tgt_lladdr) {
3068                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3069                                              skb->dev);
3070                 if (!lladdr) {
3071                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3072                         return;
3073                 }
3074         }
3075
3076         rt = (struct rt6_info *) dst;
3077         if (rt->rt6i_flags & RTF_REJECT) {
3078                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3079                 return;
3080         }
3081
3082         /* Redirect received -> path was valid.
3083          * Look, redirects are sent only in response to data packets,
3084          * so that this nexthop apparently is reachable. --ANK
3085          */
3086         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3087
3088         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3089         if (!neigh)
3090                 return;
3091
3092         /*
3093          *      We have finally decided to accept it.
3094          */
3095
3096         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3097                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3098                      NEIGH_UPDATE_F_OVERRIDE|
3099                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3100                                      NEIGH_UPDATE_F_ISROUTER)),
3101                      NDISC_REDIRECT, &ndopts);
3102
3103         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3104         if (!nrt)
3105                 goto out;
3106
3107         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3108         if (on_link)
3109                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3110
3111         nrt->rt6i_protocol = RTPROT_REDIRECT;
3112         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3113
3114         /* No need to remove rt from the exception table if rt is
3115          * a cached route because rt6_insert_exception() will
3116          * takes care of it
3117          */
3118         if (rt6_insert_exception(nrt, rt)) {
3119                 dst_release_immediate(&nrt->dst);
3120                 goto out;
3121         }
3122
3123         netevent.old = &rt->dst;
3124         netevent.new = &nrt->dst;
3125         netevent.daddr = &msg->dest;
3126         netevent.neigh = neigh;
3127         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3128
3129 out:
3130         neigh_release(neigh);
3131 }
3132
3133 /*
3134  *      Misc support functions
3135  */
3136
3137 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3138 {
3139         BUG_ON(from->from);
3140
3141         rt->rt6i_flags &= ~RTF_EXPIRES;
3142         dst_hold(&from->dst);
3143         rt->from = from;
3144         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3145 }
3146
3147 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3148 {
3149         rt->dst.input = ort->dst.input;
3150         rt->dst.output = ort->dst.output;
3151         rt->rt6i_dst = ort->rt6i_dst;
3152         rt->dst.error = ort->dst.error;
3153         rt->rt6i_idev = ort->rt6i_idev;
3154         if (rt->rt6i_idev)
3155                 in6_dev_hold(rt->rt6i_idev);
3156         rt->dst.lastuse = jiffies;
3157         rt->rt6i_gateway = ort->rt6i_gateway;
3158         rt->rt6i_flags = ort->rt6i_flags;
3159         rt6_set_from(rt, ort);
3160         rt->rt6i_metric = ort->rt6i_metric;
3161 #ifdef CONFIG_IPV6_SUBTREES
3162         rt->rt6i_src = ort->rt6i_src;
3163 #endif
3164         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3165         rt->rt6i_table = ort->rt6i_table;
3166         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3167 }
3168
3169 #ifdef CONFIG_IPV6_ROUTE_INFO
3170 static struct rt6_info *rt6_get_route_info(struct net *net,
3171                                            const struct in6_addr *prefix, int prefixlen,
3172                                            const struct in6_addr *gwaddr,
3173                                            struct net_device *dev)
3174 {
3175         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3176         int ifindex = dev->ifindex;
3177         struct fib6_node *fn;
3178         struct rt6_info *rt = NULL;
3179         struct fib6_table *table;
3180
3181         table = fib6_get_table(net, tb_id);
3182         if (!table)
3183                 return NULL;
3184
3185         rcu_read_lock();
3186         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3187         if (!fn)
3188                 goto out;
3189
3190         for_each_fib6_node_rt_rcu(fn) {
3191                 if (rt->dst.dev->ifindex != ifindex)
3192                         continue;
3193                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3194                         continue;
3195                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3196                         continue;
3197                 ip6_hold_safe(NULL, &rt, false);
3198                 break;
3199         }
3200 out:
3201         rcu_read_unlock();
3202         return rt;
3203 }
3204
3205 static struct rt6_info *rt6_add_route_info(struct net *net,
3206                                            const struct in6_addr *prefix, int prefixlen,
3207                                            const struct in6_addr *gwaddr,
3208                                            struct net_device *dev,
3209                                            unsigned int pref)
3210 {
3211         struct fib6_config cfg = {
3212                 .fc_metric      = IP6_RT_PRIO_USER,
3213                 .fc_ifindex     = dev->ifindex,
3214                 .fc_dst_len     = prefixlen,
3215                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3216                                   RTF_UP | RTF_PREF(pref),
3217                 .fc_protocol = RTPROT_RA,
3218                 .fc_nlinfo.portid = 0,
3219                 .fc_nlinfo.nlh = NULL,
3220                 .fc_nlinfo.nl_net = net,
3221         };
3222
3223         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3224         cfg.fc_dst = *prefix;
3225         cfg.fc_gateway = *gwaddr;
3226
3227         /* We should treat it as a default route if prefix length is 0. */
3228         if (!prefixlen)
3229                 cfg.fc_flags |= RTF_DEFAULT;
3230
3231         ip6_route_add(&cfg, NULL);
3232
3233         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3234 }
3235 #endif
3236
3237 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3238 {
3239         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3240         struct rt6_info *rt;
3241         struct fib6_table *table;
3242
3243         table = fib6_get_table(dev_net(dev), tb_id);
3244         if (!table)
3245                 return NULL;
3246
3247         rcu_read_lock();
3248         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3249                 if (dev == rt->dst.dev &&
3250                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3251                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3252                         break;
3253         }
3254         if (rt)
3255                 ip6_hold_safe(NULL, &rt, false);
3256         rcu_read_unlock();
3257         return rt;
3258 }
3259
3260 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3261                                      struct net_device *dev,
3262                                      unsigned int pref)
3263 {
3264         struct fib6_config cfg = {
3265                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3266                 .fc_metric      = IP6_RT_PRIO_USER,
3267                 .fc_ifindex     = dev->ifindex,
3268                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3269                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3270                 .fc_protocol = RTPROT_RA,
3271                 .fc_nlinfo.portid = 0,
3272                 .fc_nlinfo.nlh = NULL,
3273                 .fc_nlinfo.nl_net = dev_net(dev),
3274         };
3275
3276         cfg.fc_gateway = *gwaddr;
3277
3278         if (!ip6_route_add(&cfg, NULL)) {
3279                 struct fib6_table *table;
3280
3281                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3282                 if (table)
3283                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3284         }
3285
3286         return rt6_get_dflt_router(gwaddr, dev);
3287 }
3288
3289 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3290 {
3291         struct rt6_info *rt;
3292
3293 restart:
3294         rcu_read_lock();
3295         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3296                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3297                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3298                         if (dst_hold_safe(&rt->dst)) {
3299                                 rcu_read_unlock();
3300                                 ip6_del_rt(rt);
3301                         } else {
3302                                 rcu_read_unlock();
3303                         }
3304                         goto restart;
3305                 }
3306         }
3307         rcu_read_unlock();
3308
3309         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3310 }
3311
3312 void rt6_purge_dflt_routers(struct net *net)
3313 {
3314         struct fib6_table *table;
3315         struct hlist_head *head;
3316         unsigned int h;
3317
3318         rcu_read_lock();
3319
3320         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3321                 head = &net->ipv6.fib_table_hash[h];
3322                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3323                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3324                                 __rt6_purge_dflt_routers(table);
3325                 }
3326         }
3327
3328         rcu_read_unlock();
3329 }
3330
3331 static void rtmsg_to_fib6_config(struct net *net,
3332                                  struct in6_rtmsg *rtmsg,
3333                                  struct fib6_config *cfg)
3334 {
3335         memset(cfg, 0, sizeof(*cfg));
3336
3337         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3338                          : RT6_TABLE_MAIN;
3339         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3340         cfg->fc_metric = rtmsg->rtmsg_metric;
3341         cfg->fc_expires = rtmsg->rtmsg_info;
3342         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3343         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3344         cfg->fc_flags = rtmsg->rtmsg_flags;
3345
3346         cfg->fc_nlinfo.nl_net = net;
3347
3348         cfg->fc_dst = rtmsg->rtmsg_dst;
3349         cfg->fc_src = rtmsg->rtmsg_src;
3350         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3351 }
3352
3353 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3354 {
3355         struct fib6_config cfg;
3356         struct in6_rtmsg rtmsg;
3357         int err;
3358
3359         switch (cmd) {
3360         case SIOCADDRT:         /* Add a route */
3361         case SIOCDELRT:         /* Delete a route */
3362                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3363                         return -EPERM;
3364                 err = copy_from_user(&rtmsg, arg,
3365                                      sizeof(struct in6_rtmsg));
3366                 if (err)
3367                         return -EFAULT;
3368
3369                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3370
3371                 rtnl_lock();
3372                 switch (cmd) {
3373                 case SIOCADDRT:
3374                         err = ip6_route_add(&cfg, NULL);
3375                         break;
3376                 case SIOCDELRT:
3377                         err = ip6_route_del(&cfg, NULL);
3378                         break;
3379                 default:
3380                         err = -EINVAL;
3381                 }
3382                 rtnl_unlock();
3383
3384                 return err;
3385         }
3386
3387         return -EINVAL;
3388 }
3389
3390 /*
3391  *      Drop the packet on the floor
3392  */
3393
3394 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3395 {
3396         int type;
3397         struct dst_entry *dst = skb_dst(skb);
3398         switch (ipstats_mib_noroutes) {
3399         case IPSTATS_MIB_INNOROUTES:
3400                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3401                 if (type == IPV6_ADDR_ANY) {
3402                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3403                                       IPSTATS_MIB_INADDRERRORS);
3404                         break;
3405                 }
3406                 /* FALLTHROUGH */
3407         case IPSTATS_MIB_OUTNOROUTES:
3408                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3409                               ipstats_mib_noroutes);
3410                 break;
3411         }
3412         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3413         kfree_skb(skb);
3414         return 0;
3415 }
3416
3417 static int ip6_pkt_discard(struct sk_buff *skb)
3418 {
3419         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3420 }
3421
3422 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3423 {
3424         skb->dev = skb_dst(skb)->dev;
3425         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3426 }
3427
3428 static int ip6_pkt_prohibit(struct sk_buff *skb)
3429 {
3430         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3431 }
3432
3433 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3434 {
3435         skb->dev = skb_dst(skb)->dev;
3436         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3437 }
3438
3439 /*
3440  *      Allocate a dst for local (unicast / anycast) address.
3441  */
3442
3443 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3444                                     const struct in6_addr *addr,
3445                                     bool anycast)
3446 {
3447         u32 tb_id;
3448         struct net *net = dev_net(idev->dev);
3449         struct net_device *dev = idev->dev;
3450         struct rt6_info *rt;
3451
3452         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3453         if (!rt)
3454                 return ERR_PTR(-ENOMEM);
3455
3456         in6_dev_hold(idev);
3457
3458         rt->dst.flags |= DST_HOST;
3459         rt->dst.input = ip6_input;
3460         rt->dst.output = ip6_output;
3461         rt->rt6i_idev = idev;
3462
3463         rt->rt6i_protocol = RTPROT_KERNEL;
3464         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3465         if (anycast)
3466                 rt->rt6i_flags |= RTF_ANYCAST;
3467         else
3468                 rt->rt6i_flags |= RTF_LOCAL;
3469
3470         rt->rt6i_gateway  = *addr;
3471         rt->rt6i_dst.addr = *addr;
3472         rt->rt6i_dst.plen = 128;
3473         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3474         rt->rt6i_table = fib6_get_table(net, tb_id);
3475
3476         return rt;
3477 }
3478
/* remove deleted ip from prefsrc entries
 *
 * Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
        struct net_device *dev;         /* only routes on this device; NULL matches any */
        struct net *net;                /* namespace, used to skip its ip6_null_entry */
        struct in6_addr *addr;          /* address compared against rt6i_prefsrc.addr */
};
3485
3486 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3487 {
3488         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3489         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3490         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3491
3492         if (((void *)rt->dst.dev == dev || !dev) &&
3493             rt != net->ipv6.ip6_null_entry &&
3494             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3495                 spin_lock_bh(&rt6_exception_lock);
3496                 /* remove prefsrc entry */
3497                 rt->rt6i_prefsrc.plen = 0;
3498                 /* need to update cache as well */
3499                 rt6_exceptions_remove_prefsrc(rt);
3500                 spin_unlock_bh(&rt6_exception_lock);
3501         }
3502         return 0;
3503 }
3504
3505 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3506 {
3507         struct net *net = dev_net(ifp->idev->dev);
3508         struct arg_dev_net_ip adni = {
3509                 .dev = ifp->idev->dev,
3510                 .net = net,
3511                 .addr = &ifp->addr,
3512         };
3513         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3514 }
3515
/* Flag combination tested by fib6_clean_tohost(): all three bits must be set */
#define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3517
3518 /* Remove routers and update dst entries when gateway turn into host. */
3519 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3520 {
3521         struct in6_addr *gateway = (struct in6_addr *)arg;
3522
3523         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3524             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3525                 return -1;
3526         }
3527
3528         /* Further clean up cached routes in exception table.
3529          * This is needed because cached route may have a different
3530          * gateway than its 'parent' in the case of an ip redirect.
3531          */
3532         rt6_exceptions_clean_tohost(rt, gateway);
3533
3534         return 0;
3535 }
3536
/* Run fib6_clean_tohost() over every route of @net for @gateway. */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3541
/* Argument bundle for the fib6_ifup() / fib6_ifdown() tree walkers.
 * Only one union member is meaningful per walk.
 */
struct arg_netdev_event {
        const struct net_device *dev;   /* device the event refers to */
        union {
                unsigned int nh_flags;  /* fib6_ifup(): flags cleared from rt6i_nh_flags */
                unsigned long event;    /* fib6_ifdown(): NETDEV_* event being handled */
        };
};
3549
3550 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3551 {
3552         struct rt6_info *iter;
3553         struct fib6_node *fn;
3554
3555         fn = rcu_dereference_protected(rt->rt6i_node,
3556                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3557         iter = rcu_dereference_protected(fn->leaf,
3558                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3559         while (iter) {
3560                 if (iter->rt6i_metric == rt->rt6i_metric &&
3561                     rt6_qualify_for_ecmp(iter))
3562                         return iter;
3563                 iter = rcu_dereference_protected(iter->rt6_next,
3564                                 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3565         }
3566
3567         return NULL;
3568 }
3569
3570 static bool rt6_is_dead(const struct rt6_info *rt)
3571 {
3572         if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3573             (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3574              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3575                 return true;
3576
3577         return false;
3578 }
3579
3580 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3581 {
3582         struct rt6_info *iter;
3583         int total = 0;
3584
3585         if (!rt6_is_dead(rt))
3586                 total += rt->rt6i_nh_weight;
3587
3588         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3589                 if (!rt6_is_dead(iter))
3590                         total += iter->rt6i_nh_weight;
3591         }
3592
3593         return total;
3594 }
3595
3596 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3597 {
3598         int upper_bound = -1;
3599
3600         if (!rt6_is_dead(rt)) {
3601                 *weight += rt->rt6i_nh_weight;
3602                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3603                                                     total) - 1;
3604         }
3605         atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3606 }
3607
3608 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3609 {
3610         struct rt6_info *iter;
3611         int weight = 0;
3612
3613         rt6_upper_bound_set(rt, &weight, total);
3614
3615         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3616                 rt6_upper_bound_set(iter, &weight, total);
3617 }
3618
3619 void rt6_multipath_rebalance(struct rt6_info *rt)
3620 {
3621         struct rt6_info *first;
3622         int total;
3623
3624         /* In case the entire multipath route was marked for flushing,
3625          * then there is no need to rebalance upon the removal of every
3626          * sibling route.
3627          */
3628         if (!rt->rt6i_nsiblings || rt->should_flush)
3629                 return;
3630
3631         /* During lookup routes are evaluated in order, so we need to
3632          * make sure upper bounds are assigned from the first sibling
3633          * onwards.
3634          */
3635         first = rt6_multipath_first_sibling(rt);
3636         if (WARN_ON_ONCE(!first))
3637                 return;
3638
3639         total = rt6_multipath_total_weight(first);
3640         rt6_multipath_upper_bound_set(first, total);
3641 }
3642
3643 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3644 {
3645         const struct arg_netdev_event *arg = p_arg;
3646         const struct net *net = dev_net(arg->dev);
3647
3648         if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3649                 rt->rt6i_nh_flags &= ~arg->nh_flags;
3650                 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3651                 rt6_multipath_rebalance(rt);
3652         }
3653
3654         return 0;
3655 }
3656
3657 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3658 {
3659         struct arg_netdev_event arg = {
3660                 .dev = dev,
3661                 {
3662                         .nh_flags = nh_flags,
3663                 },
3664         };
3665
3666         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3667                 arg.nh_flags |= RTNH_F_LINKDOWN;
3668
3669         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3670 }
3671
3672 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3673                                    const struct net_device *dev)
3674 {
3675         struct rt6_info *iter;
3676
3677         if (rt->dst.dev == dev)
3678                 return true;
3679         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3680                 if (iter->dst.dev == dev)
3681                         return true;
3682
3683         return false;
3684 }
3685
3686 static void rt6_multipath_flush(struct rt6_info *rt)
3687 {
3688         struct rt6_info *iter;
3689
3690         rt->should_flush = 1;
3691         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3692                 iter->should_flush = 1;
3693 }
3694
3695 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3696                                              const struct net_device *down_dev)
3697 {
3698         struct rt6_info *iter;
3699         unsigned int dead = 0;
3700
3701         if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3702                 dead++;
3703         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3704                 if (iter->dst.dev == down_dev ||
3705                     iter->rt6i_nh_flags & RTNH_F_DEAD)
3706                         dead++;
3707
3708         return dead;
3709 }
3710
3711 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3712                                        const struct net_device *dev,
3713                                        unsigned int nh_flags)
3714 {
3715         struct rt6_info *iter;
3716
3717         if (rt->dst.dev == dev)
3718                 rt->rt6i_nh_flags |= nh_flags;
3719         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3720                 if (iter->dst.dev == dev)
3721                         iter->rt6i_nh_flags |= nh_flags;
3722 }
3723
3724 /* called with write lock held for table with rt */
3725 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3726 {
3727         const struct arg_netdev_event *arg = p_arg;
3728         const struct net_device *dev = arg->dev;
3729         const struct net *net = dev_net(dev);
3730
3731         if (rt == net->ipv6.ip6_null_entry)
3732                 return 0;
3733
3734         switch (arg->event) {
3735         case NETDEV_UNREGISTER:
3736                 return rt->dst.dev == dev ? -1 : 0;
3737         case NETDEV_DOWN:
3738                 if (rt->should_flush)
3739                         return -1;
3740                 if (!rt->rt6i_nsiblings)
3741                         return rt->dst.dev == dev ? -1 : 0;
3742                 if (rt6_multipath_uses_dev(rt, dev)) {
3743                         unsigned int count;
3744
3745                         count = rt6_multipath_dead_count(rt, dev);
3746                         if (rt->rt6i_nsiblings + 1 == count) {
3747                                 rt6_multipath_flush(rt);
3748                                 return -1;
3749                         }
3750                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3751                                                    RTNH_F_LINKDOWN);
3752                         fib6_update_sernum(rt);
3753                         rt6_multipath_rebalance(rt);
3754                 }
3755                 return -2;
3756         case NETDEV_CHANGE:
3757                 if (rt->dst.dev != dev ||
3758                     rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3759                         break;
3760                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3761                 rt6_multipath_rebalance(rt);
3762                 break;
3763         }
3764
3765         return 0;
3766 }
3767
3768 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3769 {
3770         struct arg_netdev_event arg = {
3771                 .dev = dev,
3772                 {
3773                         .event = event,
3774                 },
3775         };
3776
3777         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3778 }
3779
/* Tear down IPv6 routing state for @dev on netdev @event: sync the FIB
 * routes, flush the device from the uncached route list, then take down
 * the device's entries in the neighbour-discovery table.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
        rt6_sync_down_dev(dev, event);
        rt6_uncached_list_flush_dev(dev_net(dev), dev);
        neigh_ifdown(&nd_tbl, dev);
}
3786
/* Argument bundle handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU */
};
3791
3792 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3793 {
3794         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3795         struct inet6_dev *idev;
3796
3797         /* In IPv6 pmtu discovery is not optional,
3798            so that RTAX_MTU lock cannot disable it.
3799            We still use this lock to block changes
3800            caused by addrconf/ndisc.
3801         */
3802
3803         idev = __in6_dev_get(arg->dev);
3804         if (!idev)
3805                 return 0;
3806
3807         /* For administrative MTU increase, there is no way to discover
3808            IPv6 PMTU increase, so PMTU increase should be updated here.
3809            Since RFC 1981 doesn't include administrative MTU increase
3810            update PMTU increase is a MUST. (i.e. jumbo frame)
3811          */
3812         /*
3813            If new MTU is less than route PMTU, this new MTU will be the
3814            lowest MTU in the path, update the route PMTU to reflect PMTU
3815            decreases; if new MTU is greater than route PMTU, and the
3816            old MTU is the lowest MTU in the path, update the route PMTU
3817            to reflect the increase. In this case if the other nodes' MTU
3818            also have the lowest MTU, TOO BIG MESSAGE will be lead to
3819            PMTU discovery.
3820          */
3821         if (rt->dst.dev == arg->dev &&
3822             dst_metric_raw(&rt->dst, RTAX_MTU) &&
3823             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3824                 spin_lock_bh(&rt6_exception_lock);
3825                 if (dst_mtu(&rt->dst) >= arg->mtu ||
3826                     (dst_mtu(&rt->dst) < arg->mtu &&
3827                      dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3828                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3829                 }
3830                 rt6_exceptions_update_pmtu(rt, arg->mtu);
3831                 spin_unlock_bh(&rt6_exception_lock);
3832         }
3833         return 0;
3834 }
3835
3836 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3837 {
3838         struct rt6_mtu_change_arg arg = {
3839                 .dev = dev,
3840                 .mtu = mtu,
3841         };
3842
3843         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3844 }
3845
3846 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3847         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3848         [RTA_OIF]               = { .type = NLA_U32 },
3849         [RTA_IIF]               = { .type = NLA_U32 },
3850         [RTA_PRIORITY]          = { .type = NLA_U32 },
3851         [RTA_METRICS]           = { .type = NLA_NESTED },
3852         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3853         [RTA_PREF]              = { .type = NLA_U8 },
3854         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3855         [RTA_ENCAP]             = { .type = NLA_NESTED },
3856         [RTA_EXPIRES]           = { .type = NLA_U32 },
3857         [RTA_UID]               = { .type = NLA_U32 },
3858         [RTA_MARK]              = { .type = NLA_U32 },
3859 };
3860
/* Parse an rtnetlink route message into a fib6_config.
 *
 * @skb:    request skb; supplies the sender portid and the socket's netns
 * @nlh:    netlink header of the route message
 * @cfg:    output, zeroed once the attributes have been parsed
 * @extack: extended ack passed to the lwtunnel validators
 *
 * Returns 0 on success, the nlmsg_parse() / lwtunnel validation error, or
 * -EINVAL when RTA_DST / RTA_SRC is shorter than its prefix length needs.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct fib6_config *cfg,
                              struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        unsigned int pref;
        int err;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          NULL);
        if (err < 0)
                goto errout;

        /* default for every early exit below that does not set err itself */
        err = -EINVAL;
        rtm = nlmsg_data(nlh);
        memset(cfg, 0, sizeof(*cfg));

        cfg->fc_table = rtm->rtm_table;
        cfg->fc_dst_len = rtm->rtm_dst_len;
        cfg->fc_src_len = rtm->rtm_src_len;
        cfg->fc_flags = RTF_UP;
        cfg->fc_protocol = rtm->rtm_protocol;
        cfg->fc_type = rtm->rtm_type;

        /* these route types are all installed as reject routes */
        if (rtm->rtm_type == RTN_UNREACHABLE ||
            rtm->rtm_type == RTN_BLACKHOLE ||
            rtm->rtm_type == RTN_PROHIBIT ||
            rtm->rtm_type == RTN_THROW)
                cfg->fc_flags |= RTF_REJECT;

        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;

        if (rtm->rtm_flags & RTM_F_CLONED)
                cfg->fc_flags |= RTF_CACHE;

        /* the RTNH_F_ONLINK bit is carried over into fc_flags unchanged */
        cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->fc_nlinfo.nlh = nlh;
        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

        if (tb[RTA_GATEWAY]) {
                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
                cfg->fc_flags |= RTF_GATEWAY;
        }

        if (tb[RTA_DST]) {
                /* bytes needed to hold rtm_dst_len bits, rounded up */
                int plen = (rtm->rtm_dst_len + 7) >> 3;

                if (nla_len(tb[RTA_DST]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
        }

        if (tb[RTA_SRC]) {
                /* bytes needed to hold rtm_src_len bits, rounded up */
                int plen = (rtm->rtm_src_len + 7) >> 3;

                if (nla_len(tb[RTA_SRC]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
        }

        /* NOTE(review): no length check is visible here; the attribute size
         * has to be enforced by rtm_ipv6_policy -- confirm it has an
         * RTA_PREFSRC entry.
         */
        if (tb[RTA_PREFSRC])
                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

        if (tb[RTA_OIF])
                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_PRIORITY])
                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

        /* metrics are not parsed here, only referenced in place */
        if (tb[RTA_METRICS]) {
                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
        }

        /* RTA_TABLE overrides the table id taken from rtm_table above */
        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

        if (tb[RTA_MULTIPATH]) {
                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
                                                     cfg->fc_mp_len, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_PREF]) {
                pref = nla_get_u8(tb[RTA_PREF]);
                /* anything other than LOW or HIGH is treated as MEDIUM */
                if (pref != ICMPV6_ROUTER_PREF_LOW &&
                    pref != ICMPV6_ROUTER_PREF_HIGH)
                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
                cfg->fc_flags |= RTF_PREF(pref);
        }

        if (tb[RTA_ENCAP])
                cfg->fc_encap = tb[RTA_ENCAP];

        if (tb[RTA_ENCAP_TYPE]) {
                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

                err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_EXPIRES]) {
                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

                /* a non-finite timeout leaves the route without RTF_EXPIRES */
                if (addrconf_finite_timeout(timeout)) {
                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
                        cfg->fc_flags |= RTF_EXPIRES;
                }
        }

        err = 0;
errout:
        return err;
}
3986
/* One parsed nexthop of a multipath request, queued on a local list by
 * ip6_route_info_append() until ip6_route_multipath_add() inserts it.
 */
struct rt6_nh {
        struct rt6_info *rt6_info;      /* route for this nexthop; reset to NULL after the insert attempt */
        struct fib6_config r_cfg;       /* per-nexthop config copy, reused to delete the route on rollback */
        struct mx6_config mxc;          /* metrics filled by ip6_convert_metrics(); mxc.mx is kfree'd at cleanup */
        struct list_head next;          /* linkage on the nexthop list */
};
3993
3994 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3995 {
3996         struct rt6_nh *nh;
3997
3998         list_for_each_entry(nh, rt6_nh_list, next) {
3999                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4000                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4001                         nh->r_cfg.fc_ifindex);
4002         }
4003 }
4004
4005 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4006                                  struct rt6_info *rt, struct fib6_config *r_cfg)
4007 {
4008         struct rt6_nh *nh;
4009         int err = -EEXIST;
4010
4011         list_for_each_entry(nh, rt6_nh_list, next) {
4012                 /* check if rt6_info already exists */
4013                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4014                         return err;
4015         }
4016
4017         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4018         if (!nh)
4019                 return -ENOMEM;
4020         nh->rt6_info = rt;
4021         err = ip6_convert_metrics(&nh->mxc, r_cfg);
4022         if (err) {
4023                 kfree(nh);
4024                 return err;
4025         }
4026         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4027         list_add_tail(&nh->next, rt6_nh_list);
4028
4029         return 0;
4030 }
4031
4032 static void ip6_route_mpath_notify(struct rt6_info *rt,
4033                                    struct rt6_info *rt_last,
4034                                    struct nl_info *info,
4035                                    __u16 nlflags)
4036 {
4037         /* if this is an APPEND route, then rt points to the first route
4038          * inserted and rt_last points to last route inserted. Userspace
4039          * wants a consistent dump of the route which starts at the first
4040          * nexthop. Since sibling routes are always added at the end of
4041          * the list, find the first sibling of the last route appended
4042          */
4043         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4044                 rt = list_first_entry(&rt_last->rt6i_siblings,
4045                                       struct rt6_info,
4046                                       rt6i_siblings);
4047         }
4048
4049         if (rt)
4050                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4051 }
4052
4053 static int ip6_route_multipath_add(struct fib6_config *cfg,
4054                                    struct netlink_ext_ack *extack)
4055 {
4056         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4057         struct nl_info *info = &cfg->fc_nlinfo;
4058         struct fib6_config r_cfg;
4059         struct rtnexthop *rtnh;
4060         struct rt6_info *rt;
4061         struct rt6_nh *err_nh;
4062         struct rt6_nh *nh, *nh_safe;
4063         __u16 nlflags;
4064         int remaining;
4065         int attrlen;
4066         int err = 1;
4067         int nhn = 0;
4068         int replace = (cfg->fc_nlinfo.nlh &&
4069                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4070         LIST_HEAD(rt6_nh_list);
4071
4072         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4073         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4074                 nlflags |= NLM_F_APPEND;
4075
4076         remaining = cfg->fc_mp_len;
4077         rtnh = (struct rtnexthop *)cfg->fc_mp;
4078
4079         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4080          * rt6_info structs per nexthop
4081          */
4082         while (rtnh_ok(rtnh, remaining)) {
4083                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4084                 if (rtnh->rtnh_ifindex)
4085                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4086
4087                 attrlen = rtnh_attrlen(rtnh);
4088                 if (attrlen > 0) {
4089                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4090
4091                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4092                         if (nla) {
4093                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4094                                 r_cfg.fc_flags |= RTF_GATEWAY;
4095                         }
4096                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4097                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4098                         if (nla)
4099                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4100                 }
4101
4102                 rt = ip6_route_info_create(&r_cfg, extack);
4103                 if (IS_ERR(rt)) {
4104                         err = PTR_ERR(rt);
4105                         rt = NULL;
4106                         goto cleanup;
4107                 }
4108
4109                 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4110
4111                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4112                 if (err) {
4113                         dst_release_immediate(&rt->dst);
4114                         goto cleanup;
4115                 }
4116
4117                 rtnh = rtnh_next(rtnh, &remaining);
4118         }
4119
4120         /* for add and replace send one notification with all nexthops.
4121          * Skip the notification in fib6_add_rt2node and send one with
4122          * the full route when done
4123          */
4124         info->skip_notify = 1;
4125
4126         err_nh = NULL;
4127         list_for_each_entry(nh, &rt6_nh_list, next) {
4128                 rt_last = nh->rt6_info;
4129                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4130                 /* save reference to first route for notification */
4131                 if (!rt_notif && !err)
4132                         rt_notif = nh->rt6_info;
4133
4134                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4135                 nh->rt6_info = NULL;
4136                 if (err) {
4137                         if (replace && nhn)
4138                                 ip6_print_replace_route_err(&rt6_nh_list);
4139                         err_nh = nh;
4140                         goto add_errout;
4141                 }
4142
4143                 /* Because each route is added like a single route we remove
4144                  * these flags after the first nexthop: if there is a collision,
4145                  * we have already failed to add the first nexthop:
4146                  * fib6_add_rt2node() has rejected it; when replacing, old
4147                  * nexthops have been replaced by first new, the rest should
4148                  * be added to it.
4149                  */
4150                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4151                                                      NLM_F_REPLACE);
4152                 nhn++;
4153         }
4154
4155         /* success ... tell user about new route */
4156         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4157         goto cleanup;
4158
4159 add_errout:
4160         /* send notification for routes that were added so that
4161          * the delete notifications sent by ip6_route_del are
4162          * coherent
4163          */
4164         if (rt_notif)
4165                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4166
4167         /* Delete routes that were already added */
4168         list_for_each_entry(nh, &rt6_nh_list, next) {
4169                 if (err_nh == nh)
4170                         break;
4171                 ip6_route_del(&nh->r_cfg, extack);
4172         }
4173
4174 cleanup:
4175         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4176                 if (nh->rt6_info)
4177                         dst_release_immediate(&nh->rt6_info->dst);
4178                 kfree(nh->mxc.mx);
4179                 list_del(&nh->next);
4180                 kfree(nh);
4181         }
4182
4183         return err;
4184 }
4185
4186 static int ip6_route_multipath_del(struct fib6_config *cfg,
4187                                    struct netlink_ext_ack *extack)
4188 {
4189         struct fib6_config r_cfg;
4190         struct rtnexthop *rtnh;
4191         int remaining;
4192         int attrlen;
4193         int err = 1, last_err = 0;
4194
4195         remaining = cfg->fc_mp_len;
4196         rtnh = (struct rtnexthop *)cfg->fc_mp;
4197
4198         /* Parse a Multipath Entry */
4199         while (rtnh_ok(rtnh, remaining)) {
4200                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4201                 if (rtnh->rtnh_ifindex)
4202                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4203
4204                 attrlen = rtnh_attrlen(rtnh);
4205                 if (attrlen > 0) {
4206                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4207
4208                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4209                         if (nla) {
4210                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4211                                 r_cfg.fc_flags |= RTF_GATEWAY;
4212                         }
4213                 }
4214                 err = ip6_route_del(&r_cfg, extack);
4215                 if (err)
4216                         last_err = err;
4217
4218                 rtnh = rtnh_next(rtnh, &remaining);
4219         }
4220
4221         return last_err;
4222 }
4223
4224 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4225                               struct netlink_ext_ack *extack)
4226 {
4227         struct fib6_config cfg;
4228         int err;
4229
4230         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4231         if (err < 0)
4232                 return err;
4233
4234         if (cfg.fc_mp)
4235                 return ip6_route_multipath_del(&cfg, extack);
4236         else {
4237                 cfg.fc_delete_all_nh = 1;
4238                 return ip6_route_del(&cfg, extack);
4239         }
4240 }
4241
4242 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4243                               struct netlink_ext_ack *extack)
4244 {
4245         struct fib6_config cfg;
4246         int err;
4247
4248         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4249         if (err < 0)
4250                 return err;
4251
4252         if (cfg.fc_mp)
4253                 return ip6_route_multipath_add(&cfg, extack);
4254         else
4255                 return ip6_route_add(&cfg, extack);
4256 }
4257
4258 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4259 {
4260         int nexthop_len = 0;
4261
4262         if (rt->rt6i_nsiblings) {
4263                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4264                             + NLA_ALIGN(sizeof(struct rtnexthop))
4265                             + nla_total_size(16) /* RTA_GATEWAY */
4266                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
4267
4268                 nexthop_len *= rt->rt6i_nsiblings;
4269         }
4270
4271         return NLMSG_ALIGN(sizeof(struct rtmsg))
4272                + nla_total_size(16) /* RTA_SRC */
4273                + nla_total_size(16) /* RTA_DST */
4274                + nla_total_size(16) /* RTA_GATEWAY */
4275                + nla_total_size(16) /* RTA_PREFSRC */
4276                + nla_total_size(4) /* RTA_TABLE */
4277                + nla_total_size(4) /* RTA_IIF */
4278                + nla_total_size(4) /* RTA_OIF */
4279                + nla_total_size(4) /* RTA_PRIORITY */
4280                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4281                + nla_total_size(sizeof(struct rta_cacheinfo))
4282                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4283                + nla_total_size(1) /* RTA_PREF */
4284                + lwtunnel_get_encap_size(rt->dst.lwtstate)
4285                + nexthop_len;
4286 }
4287
4288 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4289                             unsigned int *flags, bool skip_oif)
4290 {
4291         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4292                 *flags |= RTNH_F_DEAD;
4293
4294         if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4295                 *flags |= RTNH_F_LINKDOWN;
4296                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4297                         *flags |= RTNH_F_DEAD;
4298         }
4299
4300         if (rt->rt6i_flags & RTF_GATEWAY) {
4301                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4302                         goto nla_put_failure;
4303         }
4304
4305         *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4306         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4307                 *flags |= RTNH_F_OFFLOAD;
4308
4309         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4310         if (!skip_oif && rt->dst.dev &&
4311             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4312                 goto nla_put_failure;
4313
4314         if (rt->dst.lwtstate &&
4315             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4316                 goto nla_put_failure;
4317
4318         return 0;
4319
4320 nla_put_failure:
4321         return -EMSGSIZE;
4322 }
4323
4324 /* add multipath next hop */
4325 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4326 {
4327         struct rtnexthop *rtnh;
4328         unsigned int flags = 0;
4329
4330         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4331         if (!rtnh)
4332                 goto nla_put_failure;
4333
4334         rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4335         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4336
4337         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4338                 goto nla_put_failure;
4339
4340         rtnh->rtnh_flags = flags;
4341
4342         /* length of rtnetlink header + attributes */
4343         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4344
4345         return 0;
4346
4347 nla_put_failure:
4348         return -EMSGSIZE;
4349 }
4350
4351 static int rt6_fill_node(struct net *net,
4352                          struct sk_buff *skb, struct rt6_info *rt,
4353                          struct in6_addr *dst, struct in6_addr *src,
4354                          int iif, int type, u32 portid, u32 seq,
4355                          unsigned int flags)
4356 {
4357         u32 metrics[RTAX_MAX];
4358         struct rtmsg *rtm;
4359         struct nlmsghdr *nlh;
4360         long expires;
4361         u32 table;
4362
4363         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4364         if (!nlh)
4365                 return -EMSGSIZE;
4366
4367         rtm = nlmsg_data(nlh);
4368         rtm->rtm_family = AF_INET6;
4369         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4370         rtm->rtm_src_len = rt->rt6i_src.plen;
4371         rtm->rtm_tos = 0;
4372         if (rt->rt6i_table)
4373                 table = rt->rt6i_table->tb6_id;
4374         else
4375                 table = RT6_TABLE_UNSPEC;
4376         rtm->rtm_table = table;
4377         if (nla_put_u32(skb, RTA_TABLE, table))
4378                 goto nla_put_failure;
4379         if (rt->rt6i_flags & RTF_REJECT) {
4380                 switch (rt->dst.error) {
4381                 case -EINVAL:
4382                         rtm->rtm_type = RTN_BLACKHOLE;
4383                         break;
4384                 case -EACCES:
4385                         rtm->rtm_type = RTN_PROHIBIT;
4386                         break;
4387                 case -EAGAIN:
4388                         rtm->rtm_type = RTN_THROW;
4389                         break;
4390                 default:
4391                         rtm->rtm_type = RTN_UNREACHABLE;
4392                         break;
4393                 }
4394         }
4395         else if (rt->rt6i_flags & RTF_LOCAL)
4396                 rtm->rtm_type = RTN_LOCAL;
4397         else if (rt->rt6i_flags & RTF_ANYCAST)
4398                 rtm->rtm_type = RTN_ANYCAST;
4399         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4400                 rtm->rtm_type = RTN_LOCAL;
4401         else
4402                 rtm->rtm_type = RTN_UNICAST;
4403         rtm->rtm_flags = 0;
4404         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4405         rtm->rtm_protocol = rt->rt6i_protocol;
4406
4407         if (rt->rt6i_flags & RTF_CACHE)
4408                 rtm->rtm_flags |= RTM_F_CLONED;
4409
4410         if (dst) {
4411                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4412                         goto nla_put_failure;
4413                 rtm->rtm_dst_len = 128;
4414         } else if (rtm->rtm_dst_len)
4415                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4416                         goto nla_put_failure;
4417 #ifdef CONFIG_IPV6_SUBTREES
4418         if (src) {
4419                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4420                         goto nla_put_failure;
4421                 rtm->rtm_src_len = 128;
4422         } else if (rtm->rtm_src_len &&
4423                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4424                 goto nla_put_failure;
4425 #endif
4426         if (iif) {
4427 #ifdef CONFIG_IPV6_MROUTE
4428                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4429                         int err = ip6mr_get_route(net, skb, rtm, portid);
4430
4431                         if (err == 0)
4432                                 return 0;
4433                         if (err < 0)
4434                                 goto nla_put_failure;
4435                 } else
4436 #endif
4437                         if (nla_put_u32(skb, RTA_IIF, iif))
4438                                 goto nla_put_failure;
4439         } else if (dst) {
4440                 struct in6_addr saddr_buf;
4441                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4442                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4443                         goto nla_put_failure;
4444         }
4445
4446         if (rt->rt6i_prefsrc.plen) {
4447                 struct in6_addr saddr_buf;
4448                 saddr_buf = rt->rt6i_prefsrc.addr;
4449                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4450                         goto nla_put_failure;
4451         }
4452
4453         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4454         if (rt->rt6i_pmtu)
4455                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4456         if (rtnetlink_put_metrics(skb, metrics) < 0)
4457                 goto nla_put_failure;
4458
4459         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4460                 goto nla_put_failure;
4461
4462         /* For multipath routes, walk the siblings list and add
4463          * each as a nexthop within RTA_MULTIPATH.
4464          */
4465         if (rt->rt6i_nsiblings) {
4466                 struct rt6_info *sibling, *next_sibling;
4467                 struct nlattr *mp;
4468
4469                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4470                 if (!mp)
4471                         goto nla_put_failure;
4472
4473                 if (rt6_add_nexthop(skb, rt) < 0)
4474                         goto nla_put_failure;
4475
4476                 list_for_each_entry_safe(sibling, next_sibling,
4477                                          &rt->rt6i_siblings, rt6i_siblings) {
4478                         if (rt6_add_nexthop(skb, sibling) < 0)
4479                                 goto nla_put_failure;
4480                 }
4481
4482                 nla_nest_end(skb, mp);
4483         } else {
4484                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4485                         goto nla_put_failure;
4486         }
4487
4488         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4489
4490         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4491                 goto nla_put_failure;
4492
4493         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4494                 goto nla_put_failure;
4495
4496
4497         nlmsg_end(skb, nlh);
4498         return 0;
4499
4500 nla_put_failure:
4501         nlmsg_cancel(skb, nlh);
4502         return -EMSGSIZE;
4503 }
4504
4505 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4506 {
4507         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4508         struct net *net = arg->net;
4509
4510         if (rt == net->ipv6.ip6_null_entry)
4511                 return 0;
4512
4513         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4514                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4515
4516                 /* user wants prefix routes only */
4517                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4518                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4519                         /* success since this is not a prefix route */
4520                         return 1;
4521                 }
4522         }
4523
4524         return rt6_fill_node(net,
4525                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4526                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4527                      NLM_F_MULTI);
4528 }
4529
4530 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4531                               struct netlink_ext_ack *extack)
4532 {
4533         struct net *net = sock_net(in_skb->sk);
4534         struct nlattr *tb[RTA_MAX+1];
4535         int err, iif = 0, oif = 0;
4536         struct dst_entry *dst;
4537         struct rt6_info *rt;
4538         struct sk_buff *skb;
4539         struct rtmsg *rtm;
4540         struct flowi6 fl6;
4541         bool fibmatch;
4542
4543         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4544                           extack);
4545         if (err < 0)
4546                 goto errout;
4547
4548         err = -EINVAL;
4549         memset(&fl6, 0, sizeof(fl6));
4550         rtm = nlmsg_data(nlh);
4551         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4552         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4553
4554         if (tb[RTA_SRC]) {
4555                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4556                         goto errout;
4557
4558                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4559         }
4560
4561         if (tb[RTA_DST]) {
4562                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4563                         goto errout;
4564
4565                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4566         }
4567
4568         if (tb[RTA_IIF])
4569                 iif = nla_get_u32(tb[RTA_IIF]);
4570
4571         if (tb[RTA_OIF])
4572                 oif = nla_get_u32(tb[RTA_OIF]);
4573
4574         if (tb[RTA_MARK])
4575                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4576
4577         if (tb[RTA_UID])
4578                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4579                                            nla_get_u32(tb[RTA_UID]));
4580         else
4581                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4582
4583         if (iif) {
4584                 struct net_device *dev;
4585                 int flags = 0;
4586
4587                 rcu_read_lock();
4588
4589                 dev = dev_get_by_index_rcu(net, iif);
4590                 if (!dev) {
4591                         rcu_read_unlock();
4592                         err = -ENODEV;
4593                         goto errout;
4594                 }
4595
4596                 fl6.flowi6_iif = iif;
4597
4598                 if (!ipv6_addr_any(&fl6.saddr))
4599                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4600
4601                 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4602
4603                 rcu_read_unlock();
4604         } else {
4605                 fl6.flowi6_oif = oif;
4606
4607                 dst = ip6_route_output(net, NULL, &fl6);
4608         }
4609
4610
4611         rt = container_of(dst, struct rt6_info, dst);
4612         if (rt->dst.error) {
4613                 err = rt->dst.error;
4614                 ip6_rt_put(rt);
4615                 goto errout;
4616         }
4617
4618         if (rt == net->ipv6.ip6_null_entry) {
4619                 err = rt->dst.error;
4620                 ip6_rt_put(rt);
4621                 goto errout;
4622         }
4623
4624         if (fibmatch && rt->from) {
4625                 struct rt6_info *ort = rt->from;
4626
4627                 dst_hold(&ort->dst);
4628                 ip6_rt_put(rt);
4629                 rt = ort;
4630         }
4631
4632         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4633         if (!skb) {
4634                 ip6_rt_put(rt);
4635                 err = -ENOBUFS;
4636                 goto errout;
4637         }
4638
4639         skb_dst_set(skb, &rt->dst);
4640         if (fibmatch)
4641                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4642                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4643                                     nlh->nlmsg_seq, 0);
4644         else
4645                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4646                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4647                                     nlh->nlmsg_seq, 0);
4648         if (err < 0) {
4649                 kfree_skb(skb);
4650                 goto errout;
4651         }
4652
4653         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4654 errout:
4655         return err;
4656 }
4657
4658 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4659                      unsigned int nlm_flags)
4660 {
4661         struct sk_buff *skb;
4662         struct net *net = info->nl_net;
4663         u32 seq;
4664         int err;
4665
4666         err = -ENOBUFS;
4667         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4668
4669         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4670         if (!skb)
4671                 goto errout;
4672
4673         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4674                                 event, info->portid, seq, nlm_flags);
4675         if (err < 0) {
4676                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4677                 WARN_ON(err == -EMSGSIZE);
4678                 kfree_skb(skb);
4679                 goto errout;
4680         }
4681         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4682                     info->nlh, gfp_any());
4683         return;
4684 errout:
4685         if (err < 0)
4686                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4687 }
4688
4689 static int ip6_route_dev_notify(struct notifier_block *this,
4690                                 unsigned long event, void *ptr)
4691 {
4692         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4693         struct net *net = dev_net(dev);
4694
4695         if (!(dev->flags & IFF_LOOPBACK))
4696                 return NOTIFY_OK;
4697
4698         if (event == NETDEV_REGISTER) {
4699                 net->ipv6.ip6_null_entry->dst.dev = dev;
4700                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4701 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4702                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4703                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4704                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4705                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4706 #endif
4707          } else if (event == NETDEV_UNREGISTER &&
4708                     dev->reg_state != NETREG_UNREGISTERED) {
4709                 /* NETDEV_UNREGISTER could be fired for multiple times by
4710                  * netdev_wait_allrefs(). Make sure we only call this once.
4711                  */
4712                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4713 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4714                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4715                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4716 #endif
4717         }
4718
4719         return NOTIFY_OK;
4720 }
4721
4722 /*
4723  *      /proc
4724  */
4725
4726 #ifdef CONFIG_PROC_FS
4727
4728 static const struct file_operations ipv6_route_proc_fops = {
4729         .open           = ipv6_route_open,
4730         .read           = seq_read,
4731         .llseek         = seq_lseek,
4732         .release        = seq_release_net,
4733 };
4734
4735 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4736 {
4737         struct net *net = (struct net *)seq->private;
4738         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4739                    net->ipv6.rt6_stats->fib_nodes,
4740                    net->ipv6.rt6_stats->fib_route_nodes,
4741                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4742                    net->ipv6.rt6_stats->fib_rt_entries,
4743                    net->ipv6.rt6_stats->fib_rt_cache,
4744                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4745                    net->ipv6.rt6_stats->fib_discarded_routes);
4746
4747         return 0;
4748 }
4749
4750 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4751 {
4752         return single_open_net(inode, file, rt6_stats_seq_show);
4753 }
4754
4755 static const struct file_operations rt6_stats_seq_fops = {
4756         .open    = rt6_stats_seq_open,
4757         .read    = seq_read,
4758         .llseek  = seq_lseek,
4759         .release = single_release_net,
4760 };
4761 #endif  /* CONFIG_PROC_FS */
4762
4763 #ifdef CONFIG_SYSCTL
4764
4765 static
4766 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4767                               void __user *buffer, size_t *lenp, loff_t *ppos)
4768 {
4769         struct net *net;
4770         int delay;
4771         if (!write)
4772                 return -EINVAL;
4773
4774         net = (struct net *)ctl->extra1;
4775         delay = net->ipv6.sysctl.flush_delay;
4776         proc_dointvec(ctl, write, buffer, lenp, ppos);
4777         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4778         return 0;
4779 }
4780
4781 struct ctl_table ipv6_route_table_template[] = {
4782         {
4783                 .procname       =       "flush",
4784                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4785                 .maxlen         =       sizeof(int),
4786                 .mode           =       0200,
4787                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4788         },
4789         {
4790                 .procname       =       "gc_thresh",
4791                 .data           =       &ip6_dst_ops_template.gc_thresh,
4792                 .maxlen         =       sizeof(int),
4793                 .mode           =       0644,
4794                 .proc_handler   =       proc_dointvec,
4795         },
4796         {
4797                 .procname       =       "max_size",
4798                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4799                 .maxlen         =       sizeof(int),
4800                 .mode           =       0644,
4801                 .proc_handler   =       proc_dointvec,
4802         },
4803         {
4804                 .procname       =       "gc_min_interval",
4805                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4806                 .maxlen         =       sizeof(int),
4807                 .mode           =       0644,
4808                 .proc_handler   =       proc_dointvec_jiffies,
4809         },
4810         {
4811                 .procname       =       "gc_timeout",
4812                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4813                 .maxlen         =       sizeof(int),
4814                 .mode           =       0644,
4815                 .proc_handler   =       proc_dointvec_jiffies,
4816         },
4817         {
4818                 .procname       =       "gc_interval",
4819                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4820                 .maxlen         =       sizeof(int),
4821                 .mode           =       0644,
4822                 .proc_handler   =       proc_dointvec_jiffies,
4823         },
4824         {
4825                 .procname       =       "gc_elasticity",
4826                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4827                 .maxlen         =       sizeof(int),
4828                 .mode           =       0644,
4829                 .proc_handler   =       proc_dointvec,
4830         },
4831         {
4832                 .procname       =       "mtu_expires",
4833                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4834                 .maxlen         =       sizeof(int),
4835                 .mode           =       0644,
4836                 .proc_handler   =       proc_dointvec_jiffies,
4837         },
4838         {
4839                 .procname       =       "min_adv_mss",
4840                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4841                 .maxlen         =       sizeof(int),
4842                 .mode           =       0644,
4843                 .proc_handler   =       proc_dointvec,
4844         },
4845         {
4846                 .procname       =       "gc_min_interval_ms",
4847                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4848                 .maxlen         =       sizeof(int),
4849                 .mode           =       0644,
4850                 .proc_handler   =       proc_dointvec_ms_jiffies,
4851         },
4852         { }
4853 };
4854
4855 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4856 {
4857         struct ctl_table *table;
4858
4859         table = kmemdup(ipv6_route_table_template,
4860                         sizeof(ipv6_route_table_template),
4861                         GFP_KERNEL);
4862
4863         if (table) {
4864                 table[0].data = &net->ipv6.sysctl.flush_delay;
4865                 table[0].extra1 = net;
4866                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4867                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4868                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4869                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4870                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4871                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4872                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4873                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4874                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4875
4876                 /* Don't export sysctls to unprivileged users */
4877                 if (net->user_ns != &init_user_ns)
4878                         table[0].procname = NULL;
4879         }
4880
4881         return table;
4882 }
4883 #endif
4884
4885 static int __net_init ip6_route_net_init(struct net *net)
4886 {
4887         int ret = -ENOMEM;
4888
4889         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4890                sizeof(net->ipv6.ip6_dst_ops));
4891
4892         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4893                 goto out_ip6_dst_ops;
4894
4895         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4896                                            sizeof(*net->ipv6.ip6_null_entry),
4897                                            GFP_KERNEL);
4898         if (!net->ipv6.ip6_null_entry)
4899                 goto out_ip6_dst_entries;
4900         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4901         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4902                          ip6_template_metrics, true);
4903
4904 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4905         net->ipv6.fib6_has_custom_rules = false;
4906         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4907                                                sizeof(*net->ipv6.ip6_prohibit_entry),
4908                                                GFP_KERNEL);
4909         if (!net->ipv6.ip6_prohibit_entry)
4910                 goto out_ip6_null_entry;
4911         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4912         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4913                          ip6_template_metrics, true);
4914
4915         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4916                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
4917                                                GFP_KERNEL);
4918         if (!net->ipv6.ip6_blk_hole_entry)
4919                 goto out_ip6_prohibit_entry;
4920         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4921         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4922                          ip6_template_metrics, true);
4923 #endif
4924
4925         net->ipv6.sysctl.flush_delay = 0;
4926         net->ipv6.sysctl.ip6_rt_max_size = 4096;
4927         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4928         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4929         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4930         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4931         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4932         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4933
4934         net->ipv6.ip6_rt_gc_expire = 30*HZ;
4935
4936         ret = 0;
4937 out:
4938         return ret;
4939
4940 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4941 out_ip6_prohibit_entry:
4942         kfree(net->ipv6.ip6_prohibit_entry);
4943 out_ip6_null_entry:
4944         kfree(net->ipv6.ip6_null_entry);
4945 #endif
4946 out_ip6_dst_entries:
4947         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4948 out_ip6_dst_ops:
4949         goto out;
4950 }
4951
4952 static void __net_exit ip6_route_net_exit(struct net *net)
4953 {
4954         kfree(net->ipv6.ip6_null_entry);
4955 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4956         kfree(net->ipv6.ip6_prohibit_entry);
4957         kfree(net->ipv6.ip6_blk_hole_entry);
4958 #endif
4959         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4960 }
4961
4962 static int __net_init ip6_route_net_init_late(struct net *net)
4963 {
4964 #ifdef CONFIG_PROC_FS
4965         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4966         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4967 #endif
4968         return 0;
4969 }
4970
4971 static void __net_exit ip6_route_net_exit_late(struct net *net)
4972 {
4973 #ifdef CONFIG_PROC_FS
4974         remove_proc_entry("ipv6_route", net->proc_net);
4975         remove_proc_entry("rt6_stats", net->proc_net);
4976 #endif
4977 }
4978
4979 static struct pernet_operations ip6_route_net_ops = {
4980         .init = ip6_route_net_init,
4981         .exit = ip6_route_net_exit,
4982 };
4983
4984 static int __net_init ipv6_inetpeer_init(struct net *net)
4985 {
4986         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4987
4988         if (!bp)
4989                 return -ENOMEM;
4990         inet_peer_base_init(bp);
4991         net->ipv6.peers = bp;
4992         return 0;
4993 }
4994
4995 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4996 {
4997         struct inet_peer_base *bp = net->ipv6.peers;
4998
4999         net->ipv6.peers = NULL;
5000         inetpeer_invalidate_tree(bp);
5001         kfree(bp);
5002 }
5003
5004 static struct pernet_operations ipv6_inetpeer_ops = {
5005         .init   =       ipv6_inetpeer_init,
5006         .exit   =       ipv6_inetpeer_exit,
5007 };
5008
5009 static struct pernet_operations ip6_route_net_late_ops = {
5010         .init = ip6_route_net_init_late,
5011         .exit = ip6_route_net_exit_late,
5012 };
5013
5014 static struct notifier_block ip6_route_dev_notifier = {
5015         .notifier_call = ip6_route_dev_notify,
5016         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5017 };
5018
5019 void __init ip6_route_init_special_entries(void)
5020 {
5021         /* Registering of the loopback is done before this portion of code,
5022          * the loopback reference in rt6_info will not be taken, do it
5023          * manually for init_net */
5024         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5025         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5026   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5027         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5028         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5029         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5030         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5031   #endif
5032 }
5033
5034 int __init ip6_route_init(void)
5035 {
5036         int ret;
5037         int cpu;
5038
5039         ret = -ENOMEM;
5040         ip6_dst_ops_template.kmem_cachep =
5041                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5042                                   SLAB_HWCACHE_ALIGN, NULL);
5043         if (!ip6_dst_ops_template.kmem_cachep)
5044                 goto out;
5045
5046         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5047         if (ret)
5048                 goto out_kmem_cache;
5049
5050         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5051         if (ret)
5052                 goto out_dst_entries;
5053
5054         ret = register_pernet_subsys(&ip6_route_net_ops);
5055         if (ret)
5056                 goto out_register_inetpeer;
5057
5058         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5059
5060         ret = fib6_init();
5061         if (ret)
5062                 goto out_register_subsys;
5063
5064         ret = xfrm6_init();
5065         if (ret)
5066                 goto out_fib6_init;
5067
5068         ret = fib6_rules_init();
5069         if (ret)
5070                 goto xfrm6_init;
5071
5072         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5073         if (ret)
5074                 goto fib6_rules_init;
5075
5076         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5077                                    inet6_rtm_newroute, NULL, 0);
5078         if (ret < 0)
5079                 goto out_register_late_subsys;
5080
5081         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5082                                    inet6_rtm_delroute, NULL, 0);
5083         if (ret < 0)
5084                 goto out_register_late_subsys;
5085
5086         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5087                                    inet6_rtm_getroute, NULL,
5088                                    RTNL_FLAG_DOIT_UNLOCKED);
5089         if (ret < 0)
5090                 goto out_register_late_subsys;
5091
5092         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5093         if (ret)
5094                 goto out_register_late_subsys;
5095
5096         for_each_possible_cpu(cpu) {
5097                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5098
5099                 INIT_LIST_HEAD(&ul->head);
5100                 spin_lock_init(&ul->lock);
5101         }
5102
5103 out:
5104         return ret;
5105
5106 out_register_late_subsys:
5107         rtnl_unregister_all(PF_INET6);
5108         unregister_pernet_subsys(&ip6_route_net_late_ops);
5109 fib6_rules_init:
5110         fib6_rules_cleanup();
5111 xfrm6_init:
5112         xfrm6_fini();
5113 out_fib6_init:
5114         fib6_gc_cleanup();
5115 out_register_subsys:
5116         unregister_pernet_subsys(&ip6_route_net_ops);
5117 out_register_inetpeer:
5118         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5119 out_dst_entries:
5120         dst_entries_destroy(&ip6_dst_blackhole_ops);
5121 out_kmem_cache:
5122         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5123         goto out;
5124 }
5125
5126 void ip6_route_cleanup(void)
5127 {
5128         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5129         unregister_pernet_subsys(&ip6_route_net_late_ops);
5130         fib6_rules_cleanup();
5131         xfrm6_fini();
5132         fib6_gc_cleanup();
5133         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5134         unregister_pernet_subsys(&ip6_route_net_ops);
5135         dst_entries_destroy(&ip6_dst_blackhole_ops);
5136         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5137 }