ipv6: prevent user from adding cached routes
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Result of evaluating a route's next hop.  All failure codes are
 * negative; rt6_score_route() passes a negative code straight back to
 * its caller.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,	/* find_match() rejects the route */
	RT6_NUD_FAIL_PROBE	= -2,	/* neighbour entry is NUD_FAILED */
	RT6_NUD_FAIL_DO_RR	= -1,	/* find_match() asks for round-robin */
	RT6_NUD_SUCCEED		= 1
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(rt->dst.from);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* .redirect handler of ip6_dst_blackhole_ops: does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* intentionally empty */
}
281
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325         .rt6i_protocol  = RTPROT_KERNEL,
326         .rt6i_metric    = ~(u32) 0,
327         .rt6i_ref       = ATOMIC_INIT(1),
328 };
329
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340         .rt6i_protocol  = RTPROT_KERNEL,
341         .rt6i_metric    = ~(u32) 0,
342         .rt6i_ref       = ATOMIC_INIT(1),
343 };
344
345 #endif
346
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct dst_entry *from = dst->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         dst->from = NULL;
413         dst_release(from);
414 }
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                        rt6_check_expired((struct rt6_info *)rt->dst.from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         /* We might have already computed the hash for ICMPv6 errors. In such
461          * case it will always be non-zero. Otherwise now is the time to do it.
462          */
463         if (!fl6->mp_hash)
464                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465
466         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467         /* Don't change the route, if route_choosen == 0
468          * (siblings does not include ourself)
469          */
470         if (route_choosen)
471                 list_for_each_entry_safe(sibling, next_sibling,
472                                 &match->rt6i_siblings, rt6i_siblings) {
473                         route_choosen--;
474                         if (route_choosen == 0) {
475                                 if (rt6_score_route(sibling, oif, strict) < 0)
476                                         break;
477                                 match = sibling;
478                                 break;
479                         }
480                 }
481         return match;
482 }
483
484 /*
485  *      Route lookup. rcu_read_lock() should be held.
486  */
487
488 static inline struct rt6_info *rt6_device_match(struct net *net,
489                                                     struct rt6_info *rt,
490                                                     const struct in6_addr *saddr,
491                                                     int oif,
492                                                     int flags)
493 {
494         struct rt6_info *local = NULL;
495         struct rt6_info *sprt;
496
497         if (!oif && ipv6_addr_any(saddr))
498                 goto out;
499
500         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
501                 struct net_device *dev = sprt->dst.dev;
502
503                 if (oif) {
504                         if (dev->ifindex == oif)
505                                 return sprt;
506                         if (dev->flags & IFF_LOOPBACK) {
507                                 if (!sprt->rt6i_idev ||
508                                     sprt->rt6i_idev->dev->ifindex != oif) {
509                                         if (flags & RT6_LOOKUP_F_IFACE)
510                                                 continue;
511                                         if (local &&
512                                             local->rt6i_idev->dev->ifindex == oif)
513                                                 continue;
514                                 }
515                                 local = sprt;
516                         }
517                 } else {
518                         if (ipv6_chk_addr(net, saddr, dev,
519                                           flags & RT6_LOOKUP_F_IFACE))
520                                 return sprt;
521                 }
522         }
523
524         if (oif) {
525                 if (local)
526                         return local;
527
528                 if (flags & RT6_LOOKUP_F_IFACE)
529                         return net->ipv6.ip6_null_entry;
530         }
531 out:
532         return rt;
533 }
534
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537         struct work_struct work;
538         struct in6_addr target;
539         struct net_device *dev;
540 };
541
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544         struct in6_addr mcaddr;
545         struct __rt6_probe_work *work =
546                 container_of(w, struct __rt6_probe_work, work);
547
548         addrconf_addr_solict_mult(&work->target, &mcaddr);
549         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550         dev_put(work->dev);
551         kfree(work);
552 }
553
554 static void rt6_probe(struct rt6_info *rt)
555 {
556         struct __rt6_probe_work *work;
557         struct neighbour *neigh;
558         /*
559          * Okay, this does not seem to be appropriate
560          * for now, however, we need to check if it
561          * is really so; aka Router Reachability Probing.
562          *
563          * Router Reachability Probe MUST be rate-limited
564          * to no more than one per minute.
565          */
566         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567                 return;
568         rcu_read_lock_bh();
569         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570         if (neigh) {
571                 if (neigh->nud_state & NUD_VALID)
572                         goto out;
573
574                 work = NULL;
575                 write_lock(&neigh->lock);
576                 if (!(neigh->nud_state & NUD_VALID) &&
577                     time_after(jiffies,
578                                neigh->updated +
579                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else {
586                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
587         }
588
589         if (work) {
590                 INIT_WORK(&work->work, rt6_probe_deferred);
591                 work->target = rt->rt6i_gateway;
592                 dev_hold(rt->dst.dev);
593                 work->dev = rt->dst.dev;
594                 schedule_work(&work->work);
595         }
596
597 out:
598         rcu_read_unlock_bh();
599 }
600 #else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* intentionally empty */
}
604 #endif
605
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611         struct net_device *dev = rt->dst.dev;
612         if (!oif || dev->ifindex == oif)
613                 return 2;
614         if ((dev->flags & IFF_LOOPBACK) &&
615             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616                 return 1;
617         return 0;
618 }
619
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622         struct neighbour *neigh;
623         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624
625         if (rt->rt6i_flags & RTF_NONEXTHOP ||
626             !(rt->rt6i_flags & RTF_GATEWAY))
627                 return RT6_NUD_SUCCEED;
628
629         rcu_read_lock_bh();
630         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631         if (neigh) {
632                 read_lock(&neigh->lock);
633                 if (neigh->nud_state & NUD_VALID)
634                         ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636                 else if (!(neigh->nud_state & NUD_FAILED))
637                         ret = RT6_NUD_SUCCEED;
638                 else
639                         ret = RT6_NUD_FAIL_PROBE;
640 #endif
641                 read_unlock(&neigh->lock);
642         } else {
643                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645         }
646         rcu_read_unlock_bh();
647
648         return ret;
649 }
650
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652                            int strict)
653 {
654         int m;
655
656         m = rt6_check_dev(rt, oif);
657         if (!m && (strict & RT6_LOOKUP_F_IFACE))
658                 return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662         if (strict & RT6_LOOKUP_F_REACHABLE) {
663                 int n = rt6_check_neigh(rt);
664                 if (n < 0)
665                         return n;
666         }
667         return m;
668 }
669
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671                                    int *mpri, struct rt6_info *match,
672                                    bool *do_rr)
673 {
674         int m;
675         bool match_do_rr = false;
676         struct inet6_dev *idev = rt->rt6i_idev;
677         struct net_device *dev = rt->dst.dev;
678
679         if (dev && !netif_carrier_ok(dev) &&
680             idev->cnf.ignore_routes_with_linkdown &&
681             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
682                 goto out;
683
684         if (rt6_check_expired(rt))
685                 goto out;
686
687         m = rt6_score_route(rt, oif, strict);
688         if (m == RT6_NUD_FAIL_DO_RR) {
689                 match_do_rr = true;
690                 m = 0; /* lowest valid score */
691         } else if (m == RT6_NUD_FAIL_HARD) {
692                 goto out;
693         }
694
695         if (strict & RT6_LOOKUP_F_REACHABLE)
696                 rt6_probe(rt);
697
698         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
699         if (m > *mpri) {
700                 *do_rr = match_do_rr;
701                 *mpri = m;
702                 match = rt;
703         }
704 out:
705         return match;
706 }
707
708 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
709                                      struct rt6_info *leaf,
710                                      struct rt6_info *rr_head,
711                                      u32 metric, int oif, int strict,
712                                      bool *do_rr)
713 {
714         struct rt6_info *rt, *match, *cont;
715         int mpri = -1;
716
717         match = NULL;
718         cont = NULL;
719         for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
720                 if (rt->rt6i_metric != metric) {
721                         cont = rt;
722                         break;
723                 }
724
725                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
726         }
727
728         for (rt = leaf; rt && rt != rr_head;
729              rt = rcu_dereference(rt->dst.rt6_next)) {
730                 if (rt->rt6i_metric != metric) {
731                         cont = rt;
732                         break;
733                 }
734
735                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
736         }
737
738         if (match || !cont)
739                 return match;
740
741         for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
742                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
743
744         return match;
745 }
746
747 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
748                                    int oif, int strict)
749 {
750         struct rt6_info *leaf = rcu_dereference(fn->leaf);
751         struct rt6_info *match, *rt0;
752         bool do_rr = false;
753         int key_plen;
754
755         if (!leaf || leaf == net->ipv6.ip6_null_entry)
756                 return net->ipv6.ip6_null_entry;
757
758         rt0 = rcu_dereference(fn->rr_ptr);
759         if (!rt0)
760                 rt0 = leaf;
761
762         /* Double check to make sure fn is not an intermediate node
763          * and fn->leaf does not points to its child's leaf
764          * (This might happen if all routes under fn are deleted from
765          * the tree and fib6_repair_tree() is called on the node.)
766          */
767         key_plen = rt0->rt6i_dst.plen;
768 #ifdef CONFIG_IPV6_SUBTREES
769         if (rt0->rt6i_src.plen)
770                 key_plen = rt0->rt6i_src.plen;
771 #endif
772         if (fn->fn_bit != key_plen)
773                 return net->ipv6.ip6_null_entry;
774
775         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
776                              &do_rr);
777
778         if (do_rr) {
779                 struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
780
781                 /* no entries matched; do round-robin */
782                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
783                         next = leaf;
784
785                 if (next != rt0) {
786                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
787                         /* make sure next is not being deleted from the tree */
788                         if (next->rt6i_node)
789                                 rcu_assign_pointer(fn->rr_ptr, next);
790                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
791                 }
792         }
793
794         return match ? match : net->ipv6.ip6_null_entry;
795 }
796
797 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
798 {
799         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
800 }
801
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process one Route Information option.
 *
 * @dev:    device associated with the option; also selects the netns
 * @opt:    start of the option, interpreted as a struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address used when looking up / adding the route
 *
 * The matching route is looked up (as a default router when the prefix
 * length is zero, as a route-info route otherwise).  A zero lifetime
 * deletes an existing route; a non-zero lifetime adds a missing route or
 * refreshes the preference bits of an existing one.  The expiry of the
 * resulting route is then set from the lifetime, or cleared when the
 * lifetime is not finite.
 *
 * Returns -EINVAL for a malformed option, 0 otherwise (including the
 * case where adding the route did not yield a route).
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length.
	 *
	 * RFC 4191, section 2.3: Length is in units of 8 octets and must
	 * be 3 when Prefix Length > 64, and 2 or 3 when Prefix Length > 0.
	 * Accepting a shorter option would let the prefix copy below read
	 * bytes past the end of the option.
	 * NOTE(review): assumes the fixed part of struct route_info is
	 * 8 octets, so Length 1/2/3 carries 0/8/16 prefix octets.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* A full 16-octet prefix is present; use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* Shorter option: build the prefix in a local buffer from
		 * the leading prefix_len bits only.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		/* Drop the reference obtained from the lookup/add above */
		ip6_rt_put(rt);
	}
	return 0;
}
#endif
877
878 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
879                                         struct in6_addr *saddr)
880 {
881         struct fib6_node *pn, *sn;
882         while (1) {
883                 if (fn->fn_flags & RTN_TL_ROOT)
884                         return NULL;
885                 pn = rcu_dereference(fn->parent);
886                 sn = FIB6_SUBTREE(pn);
887                 if (sn && sn != fn)
888                         fn = fib6_lookup(sn, NULL, saddr);
889                 else
890                         fn = pn;
891                 if (fn->fn_flags & RTN_RTINFO)
892                         return fn;
893         }
894 }
895
896 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
897                           bool null_fallback)
898 {
899         struct rt6_info *rt = *prt;
900
901         if (dst_hold_safe(&rt->dst))
902                 return true;
903         if (null_fallback) {
904                 rt = net->ipv6.ip6_null_entry;
905                 dst_hold(&rt->dst);
906         } else {
907                 rt = NULL;
908         }
909         *prt = rt;
910         return false;
911 }
912
913 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
914                                              struct fib6_table *table,
915                                              struct flowi6 *fl6, int flags)
916 {
917         struct rt6_info *rt, *rt_cache;
918         struct fib6_node *fn;
919
920         rcu_read_lock();
921         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
922 restart:
923         rt = rcu_dereference(fn->leaf);
924         if (!rt) {
925                 rt = net->ipv6.ip6_null_entry;
926         } else {
927                 rt = rt6_device_match(net, rt, &fl6->saddr,
928                                       fl6->flowi6_oif, flags);
929                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
930                         rt = rt6_multipath_select(rt, fl6,
931                                                   fl6->flowi6_oif, flags);
932         }
933         if (rt == net->ipv6.ip6_null_entry) {
934                 fn = fib6_backtrack(fn, &fl6->saddr);
935                 if (fn)
936                         goto restart;
937         }
938         /* Search through exception table */
939         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
940         if (rt_cache)
941                 rt = rt_cache;
942
943         if (ip6_hold_safe(net, &rt, true))
944                 dst_use_noref(&rt->dst, jiffies);
945
946         rcu_read_unlock();
947
948         trace_fib6_table_lookup(net, rt, table, fl6);
949
950         return rt;
951
952 }
953
954 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
955                                     int flags)
956 {
957         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
958 }
959 EXPORT_SYMBOL_GPL(ip6_route_lookup);
960
961 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
962                             const struct in6_addr *saddr, int oif, int strict)
963 {
964         struct flowi6 fl6 = {
965                 .flowi6_oif = oif,
966                 .daddr = *daddr,
967         };
968         struct dst_entry *dst;
969         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
970
971         if (saddr) {
972                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
973                 flags |= RT6_LOOKUP_F_HAS_SADDR;
974         }
975
976         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
977         if (dst->error == 0)
978                 return (struct rt6_info *) dst;
979
980         dst_release(dst);
981
982         return NULL;
983 }
984 EXPORT_SYMBOL(rt6_lookup);
985
986 /* ip6_ins_rt is called with FREE table->tb6_lock.
987  * It takes new route entry, the addition fails by any reason the
988  * route is released.
989  * Caller must hold dst before calling it.
990  */
991
992 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
993                         struct mx6_config *mxc,
994                         struct netlink_ext_ack *extack)
995 {
996         int err;
997         struct fib6_table *table;
998
999         table = rt->rt6i_table;
1000         spin_lock_bh(&table->tb6_lock);
1001         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1002         spin_unlock_bh(&table->tb6_lock);
1003
1004         return err;
1005 }
1006
1007 int ip6_ins_rt(struct rt6_info *rt)
1008 {
1009         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1010         struct mx6_config mxc = { .mx = NULL, };
1011
1012         /* Hold dst to account for the reference from the fib6 tree */
1013         dst_hold(&rt->dst);
1014         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1015 }
1016
1017 /* called with rcu_lock held */
1018 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1019 {
1020         struct net_device *dev = rt->dst.dev;
1021
1022         if (rt->rt6i_flags & RTF_LOCAL) {
1023                 /* for copies of local routes, dst->dev needs to be the
1024                  * device if it is a master device, the master device if
1025                  * device is enslaved, and the loopback as the default
1026                  */
1027                 if (netif_is_l3_slave(dev) &&
1028                     !rt6_need_strict(&rt->rt6i_dst.addr))
1029                         dev = l3mdev_master_dev_rcu(dev);
1030                 else if (!netif_is_l3_master(dev))
1031                         dev = dev_net(dev)->loopback_dev;
1032                 /* last case is netif_is_l3_master(dev) is true in which
1033                  * case we want dev returned to be dev
1034                  */
1035         }
1036
1037         return dev;
1038 }
1039
1040 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1041                                            const struct in6_addr *daddr,
1042                                            const struct in6_addr *saddr)
1043 {
1044         struct net_device *dev;
1045         struct rt6_info *rt;
1046
1047         /*
1048          *      Clone the route.
1049          */
1050
1051         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1052                 ort = (struct rt6_info *)ort->dst.from;
1053
1054         rcu_read_lock();
1055         dev = ip6_rt_get_dev_rcu(ort);
1056         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1057         rcu_read_unlock();
1058         if (!rt)
1059                 return NULL;
1060
1061         ip6_rt_copy_init(rt, ort);
1062         rt->rt6i_flags |= RTF_CACHE;
1063         rt->rt6i_metric = 0;
1064         rt->dst.flags |= DST_HOST;
1065         rt->rt6i_dst.addr = *daddr;
1066         rt->rt6i_dst.plen = 128;
1067
1068         if (!rt6_is_gw_or_nonexthop(ort)) {
1069                 if (ort->rt6i_dst.plen != 128 &&
1070                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1071                         rt->rt6i_flags |= RTF_ANYCAST;
1072 #ifdef CONFIG_IPV6_SUBTREES
1073                 if (rt->rt6i_src.plen && saddr) {
1074                         rt->rt6i_src.addr = *saddr;
1075                         rt->rt6i_src.plen = 128;
1076                 }
1077 #endif
1078         }
1079
1080         return rt;
1081 }
1082
1083 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1084 {
1085         struct net_device *dev;
1086         struct rt6_info *pcpu_rt;
1087
1088         rcu_read_lock();
1089         dev = ip6_rt_get_dev_rcu(rt);
1090         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1091         rcu_read_unlock();
1092         if (!pcpu_rt)
1093                 return NULL;
1094         ip6_rt_copy_init(pcpu_rt, rt);
1095         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1096         pcpu_rt->rt6i_flags |= RTF_PCPU;
1097         return pcpu_rt;
1098 }
1099
1100 /* It should be called with rcu_read_lock() acquired */
1101 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1102 {
1103         struct rt6_info *pcpu_rt, **p;
1104
1105         p = this_cpu_ptr(rt->rt6i_pcpu);
1106         pcpu_rt = *p;
1107
1108         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1109                 rt6_dst_from_metrics_check(pcpu_rt);
1110
1111         return pcpu_rt;
1112 }
1113
1114 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1115 {
1116         struct rt6_info *pcpu_rt, *prev, **p;
1117
1118         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1119         if (!pcpu_rt) {
1120                 struct net *net = dev_net(rt->dst.dev);
1121
1122                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1123                 return net->ipv6.ip6_null_entry;
1124         }
1125
1126         dst_hold(&pcpu_rt->dst);
1127         p = this_cpu_ptr(rt->rt6i_pcpu);
1128         prev = cmpxchg(p, NULL, pcpu_rt);
1129         BUG_ON(prev);
1130
1131         rt6_dst_from_metrics_check(pcpu_rt);
1132         return pcpu_rt;
1133 }
1134
/* Exception hash table implementation.
 *
 * rt6_exception_lock is the spinlock that the helpers below either take
 * themselves or require their caller to hold while modifying the
 * exception tables.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1138
1139 /* Remove rt6_ex from hash table and free the memory
1140  * Caller must hold rt6_exception_lock
1141  */
1142 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1143                                  struct rt6_exception *rt6_ex)
1144 {
1145         struct net *net;
1146
1147         if (!bucket || !rt6_ex)
1148                 return;
1149
1150         net = dev_net(rt6_ex->rt6i->dst.dev);
1151         rt6_ex->rt6i->rt6i_node = NULL;
1152         hlist_del_rcu(&rt6_ex->hlist);
1153         rt6_release(rt6_ex->rt6i);
1154         kfree_rcu(rt6_ex, rcu);
1155         WARN_ON_ONCE(!bucket->depth);
1156         bucket->depth--;
1157         net->ipv6.rt6_stats->fib_rt_cache--;
1158 }
1159
1160 /* Remove oldest rt6_ex in bucket and free the memory
1161  * Caller must hold rt6_exception_lock
1162  */
1163 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1164 {
1165         struct rt6_exception *rt6_ex, *oldest = NULL;
1166
1167         if (!bucket)
1168                 return;
1169
1170         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1171                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1172                         oldest = rt6_ex;
1173         }
1174         rt6_remove_exception(bucket, oldest);
1175 }
1176
1177 static u32 rt6_exception_hash(const struct in6_addr *dst,
1178                               const struct in6_addr *src)
1179 {
1180         static u32 seed __read_mostly;
1181         u32 val;
1182
1183         net_get_random_once(&seed, sizeof(seed));
1184         val = jhash(dst, sizeof(*dst), seed);
1185
1186 #ifdef CONFIG_IPV6_SUBTREES
1187         if (src)
1188                 val = jhash(src, sizeof(*src), val);
1189 #endif
1190         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1191 }
1192
1193 /* Helper function to find the cached rt in the hash table
1194  * and update bucket pointer to point to the bucket for this
1195  * (daddr, saddr) pair
1196  * Caller must hold rt6_exception_lock
1197  */
1198 static struct rt6_exception *
1199 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1200                               const struct in6_addr *daddr,
1201                               const struct in6_addr *saddr)
1202 {
1203         struct rt6_exception *rt6_ex;
1204         u32 hval;
1205
1206         if (!(*bucket) || !daddr)
1207                 return NULL;
1208
1209         hval = rt6_exception_hash(daddr, saddr);
1210         *bucket += hval;
1211
1212         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1213                 struct rt6_info *rt6 = rt6_ex->rt6i;
1214                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1215
1216 #ifdef CONFIG_IPV6_SUBTREES
1217                 if (matched && saddr)
1218                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1219 #endif
1220                 if (matched)
1221                         return rt6_ex;
1222         }
1223         return NULL;
1224 }
1225
1226 /* Helper function to find the cached rt in the hash table
1227  * and update bucket pointer to point to the bucket for this
1228  * (daddr, saddr) pair
1229  * Caller must hold rcu_read_lock()
1230  */
1231 static struct rt6_exception *
1232 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1233                          const struct in6_addr *daddr,
1234                          const struct in6_addr *saddr)
1235 {
1236         struct rt6_exception *rt6_ex;
1237         u32 hval;
1238
1239         WARN_ON_ONCE(!rcu_read_lock_held());
1240
1241         if (!(*bucket) || !daddr)
1242                 return NULL;
1243
1244         hval = rt6_exception_hash(daddr, saddr);
1245         *bucket += hval;
1246
1247         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1248                 struct rt6_info *rt6 = rt6_ex->rt6i;
1249                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1250
1251 #ifdef CONFIG_IPV6_SUBTREES
1252                 if (matched && saddr)
1253                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1254 #endif
1255                 if (matched)
1256                         return rt6_ex;
1257         }
1258         return NULL;
1259 }
1260
1261 static int rt6_insert_exception(struct rt6_info *nrt,
1262                                 struct rt6_info *ort)
1263 {
1264         struct net *net = dev_net(ort->dst.dev);
1265         struct rt6_exception_bucket *bucket;
1266         struct in6_addr *src_key = NULL;
1267         struct rt6_exception *rt6_ex;
1268         int err = 0;
1269
1270         /* ort can't be a cache or pcpu route */
1271         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1272                 ort = (struct rt6_info *)ort->dst.from;
1273         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1274
1275         spin_lock_bh(&rt6_exception_lock);
1276
1277         if (ort->exception_bucket_flushed) {
1278                 err = -EINVAL;
1279                 goto out;
1280         }
1281
1282         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1283                                         lockdep_is_held(&rt6_exception_lock));
1284         if (!bucket) {
1285                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1286                                  GFP_ATOMIC);
1287                 if (!bucket) {
1288                         err = -ENOMEM;
1289                         goto out;
1290                 }
1291                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1292         }
1293
1294 #ifdef CONFIG_IPV6_SUBTREES
1295         /* rt6i_src.plen != 0 indicates ort is in subtree
1296          * and exception table is indexed by a hash of
1297          * both rt6i_dst and rt6i_src.
1298          * Otherwise, the exception table is indexed by
1299          * a hash of only rt6i_dst.
1300          */
1301         if (ort->rt6i_src.plen)
1302                 src_key = &nrt->rt6i_src.addr;
1303 #endif
1304
1305         /* Update rt6i_prefsrc as it could be changed
1306          * in rt6_remove_prefsrc()
1307          */
1308         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1309         /* rt6_mtu_change() might lower mtu on ort.
1310          * Only insert this exception route if its mtu
1311          * is less than ort's mtu value.
1312          */
1313         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1314                 err = -EINVAL;
1315                 goto out;
1316         }
1317
1318         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1319                                                src_key);
1320         if (rt6_ex)
1321                 rt6_remove_exception(bucket, rt6_ex);
1322
1323         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1324         if (!rt6_ex) {
1325                 err = -ENOMEM;
1326                 goto out;
1327         }
1328         rt6_ex->rt6i = nrt;
1329         rt6_ex->stamp = jiffies;
1330         atomic_inc(&nrt->rt6i_ref);
1331         nrt->rt6i_node = ort->rt6i_node;
1332         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1333         bucket->depth++;
1334         net->ipv6.rt6_stats->fib_rt_cache++;
1335
1336         if (bucket->depth > FIB6_MAX_DEPTH)
1337                 rt6_exception_remove_oldest(bucket);
1338
1339 out:
1340         spin_unlock_bh(&rt6_exception_lock);
1341
1342         /* Update fn->fn_sernum to invalidate all cached dst */
1343         if (!err) {
1344                 fib6_update_sernum(ort);
1345                 fib6_force_start_gc(net);
1346         }
1347
1348         return err;
1349 }
1350
1351 void rt6_flush_exceptions(struct rt6_info *rt)
1352 {
1353         struct rt6_exception_bucket *bucket;
1354         struct rt6_exception *rt6_ex;
1355         struct hlist_node *tmp;
1356         int i;
1357
1358         spin_lock_bh(&rt6_exception_lock);
1359         /* Prevent rt6_insert_exception() to recreate the bucket list */
1360         rt->exception_bucket_flushed = 1;
1361
1362         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1363                                     lockdep_is_held(&rt6_exception_lock));
1364         if (!bucket)
1365                 goto out;
1366
1367         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1368                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1369                         rt6_remove_exception(bucket, rt6_ex);
1370                 WARN_ON_ONCE(bucket->depth);
1371                 bucket++;
1372         }
1373
1374 out:
1375         spin_unlock_bh(&rt6_exception_lock);
1376 }
1377
1378 /* Find cached rt in the hash table inside passed in rt
1379  * Caller has to hold rcu_read_lock()
1380  */
1381 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1382                                            struct in6_addr *daddr,
1383                                            struct in6_addr *saddr)
1384 {
1385         struct rt6_exception_bucket *bucket;
1386         struct in6_addr *src_key = NULL;
1387         struct rt6_exception *rt6_ex;
1388         struct rt6_info *res = NULL;
1389
1390         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1391
1392 #ifdef CONFIG_IPV6_SUBTREES
1393         /* rt6i_src.plen != 0 indicates rt is in subtree
1394          * and exception table is indexed by a hash of
1395          * both rt6i_dst and rt6i_src.
1396          * Otherwise, the exception table is indexed by
1397          * a hash of only rt6i_dst.
1398          */
1399         if (rt->rt6i_src.plen)
1400                 src_key = saddr;
1401 #endif
1402         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1403
1404         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1405                 res = rt6_ex->rt6i;
1406
1407         return res;
1408 }
1409
1410 /* Remove the passed in cached rt from the hash table that contains it */
1411 int rt6_remove_exception_rt(struct rt6_info *rt)
1412 {
1413         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1414         struct rt6_exception_bucket *bucket;
1415         struct in6_addr *src_key = NULL;
1416         struct rt6_exception *rt6_ex;
1417         int err;
1418
1419         if (!from ||
1420             !(rt->rt6i_flags & RTF_CACHE))
1421                 return -EINVAL;
1422
1423         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1424                 return -ENOENT;
1425
1426         spin_lock_bh(&rt6_exception_lock);
1427         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1428                                     lockdep_is_held(&rt6_exception_lock));
1429 #ifdef CONFIG_IPV6_SUBTREES
1430         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1431          * and exception table is indexed by a hash of
1432          * both rt6i_dst and rt6i_src.
1433          * Otherwise, the exception table is indexed by
1434          * a hash of only rt6i_dst.
1435          */
1436         if (from->rt6i_src.plen)
1437                 src_key = &rt->rt6i_src.addr;
1438 #endif
1439         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1440                                                &rt->rt6i_dst.addr,
1441                                                src_key);
1442         if (rt6_ex) {
1443                 rt6_remove_exception(bucket, rt6_ex);
1444                 err = 0;
1445         } else {
1446                 err = -ENOENT;
1447         }
1448
1449         spin_unlock_bh(&rt6_exception_lock);
1450         return err;
1451 }
1452
1453 /* Find rt6_ex which contains the passed in rt cache and
1454  * refresh its stamp
1455  */
1456 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1457 {
1458         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1459         struct rt6_exception_bucket *bucket;
1460         struct in6_addr *src_key = NULL;
1461         struct rt6_exception *rt6_ex;
1462
1463         if (!from ||
1464             !(rt->rt6i_flags & RTF_CACHE))
1465                 return;
1466
1467         rcu_read_lock();
1468         bucket = rcu_dereference(from->rt6i_exception_bucket);
1469
1470 #ifdef CONFIG_IPV6_SUBTREES
1471         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1472          * and exception table is indexed by a hash of
1473          * both rt6i_dst and rt6i_src.
1474          * Otherwise, the exception table is indexed by
1475          * a hash of only rt6i_dst.
1476          */
1477         if (from->rt6i_src.plen)
1478                 src_key = &rt->rt6i_src.addr;
1479 #endif
1480         rt6_ex = __rt6_find_exception_rcu(&bucket,
1481                                           &rt->rt6i_dst.addr,
1482                                           src_key);
1483         if (rt6_ex)
1484                 rt6_ex->stamp = jiffies;
1485
1486         rcu_read_unlock();
1487 }
1488
1489 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1490 {
1491         struct rt6_exception_bucket *bucket;
1492         struct rt6_exception *rt6_ex;
1493         int i;
1494
1495         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1496                                         lockdep_is_held(&rt6_exception_lock));
1497
1498         if (bucket) {
1499                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1500                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1501                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1502                         }
1503                         bucket++;
1504                 }
1505         }
1506 }
1507
1508 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1509 {
1510         struct rt6_exception_bucket *bucket;
1511         struct rt6_exception *rt6_ex;
1512         int i;
1513
1514         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1515                                         lockdep_is_held(&rt6_exception_lock));
1516
1517         if (bucket) {
1518                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1519                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1520                                 struct rt6_info *entry = rt6_ex->rt6i;
1521                                 /* For RTF_CACHE with rt6i_pmtu == 0
1522                                  * (i.e. a redirected route),
1523                                  * the metrics of its rt->dst.from has already
1524                                  * been updated.
1525                                  */
1526                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1527                                         entry->rt6i_pmtu = mtu;
1528                         }
1529                         bucket++;
1530                 }
1531         }
1532 }
1533
1534 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1535
1536 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1537                                         struct in6_addr *gateway)
1538 {
1539         struct rt6_exception_bucket *bucket;
1540         struct rt6_exception *rt6_ex;
1541         struct hlist_node *tmp;
1542         int i;
1543
1544         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1545                 return;
1546
1547         spin_lock_bh(&rt6_exception_lock);
1548         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1549                                      lockdep_is_held(&rt6_exception_lock));
1550
1551         if (bucket) {
1552                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1553                         hlist_for_each_entry_safe(rt6_ex, tmp,
1554                                                   &bucket->chain, hlist) {
1555                                 struct rt6_info *entry = rt6_ex->rt6i;
1556
1557                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1558                                     RTF_CACHE_GATEWAY &&
1559                                     ipv6_addr_equal(gateway,
1560                                                     &entry->rt6i_gateway)) {
1561                                         rt6_remove_exception(bucket, rt6_ex);
1562                                 }
1563                         }
1564                         bucket++;
1565                 }
1566         }
1567
1568         spin_unlock_bh(&rt6_exception_lock);
1569 }
1570
1571 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1572                                       struct rt6_exception *rt6_ex,
1573                                       struct fib6_gc_args *gc_args,
1574                                       unsigned long now)
1575 {
1576         struct rt6_info *rt = rt6_ex->rt6i;
1577
1578         /* we are pruning and obsoleting aged-out and non gateway exceptions
1579          * even if others have still references to them, so that on next
1580          * dst_check() such references can be dropped.
1581          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1582          * expired, independently from their aging, as per RFC 8201 section 4
1583          */
1584         if (!(rt->rt6i_flags & RTF_EXPIRES) &&
1585             time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1586                 RT6_TRACE("aging clone %p\n", rt);
1587                 rt6_remove_exception(bucket, rt6_ex);
1588                 return;
1589         } else if (rt->rt6i_flags & RTF_GATEWAY) {
1590                 struct neighbour *neigh;
1591                 __u8 neigh_flags = 0;
1592
1593                 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1594                 if (neigh) {
1595                         neigh_flags = neigh->flags;
1596                         neigh_release(neigh);
1597                 }
1598                 if (!(neigh_flags & NTF_ROUTER)) {
1599                         RT6_TRACE("purging route %p via non-router but gateway\n",
1600                                   rt);
1601                         rt6_remove_exception(bucket, rt6_ex);
1602                         return;
1603                 }
1604         } else if (__rt6_check_expired(rt)) {
1605                 RT6_TRACE("purging expired route %p\n", rt);
1606                 rt6_remove_exception(bucket, rt6_ex);
1607                 return;
1608         }
1609         gc_args->more++;
1610 }
1611
1612 void rt6_age_exceptions(struct rt6_info *rt,
1613                         struct fib6_gc_args *gc_args,
1614                         unsigned long now)
1615 {
1616         struct rt6_exception_bucket *bucket;
1617         struct rt6_exception *rt6_ex;
1618         struct hlist_node *tmp;
1619         int i;
1620
1621         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1622                 return;
1623
1624         spin_lock_bh(&rt6_exception_lock);
1625         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1626                                     lockdep_is_held(&rt6_exception_lock));
1627
1628         if (bucket) {
1629                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1630                         hlist_for_each_entry_safe(rt6_ex, tmp,
1631                                                   &bucket->chain, hlist) {
1632                                 rt6_age_examine_exception(bucket, rt6_ex,
1633                                                           gc_args, now);
1634                         }
1635                         bucket++;
1636                 }
1637         }
1638         spin_unlock_bh(&rt6_exception_lock);
1639 }
1640
1641 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1642                                int oif, struct flowi6 *fl6, int flags)
1643 {
1644         struct fib6_node *fn, *saved_fn;
1645         struct rt6_info *rt, *rt_cache;
1646         int strict = 0;
1647
1648         strict |= flags & RT6_LOOKUP_F_IFACE;
1649         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1650         if (net->ipv6.devconf_all->forwarding == 0)
1651                 strict |= RT6_LOOKUP_F_REACHABLE;
1652
1653         rcu_read_lock();
1654
1655         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1656         saved_fn = fn;
1657
1658         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1659                 oif = 0;
1660
1661 redo_rt6_select:
1662         rt = rt6_select(net, fn, oif, strict);
1663         if (rt->rt6i_nsiblings)
1664                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1665         if (rt == net->ipv6.ip6_null_entry) {
1666                 fn = fib6_backtrack(fn, &fl6->saddr);
1667                 if (fn)
1668                         goto redo_rt6_select;
1669                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1670                         /* also consider unreachable route */
1671                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1672                         fn = saved_fn;
1673                         goto redo_rt6_select;
1674                 }
1675         }
1676
1677         /*Search through exception table */
1678         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1679         if (rt_cache)
1680                 rt = rt_cache;
1681
1682         if (rt == net->ipv6.ip6_null_entry) {
1683                 rcu_read_unlock();
1684                 dst_hold(&rt->dst);
1685                 trace_fib6_table_lookup(net, rt, table, fl6);
1686                 return rt;
1687         } else if (rt->rt6i_flags & RTF_CACHE) {
1688                 if (ip6_hold_safe(net, &rt, true)) {
1689                         dst_use_noref(&rt->dst, jiffies);
1690                         rt6_dst_from_metrics_check(rt);
1691                 }
1692                 rcu_read_unlock();
1693                 trace_fib6_table_lookup(net, rt, table, fl6);
1694                 return rt;
1695         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1696                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1697                 /* Create a RTF_CACHE clone which will not be
1698                  * owned by the fib6 tree.  It is for the special case where
1699                  * the daddr in the skb during the neighbor look-up is different
1700                  * from the fl6->daddr used to look-up route here.
1701                  */
1702
1703                 struct rt6_info *uncached_rt;
1704
1705                 if (ip6_hold_safe(net, &rt, true)) {
1706                         dst_use_noref(&rt->dst, jiffies);
1707                 } else {
1708                         rcu_read_unlock();
1709                         uncached_rt = rt;
1710                         goto uncached_rt_out;
1711                 }
1712                 rcu_read_unlock();
1713
1714                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1715                 dst_release(&rt->dst);
1716
1717                 if (uncached_rt) {
1718                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1719                          * No need for another dst_hold()
1720                          */
1721                         rt6_uncached_list_add(uncached_rt);
1722                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1723                 } else {
1724                         uncached_rt = net->ipv6.ip6_null_entry;
1725                         dst_hold(&uncached_rt->dst);
1726                 }
1727
1728 uncached_rt_out:
1729                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1730                 return uncached_rt;
1731
1732         } else {
1733                 /* Get a percpu copy */
1734
1735                 struct rt6_info *pcpu_rt;
1736
1737                 dst_use_noref(&rt->dst, jiffies);
1738                 local_bh_disable();
1739                 pcpu_rt = rt6_get_pcpu_route(rt);
1740
1741                 if (!pcpu_rt) {
1742                         /* atomic_inc_not_zero() is needed when using rcu */
1743                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1744                                 /* No dst_hold() on rt is needed because grabbing
1745                                  * rt->rt6i_ref makes sure rt can't be released.
1746                                  */
1747                                 pcpu_rt = rt6_make_pcpu_route(rt);
1748                                 rt6_release(rt);
1749                         } else {
1750                                 /* rt is already removed from tree */
1751                                 pcpu_rt = net->ipv6.ip6_null_entry;
1752                                 dst_hold(&pcpu_rt->dst);
1753                         }
1754                 }
1755                 local_bh_enable();
1756                 rcu_read_unlock();
1757                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1758                 return pcpu_rt;
1759         }
1760 }
1761 EXPORT_SYMBOL_GPL(ip6_pol_route);
1762
1763 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1764                                             struct flowi6 *fl6, int flags)
1765 {
1766         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1767 }
1768
1769 struct dst_entry *ip6_route_input_lookup(struct net *net,
1770                                          struct net_device *dev,
1771                                          struct flowi6 *fl6, int flags)
1772 {
1773         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1774                 flags |= RT6_LOOKUP_F_IFACE;
1775
1776         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1777 }
1778 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1779
1780 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1781                                   struct flow_keys *keys)
1782 {
1783         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1784         const struct ipv6hdr *key_iph = outer_iph;
1785         const struct ipv6hdr *inner_iph;
1786         const struct icmp6hdr *icmph;
1787         struct ipv6hdr _inner_iph;
1788
1789         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1790                 goto out;
1791
1792         icmph = icmp6_hdr(skb);
1793         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1794             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1795             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1796             icmph->icmp6_type != ICMPV6_PARAMPROB)
1797                 goto out;
1798
1799         inner_iph = skb_header_pointer(skb,
1800                                        skb_transport_offset(skb) + sizeof(*icmph),
1801                                        sizeof(_inner_iph), &_inner_iph);
1802         if (!inner_iph)
1803                 goto out;
1804
1805         key_iph = inner_iph;
1806 out:
1807         memset(keys, 0, sizeof(*keys));
1808         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1809         keys->addrs.v6addrs.src = key_iph->saddr;
1810         keys->addrs.v6addrs.dst = key_iph->daddr;
1811         keys->tags.flow_label = ip6_flowinfo(key_iph);
1812         keys->basic.ip_proto = key_iph->nexthdr;
1813 }
1814
1815 /* if skb is set it will be used and fl6 can be NULL */
1816 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1817 {
1818         struct flow_keys hash_keys;
1819
1820         if (skb) {
1821                 ip6_multipath_l3_keys(skb, &hash_keys);
1822                 return flow_hash_from_keys(&hash_keys);
1823         }
1824
1825         return get_hash_from_flowi6(fl6);
1826 }
1827
1828 void ip6_route_input(struct sk_buff *skb)
1829 {
1830         const struct ipv6hdr *iph = ipv6_hdr(skb);
1831         struct net *net = dev_net(skb->dev);
1832         int flags = RT6_LOOKUP_F_HAS_SADDR;
1833         struct ip_tunnel_info *tun_info;
1834         struct flowi6 fl6 = {
1835                 .flowi6_iif = skb->dev->ifindex,
1836                 .daddr = iph->daddr,
1837                 .saddr = iph->saddr,
1838                 .flowlabel = ip6_flowinfo(iph),
1839                 .flowi6_mark = skb->mark,
1840                 .flowi6_proto = iph->nexthdr,
1841         };
1842
1843         tun_info = skb_tunnel_info(skb);
1844         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1845                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1846         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1847                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1848         skb_dst_drop(skb);
1849         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1850 }
1851
1852 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1853                                              struct flowi6 *fl6, int flags)
1854 {
1855         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1856 }
1857
1858 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1859                                          struct flowi6 *fl6, int flags)
1860 {
1861         bool any_src;
1862
1863         if (rt6_need_strict(&fl6->daddr)) {
1864                 struct dst_entry *dst;
1865
1866                 dst = l3mdev_link_scope_lookup(net, fl6);
1867                 if (dst)
1868                         return dst;
1869         }
1870
1871         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1872
1873         any_src = ipv6_addr_any(&fl6->saddr);
1874         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1875             (fl6->flowi6_oif && any_src))
1876                 flags |= RT6_LOOKUP_F_IFACE;
1877
1878         if (!any_src)
1879                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1880         else if (sk)
1881                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1882
1883         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1884 }
1885 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1886
1887 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1888 {
1889         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1890         struct net_device *loopback_dev = net->loopback_dev;
1891         struct dst_entry *new = NULL;
1892
1893         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1894                        DST_OBSOLETE_DEAD, 0);
1895         if (rt) {
1896                 rt6_info_init(rt);
1897                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1898
1899                 new = &rt->dst;
1900                 new->__use = 1;
1901                 new->input = dst_discard;
1902                 new->output = dst_discard_out;
1903
1904                 dst_copy_metrics(new, &ort->dst);
1905
1906                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1907                 rt->rt6i_gateway = ort->rt6i_gateway;
1908                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1909                 rt->rt6i_metric = 0;
1910
1911                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1912 #ifdef CONFIG_IPV6_SUBTREES
1913                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1914 #endif
1915         }
1916
1917         dst_release(dst_orig);
1918         return new ? new : ERR_PTR(-ENOMEM);
1919 }
1920
1921 /*
1922  *      Destination cache support functions
1923  */
1924
1925 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1926 {
1927         if (rt->dst.from &&
1928             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1929                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1930 }
1931
1932 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1933 {
1934         u32 rt_cookie = 0;
1935
1936         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1937                 return NULL;
1938
1939         if (rt6_check_expired(rt))
1940                 return NULL;
1941
1942         return &rt->dst;
1943 }
1944
1945 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1946 {
1947         if (!__rt6_check_expired(rt) &&
1948             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1949             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1950                 return &rt->dst;
1951         else
1952                 return NULL;
1953 }
1954
1955 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1956 {
1957         struct rt6_info *rt;
1958
1959         rt = (struct rt6_info *) dst;
1960
1961         /* All IPV6 dsts are created with ->obsolete set to the value
1962          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1963          * into this function always.
1964          */
1965
1966         rt6_dst_from_metrics_check(rt);
1967
1968         if (rt->rt6i_flags & RTF_PCPU ||
1969             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1970                 return rt6_dst_from_check(rt, cookie);
1971         else
1972                 return rt6_check(rt, cookie);
1973 }
1974
1975 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1976 {
1977         struct rt6_info *rt = (struct rt6_info *) dst;
1978
1979         if (rt) {
1980                 if (rt->rt6i_flags & RTF_CACHE) {
1981                         if (rt6_check_expired(rt)) {
1982                                 ip6_del_rt(rt);
1983                                 dst = NULL;
1984                         }
1985                 } else {
1986                         dst_release(dst);
1987                         dst = NULL;
1988                 }
1989         }
1990         return dst;
1991 }
1992
1993 static void ip6_link_failure(struct sk_buff *skb)
1994 {
1995         struct rt6_info *rt;
1996
1997         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1998
1999         rt = (struct rt6_info *) skb_dst(skb);
2000         if (rt) {
2001                 if (rt->rt6i_flags & RTF_CACHE) {
2002                         if (dst_hold_safe(&rt->dst))
2003                                 ip6_del_rt(rt);
2004                 } else {
2005                         struct fib6_node *fn;
2006
2007                         rcu_read_lock();
2008                         fn = rcu_dereference(rt->rt6i_node);
2009                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2010                                 fn->fn_sernum = -1;
2011                         rcu_read_unlock();
2012                 }
2013         }
2014 }
2015
2016 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2017 {
2018         struct net *net = dev_net(rt->dst.dev);
2019
2020         rt->rt6i_flags |= RTF_MODIFIED;
2021         rt->rt6i_pmtu = mtu;
2022         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2023 }
2024
2025 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2026 {
2027         return !(rt->rt6i_flags & RTF_CACHE) &&
2028                 (rt->rt6i_flags & RTF_PCPU ||
2029                  rcu_access_pointer(rt->rt6i_node));
2030 }
2031
2032 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2033                                  const struct ipv6hdr *iph, u32 mtu)
2034 {
2035         const struct in6_addr *daddr, *saddr;
2036         struct rt6_info *rt6 = (struct rt6_info *)dst;
2037
2038         if (rt6->rt6i_flags & RTF_LOCAL)
2039                 return;
2040
2041         if (dst_metric_locked(dst, RTAX_MTU))
2042                 return;
2043
2044         if (iph) {
2045                 daddr = &iph->daddr;
2046                 saddr = &iph->saddr;
2047         } else if (sk) {
2048                 daddr = &sk->sk_v6_daddr;
2049                 saddr = &inet6_sk(sk)->saddr;
2050         } else {
2051                 daddr = NULL;
2052                 saddr = NULL;
2053         }
2054         dst_confirm_neigh(dst, daddr);
2055         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2056         if (mtu >= dst_mtu(dst))
2057                 return;
2058
2059         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2060                 rt6_do_update_pmtu(rt6, mtu);
2061                 /* update rt6_ex->stamp for cache */
2062                 if (rt6->rt6i_flags & RTF_CACHE)
2063                         rt6_update_exception_stamp_rt(rt6);
2064         } else if (daddr) {
2065                 struct rt6_info *nrt6;
2066
2067                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2068                 if (nrt6) {
2069                         rt6_do_update_pmtu(nrt6, mtu);
2070                         if (rt6_insert_exception(nrt6, rt6))
2071                                 dst_release_immediate(&nrt6->dst);
2072                 }
2073         }
2074 }
2075
2076 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2077                                struct sk_buff *skb, u32 mtu)
2078 {
2079         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2080 }
2081
2082 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2083                      int oif, u32 mark, kuid_t uid)
2084 {
2085         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2086         struct dst_entry *dst;
2087         struct flowi6 fl6;
2088
2089         memset(&fl6, 0, sizeof(fl6));
2090         fl6.flowi6_oif = oif;
2091         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2092         fl6.daddr = iph->daddr;
2093         fl6.saddr = iph->saddr;
2094         fl6.flowlabel = ip6_flowinfo(iph);
2095         fl6.flowi6_uid = uid;
2096
2097         dst = ip6_route_output(net, NULL, &fl6);
2098         if (!dst->error)
2099                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2100         dst_release(dst);
2101 }
2102 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2103
2104 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2105 {
2106         struct dst_entry *dst;
2107
2108         ip6_update_pmtu(skb, sock_net(sk), mtu,
2109                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2110
2111         dst = __sk_dst_get(sk);
2112         if (!dst || !dst->obsolete ||
2113             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2114                 return;
2115
2116         bh_lock_sock(sk);
2117         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2118                 ip6_datagram_dst_update(sk, false);
2119         bh_unlock_sock(sk);
2120 }
2121 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2122
/* Handle redirects */

/* Flow key extended with the redirecting gateway.  The flowi6 must stay the
 * first member: __ip6_route_redirect() receives a pointer to it and casts
 * that pointer back to struct ip6rd_flowi to reach the gateway.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* flow handed to the rule lookup */
	struct in6_addr gateway;	/* address compared against route gateways */
};
2128
2129 static struct rt6_info *__ip6_route_redirect(struct net *net,
2130                                              struct fib6_table *table,
2131                                              struct flowi6 *fl6,
2132                                              int flags)
2133 {
2134         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2135         struct rt6_info *rt, *rt_cache;
2136         struct fib6_node *fn;
2137
2138         /* Get the "current" route for this destination and
2139          * check if the redirect has come from appropriate router.
2140          *
2141          * RFC 4861 specifies that redirects should only be
2142          * accepted if they come from the nexthop to the target.
2143          * Due to the way the routes are chosen, this notion
2144          * is a bit fuzzy and one might need to check all possible
2145          * routes.
2146          */
2147
2148         rcu_read_lock();
2149         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2150 restart:
2151         for_each_fib6_node_rt_rcu(fn) {
2152                 if (rt6_check_expired(rt))
2153                         continue;
2154                 if (rt->dst.error)
2155                         break;
2156                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2157                         continue;
2158                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2159                         continue;
2160                 /* rt_cache's gateway might be different from its 'parent'
2161                  * in the case of an ip redirect.
2162                  * So we keep searching in the exception table if the gateway
2163                  * is different.
2164                  */
2165                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2166                         rt_cache = rt6_find_cached_rt(rt,
2167                                                       &fl6->daddr,
2168                                                       &fl6->saddr);
2169                         if (rt_cache &&
2170                             ipv6_addr_equal(&rdfl->gateway,
2171                                             &rt_cache->rt6i_gateway)) {
2172                                 rt = rt_cache;
2173                                 break;
2174                         }
2175                         continue;
2176                 }
2177                 break;
2178         }
2179
2180         if (!rt)
2181                 rt = net->ipv6.ip6_null_entry;
2182         else if (rt->dst.error) {
2183                 rt = net->ipv6.ip6_null_entry;
2184                 goto out;
2185         }
2186
2187         if (rt == net->ipv6.ip6_null_entry) {
2188                 fn = fib6_backtrack(fn, &fl6->saddr);
2189                 if (fn)
2190                         goto restart;
2191         }
2192
2193 out:
2194         ip6_hold_safe(net, &rt, true);
2195
2196         rcu_read_unlock();
2197
2198         trace_fib6_table_lookup(net, rt, table, fl6);
2199         return rt;
2200 };
2201
2202 static struct dst_entry *ip6_route_redirect(struct net *net,
2203                                         const struct flowi6 *fl6,
2204                                         const struct in6_addr *gateway)
2205 {
2206         int flags = RT6_LOOKUP_F_HAS_SADDR;
2207         struct ip6rd_flowi rdfl;
2208
2209         rdfl.fl6 = *fl6;
2210         rdfl.gateway = *gateway;
2211
2212         return fib6_rule_lookup(net, &rdfl.fl6,
2213                                 flags, __ip6_route_redirect);
2214 }
2215
2216 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2217                   kuid_t uid)
2218 {
2219         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2220         struct dst_entry *dst;
2221         struct flowi6 fl6;
2222
2223         memset(&fl6, 0, sizeof(fl6));
2224         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2225         fl6.flowi6_oif = oif;
2226         fl6.flowi6_mark = mark;
2227         fl6.daddr = iph->daddr;
2228         fl6.saddr = iph->saddr;
2229         fl6.flowlabel = ip6_flowinfo(iph);
2230         fl6.flowi6_uid = uid;
2231
2232         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2233         rt6_do_redirect(dst, NULL, skb);
2234         dst_release(dst);
2235 }
2236 EXPORT_SYMBOL_GPL(ip6_redirect);
2237
2238 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2239                             u32 mark)
2240 {
2241         const struct ipv6hdr *iph = ipv6_hdr(skb);
2242         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2243         struct dst_entry *dst;
2244         struct flowi6 fl6;
2245
2246         memset(&fl6, 0, sizeof(fl6));
2247         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2248         fl6.flowi6_oif = oif;
2249         fl6.flowi6_mark = mark;
2250         fl6.daddr = msg->dest;
2251         fl6.saddr = iph->daddr;
2252         fl6.flowi6_uid = sock_net_uid(net, NULL);
2253
2254         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2255         rt6_do_redirect(dst, NULL, skb);
2256         dst_release(dst);
2257 }
2258
2259 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2260 {
2261         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2262                      sk->sk_uid);
2263 }
2264 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2265
2266 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2267 {
2268         struct net_device *dev = dst->dev;
2269         unsigned int mtu = dst_mtu(dst);
2270         struct net *net = dev_net(dev);
2271
2272         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2273
2274         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2275                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2276
2277         /*
2278          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2279          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2280          * IPV6_MAXPLEN is also valid and means: "any MSS,
2281          * rely only on pmtu discovery"
2282          */
2283         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2284                 mtu = IPV6_MAXPLEN;
2285         return mtu;
2286 }
2287
2288 static unsigned int ip6_mtu(const struct dst_entry *dst)
2289 {
2290         const struct rt6_info *rt = (const struct rt6_info *)dst;
2291         unsigned int mtu = rt->rt6i_pmtu;
2292         struct inet6_dev *idev;
2293
2294         if (mtu)
2295                 goto out;
2296
2297         mtu = dst_metric_raw(dst, RTAX_MTU);
2298         if (mtu)
2299                 goto out;
2300
2301         mtu = IPV6_MIN_MTU;
2302
2303         rcu_read_lock();
2304         idev = __in6_dev_get(dst->dev);
2305         if (idev)
2306                 mtu = idev->cnf.mtu6;
2307         rcu_read_unlock();
2308
2309 out:
2310         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2311
2312         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2313 }
2314
2315 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2316                                   struct flowi6 *fl6)
2317 {
2318         struct dst_entry *dst;
2319         struct rt6_info *rt;
2320         struct inet6_dev *idev = in6_dev_get(dev);
2321         struct net *net = dev_net(dev);
2322
2323         if (unlikely(!idev))
2324                 return ERR_PTR(-ENODEV);
2325
2326         rt = ip6_dst_alloc(net, dev, 0);
2327         if (unlikely(!rt)) {
2328                 in6_dev_put(idev);
2329                 dst = ERR_PTR(-ENOMEM);
2330                 goto out;
2331         }
2332
2333         rt->dst.flags |= DST_HOST;
2334         rt->dst.output  = ip6_output;
2335         rt->rt6i_gateway  = fl6->daddr;
2336         rt->rt6i_dst.addr = fl6->daddr;
2337         rt->rt6i_dst.plen = 128;
2338         rt->rt6i_idev     = idev;
2339         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2340
2341         /* Add this dst into uncached_list so that rt6_ifdown() can
2342          * do proper release of the net_device
2343          */
2344         rt6_uncached_list_add(rt);
2345         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2346
2347         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2348
2349 out:
2350         return dst;
2351 }
2352
2353 static int ip6_dst_gc(struct dst_ops *ops)
2354 {
2355         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2356         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2357         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2358         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2359         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2360         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2361         int entries;
2362
2363         entries = dst_entries_get_fast(ops);
2364         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2365             entries <= rt_max_size)
2366                 goto out;
2367
2368         net->ipv6.ip6_rt_gc_expire++;
2369         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2370         entries = dst_entries_get_slow(ops);
2371         if (entries < ops->gc_thresh)
2372                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2373 out:
2374         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2375         return entries > rt_max_size;
2376 }
2377
2378 static int ip6_convert_metrics(struct mx6_config *mxc,
2379                                const struct fib6_config *cfg)
2380 {
2381         bool ecn_ca = false;
2382         struct nlattr *nla;
2383         int remaining;
2384         u32 *mp;
2385
2386         if (!cfg->fc_mx)
2387                 return 0;
2388
2389         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2390         if (unlikely(!mp))
2391                 return -ENOMEM;
2392
2393         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2394                 int type = nla_type(nla);
2395                 u32 val;
2396
2397                 if (!type)
2398                         continue;
2399                 if (unlikely(type > RTAX_MAX))
2400                         goto err;
2401
2402                 if (type == RTAX_CC_ALGO) {
2403                         char tmp[TCP_CA_NAME_MAX];
2404
2405                         nla_strlcpy(tmp, nla, sizeof(tmp));
2406                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2407                         if (val == TCP_CA_UNSPEC)
2408                                 goto err;
2409                 } else {
2410                         val = nla_get_u32(nla);
2411                 }
2412                 if (type == RTAX_HOPLIMIT && val > 255)
2413                         val = 255;
2414                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2415                         goto err;
2416
2417                 mp[type - 1] = val;
2418                 __set_bit(type - 1, mxc->mx_valid);
2419         }
2420
2421         if (ecn_ca) {
2422                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2423                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2424         }
2425
2426         mxc->mx = mp;
2427         return 0;
2428  err:
2429         kfree(mp);
2430         return -EINVAL;
2431 }
2432
2433 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2434                                             struct fib6_config *cfg,
2435                                             const struct in6_addr *gw_addr)
2436 {
2437         struct flowi6 fl6 = {
2438                 .flowi6_oif = cfg->fc_ifindex,
2439                 .daddr = *gw_addr,
2440                 .saddr = cfg->fc_prefsrc,
2441         };
2442         struct fib6_table *table;
2443         struct rt6_info *rt;
2444         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2445
2446         table = fib6_get_table(net, cfg->fc_table);
2447         if (!table)
2448                 return NULL;
2449
2450         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2451                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2452
2453         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2454
2455         /* if table lookup failed, fall back to full lookup */
2456         if (rt == net->ipv6.ip6_null_entry) {
2457                 ip6_rt_put(rt);
2458                 rt = NULL;
2459         }
2460
2461         return rt;
2462 }
2463
2464 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2465                                               struct netlink_ext_ack *extack)
2466 {
2467         struct net *net = cfg->fc_nlinfo.nl_net;
2468         struct rt6_info *rt = NULL;
2469         struct net_device *dev = NULL;
2470         struct inet6_dev *idev = NULL;
2471         struct fib6_table *table;
2472         int addr_type;
2473         int err = -EINVAL;
2474
2475         /* RTF_PCPU is an internal flag; can not be set by userspace */
2476         if (cfg->fc_flags & RTF_PCPU) {
2477                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2478                 goto out;
2479         }
2480
2481         /* RTF_CACHE is an internal flag; can not be set by userspace */
2482         if (cfg->fc_flags & RTF_CACHE) {
2483                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2484                 goto out;
2485         }
2486
2487         if (cfg->fc_dst_len > 128) {
2488                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2489                 goto out;
2490         }
2491         if (cfg->fc_src_len > 128) {
2492                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2493                 goto out;
2494         }
2495 #ifndef CONFIG_IPV6_SUBTREES
2496         if (cfg->fc_src_len) {
2497                 NL_SET_ERR_MSG(extack,
2498                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2499                 goto out;
2500         }
2501 #endif
2502         if (cfg->fc_ifindex) {
2503                 err = -ENODEV;
2504                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2505                 if (!dev)
2506                         goto out;
2507                 idev = in6_dev_get(dev);
2508                 if (!idev)
2509                         goto out;
2510         }
2511
2512         if (cfg->fc_metric == 0)
2513                 cfg->fc_metric = IP6_RT_PRIO_USER;
2514
2515         err = -ENOBUFS;
2516         if (cfg->fc_nlinfo.nlh &&
2517             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2518                 table = fib6_get_table(net, cfg->fc_table);
2519                 if (!table) {
2520                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2521                         table = fib6_new_table(net, cfg->fc_table);
2522                 }
2523         } else {
2524                 table = fib6_new_table(net, cfg->fc_table);
2525         }
2526
2527         if (!table)
2528                 goto out;
2529
2530         rt = ip6_dst_alloc(net, NULL,
2531                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2532
2533         if (!rt) {
2534                 err = -ENOMEM;
2535                 goto out;
2536         }
2537
2538         if (cfg->fc_flags & RTF_EXPIRES)
2539                 rt6_set_expires(rt, jiffies +
2540                                 clock_t_to_jiffies(cfg->fc_expires));
2541         else
2542                 rt6_clean_expires(rt);
2543
2544         if (cfg->fc_protocol == RTPROT_UNSPEC)
2545                 cfg->fc_protocol = RTPROT_BOOT;
2546         rt->rt6i_protocol = cfg->fc_protocol;
2547
2548         addr_type = ipv6_addr_type(&cfg->fc_dst);
2549
2550         if (addr_type & IPV6_ADDR_MULTICAST)
2551                 rt->dst.input = ip6_mc_input;
2552         else if (cfg->fc_flags & RTF_LOCAL)
2553                 rt->dst.input = ip6_input;
2554         else
2555                 rt->dst.input = ip6_forward;
2556
2557         rt->dst.output = ip6_output;
2558
2559         if (cfg->fc_encap) {
2560                 struct lwtunnel_state *lwtstate;
2561
2562                 err = lwtunnel_build_state(cfg->fc_encap_type,
2563                                            cfg->fc_encap, AF_INET6, cfg,
2564                                            &lwtstate, extack);
2565                 if (err)
2566                         goto out;
2567                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2568                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2569                         rt->dst.lwtstate->orig_output = rt->dst.output;
2570                         rt->dst.output = lwtunnel_output;
2571                 }
2572                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2573                         rt->dst.lwtstate->orig_input = rt->dst.input;
2574                         rt->dst.input = lwtunnel_input;
2575                 }
2576         }
2577
2578         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2579         rt->rt6i_dst.plen = cfg->fc_dst_len;
2580         if (rt->rt6i_dst.plen == 128)
2581                 rt->dst.flags |= DST_HOST;
2582
2583 #ifdef CONFIG_IPV6_SUBTREES
2584         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2585         rt->rt6i_src.plen = cfg->fc_src_len;
2586 #endif
2587
2588         rt->rt6i_metric = cfg->fc_metric;
2589
2590         /* We cannot add true routes via loopback here,
2591            they would result in kernel looping; promote them to reject routes
2592          */
2593         if ((cfg->fc_flags & RTF_REJECT) ||
2594             (dev && (dev->flags & IFF_LOOPBACK) &&
2595              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2596              !(cfg->fc_flags & RTF_LOCAL))) {
2597                 /* hold loopback dev/idev if we haven't done so. */
2598                 if (dev != net->loopback_dev) {
2599                         if (dev) {
2600                                 dev_put(dev);
2601                                 in6_dev_put(idev);
2602                         }
2603                         dev = net->loopback_dev;
2604                         dev_hold(dev);
2605                         idev = in6_dev_get(dev);
2606                         if (!idev) {
2607                                 err = -ENODEV;
2608                                 goto out;
2609                         }
2610                 }
2611                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2612                 switch (cfg->fc_type) {
2613                 case RTN_BLACKHOLE:
2614                         rt->dst.error = -EINVAL;
2615                         rt->dst.output = dst_discard_out;
2616                         rt->dst.input = dst_discard;
2617                         break;
2618                 case RTN_PROHIBIT:
2619                         rt->dst.error = -EACCES;
2620                         rt->dst.output = ip6_pkt_prohibit_out;
2621                         rt->dst.input = ip6_pkt_prohibit;
2622                         break;
2623                 case RTN_THROW:
2624                 case RTN_UNREACHABLE:
2625                 default:
2626                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2627                                         : (cfg->fc_type == RTN_UNREACHABLE)
2628                                         ? -EHOSTUNREACH : -ENETUNREACH;
2629                         rt->dst.output = ip6_pkt_discard_out;
2630                         rt->dst.input = ip6_pkt_discard;
2631                         break;
2632                 }
2633                 goto install_route;
2634         }
2635
2636         if (cfg->fc_flags & RTF_GATEWAY) {
2637                 const struct in6_addr *gw_addr;
2638                 int gwa_type;
2639
2640                 gw_addr = &cfg->fc_gateway;
2641                 gwa_type = ipv6_addr_type(gw_addr);
2642
2643                 /* if gw_addr is local we will fail to detect this in case
2644                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2645                  * will return already-added prefix route via interface that
2646                  * prefix route was assigned to, which might be non-loopback.
2647                  */
2648                 err = -EINVAL;
2649                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2650                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2651                                             dev : NULL, 0, 0)) {
2652                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2653                         goto out;
2654                 }
2655                 rt->rt6i_gateway = *gw_addr;
2656
2657                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2658                         struct rt6_info *grt = NULL;
2659
2660                         /* IPv6 strictly inhibits using not link-local
2661                            addresses as nexthop address.
2662                            Otherwise, router will not able to send redirects.
2663                            It is very good, but in some (rare!) circumstances
2664                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2665                            some exceptions. --ANK
2666                            We allow IPv4-mapped nexthops to support RFC4798-type
2667                            addressing
2668                          */
2669                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2670                                           IPV6_ADDR_MAPPED))) {
2671                                 NL_SET_ERR_MSG(extack,
2672                                                "Invalid gateway address");
2673                                 goto out;
2674                         }
2675
2676                         if (cfg->fc_table) {
2677                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2678
2679                                 if (grt) {
2680                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2681                                             (dev && dev != grt->dst.dev)) {
2682                                                 ip6_rt_put(grt);
2683                                                 grt = NULL;
2684                                         }
2685                                 }
2686                         }
2687
2688                         if (!grt)
2689                                 grt = rt6_lookup(net, gw_addr, NULL,
2690                                                  cfg->fc_ifindex, 1);
2691
2692                         err = -EHOSTUNREACH;
2693                         if (!grt)
2694                                 goto out;
2695                         if (dev) {
2696                                 if (dev != grt->dst.dev) {
2697                                         ip6_rt_put(grt);
2698                                         goto out;
2699                                 }
2700                         } else {
2701                                 dev = grt->dst.dev;
2702                                 idev = grt->rt6i_idev;
2703                                 dev_hold(dev);
2704                                 in6_dev_hold(grt->rt6i_idev);
2705                         }
2706                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2707                                 err = 0;
2708                         ip6_rt_put(grt);
2709
2710                         if (err)
2711                                 goto out;
2712                 }
2713                 err = -EINVAL;
2714                 if (!dev) {
2715                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2716                         goto out;
2717                 } else if (dev->flags & IFF_LOOPBACK) {
2718                         NL_SET_ERR_MSG(extack,
2719                                        "Egress device can not be loopback device for this route");
2720                         goto out;
2721                 }
2722         }
2723
2724         err = -ENODEV;
2725         if (!dev)
2726                 goto out;
2727
2728         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2729                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2730                         NL_SET_ERR_MSG(extack, "Invalid source address");
2731                         err = -EINVAL;
2732                         goto out;
2733                 }
2734                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2735                 rt->rt6i_prefsrc.plen = 128;
2736         } else
2737                 rt->rt6i_prefsrc.plen = 0;
2738
2739         rt->rt6i_flags = cfg->fc_flags;
2740
2741 install_route:
2742         rt->dst.dev = dev;
2743         rt->rt6i_idev = idev;
2744         rt->rt6i_table = table;
2745
2746         cfg->fc_nlinfo.nl_net = dev_net(dev);
2747
2748         return rt;
2749 out:
2750         if (dev)
2751                 dev_put(dev);
2752         if (idev)
2753                 in6_dev_put(idev);
2754         if (rt)
2755                 dst_release_immediate(&rt->dst);
2756
2757         return ERR_PTR(err);
2758 }
2759
2760 int ip6_route_add(struct fib6_config *cfg,
2761                   struct netlink_ext_ack *extack)
2762 {
2763         struct mx6_config mxc = { .mx = NULL, };
2764         struct rt6_info *rt;
2765         int err;
2766
2767         rt = ip6_route_info_create(cfg, extack);
2768         if (IS_ERR(rt)) {
2769                 err = PTR_ERR(rt);
2770                 rt = NULL;
2771                 goto out;
2772         }
2773
2774         err = ip6_convert_metrics(&mxc, cfg);
2775         if (err)
2776                 goto out;
2777
2778         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2779
2780         kfree(mxc.mx);
2781
2782         return err;
2783 out:
2784         if (rt)
2785                 dst_release_immediate(&rt->dst);
2786
2787         return err;
2788 }
2789
2790 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2791 {
2792         int err;
2793         struct fib6_table *table;
2794         struct net *net = dev_net(rt->dst.dev);
2795
2796         if (rt == net->ipv6.ip6_null_entry) {
2797                 err = -ENOENT;
2798                 goto out;
2799         }
2800
2801         table = rt->rt6i_table;
2802         spin_lock_bh(&table->tb6_lock);
2803         err = fib6_del(rt, info);
2804         spin_unlock_bh(&table->tb6_lock);
2805
2806 out:
2807         ip6_rt_put(rt);
2808         return err;
2809 }
2810
2811 int ip6_del_rt(struct rt6_info *rt)
2812 {
2813         struct nl_info info = {
2814                 .nl_net = dev_net(rt->dst.dev),
2815         };
2816         return __ip6_del_rt(rt, &info);
2817 }
2818
2819 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2820 {
2821         struct nl_info *info = &cfg->fc_nlinfo;
2822         struct net *net = info->nl_net;
2823         struct sk_buff *skb = NULL;
2824         struct fib6_table *table;
2825         int err = -ENOENT;
2826
2827         if (rt == net->ipv6.ip6_null_entry)
2828                 goto out_put;
2829         table = rt->rt6i_table;
2830         spin_lock_bh(&table->tb6_lock);
2831
2832         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2833                 struct rt6_info *sibling, *next_sibling;
2834
2835                 /* prefer to send a single notification with all hops */
2836                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2837                 if (skb) {
2838                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2839
2840                         if (rt6_fill_node(net, skb, rt,
2841                                           NULL, NULL, 0, RTM_DELROUTE,
2842                                           info->portid, seq, 0) < 0) {
2843                                 kfree_skb(skb);
2844                                 skb = NULL;
2845                         } else
2846                                 info->skip_notify = 1;
2847                 }
2848
2849                 list_for_each_entry_safe(sibling, next_sibling,
2850                                          &rt->rt6i_siblings,
2851                                          rt6i_siblings) {
2852                         err = fib6_del(sibling, info);
2853                         if (err)
2854                                 goto out_unlock;
2855                 }
2856         }
2857
2858         err = fib6_del(rt, info);
2859 out_unlock:
2860         spin_unlock_bh(&table->tb6_lock);
2861 out_put:
2862         ip6_rt_put(rt);
2863
2864         if (skb) {
2865                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2866                             info->nlh, gfp_any());
2867         }
2868         return err;
2869 }
2870
2871 static int ip6_route_del(struct fib6_config *cfg,
2872                          struct netlink_ext_ack *extack)
2873 {
2874         struct rt6_info *rt, *rt_cache;
2875         struct fib6_table *table;
2876         struct fib6_node *fn;
2877         int err = -ESRCH;
2878
2879         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2880         if (!table) {
2881                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2882                 return err;
2883         }
2884
2885         rcu_read_lock();
2886
2887         fn = fib6_locate(&table->tb6_root,
2888                          &cfg->fc_dst, cfg->fc_dst_len,
2889                          &cfg->fc_src, cfg->fc_src_len,
2890                          !(cfg->fc_flags & RTF_CACHE));
2891
2892         if (fn) {
2893                 for_each_fib6_node_rt_rcu(fn) {
2894                         if (cfg->fc_flags & RTF_CACHE) {
2895                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2896                                                               &cfg->fc_src);
2897                                 if (!rt_cache)
2898                                         continue;
2899                                 rt = rt_cache;
2900                         }
2901                         if (cfg->fc_ifindex &&
2902                             (!rt->dst.dev ||
2903                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2904                                 continue;
2905                         if (cfg->fc_flags & RTF_GATEWAY &&
2906                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2907                                 continue;
2908                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2909                                 continue;
2910                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2911                                 continue;
2912                         if (!dst_hold_safe(&rt->dst))
2913                                 break;
2914                         rcu_read_unlock();
2915
2916                         /* if gateway was specified only delete the one hop */
2917                         if (cfg->fc_flags & RTF_GATEWAY)
2918                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2919
2920                         return __ip6_del_rt_siblings(rt, cfg);
2921                 }
2922         }
2923         rcu_read_unlock();
2924
2925         return err;
2926 }
2927
2928 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2929 {
2930         struct netevent_redirect netevent;
2931         struct rt6_info *rt, *nrt = NULL;
2932         struct ndisc_options ndopts;
2933         struct inet6_dev *in6_dev;
2934         struct neighbour *neigh;
2935         struct rd_msg *msg;
2936         int optlen, on_link;
2937         u8 *lladdr;
2938
2939         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2940         optlen -= sizeof(*msg);
2941
2942         if (optlen < 0) {
2943                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2944                 return;
2945         }
2946
2947         msg = (struct rd_msg *)icmp6_hdr(skb);
2948
2949         if (ipv6_addr_is_multicast(&msg->dest)) {
2950                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2951                 return;
2952         }
2953
2954         on_link = 0;
2955         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2956                 on_link = 1;
2957         } else if (ipv6_addr_type(&msg->target) !=
2958                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2959                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2960                 return;
2961         }
2962
2963         in6_dev = __in6_dev_get(skb->dev);
2964         if (!in6_dev)
2965                 return;
2966         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2967                 return;
2968
2969         /* RFC2461 8.1:
2970          *      The IP source address of the Redirect MUST be the same as the current
2971          *      first-hop router for the specified ICMP Destination Address.
2972          */
2973
2974         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2975                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2976                 return;
2977         }
2978
2979         lladdr = NULL;
2980         if (ndopts.nd_opts_tgt_lladdr) {
2981                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2982                                              skb->dev);
2983                 if (!lladdr) {
2984                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2985                         return;
2986                 }
2987         }
2988
2989         rt = (struct rt6_info *) dst;
2990         if (rt->rt6i_flags & RTF_REJECT) {
2991                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2992                 return;
2993         }
2994
2995         /* Redirect received -> path was valid.
2996          * Look, redirects are sent only in response to data packets,
2997          * so that this nexthop apparently is reachable. --ANK
2998          */
2999         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3000
3001         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3002         if (!neigh)
3003                 return;
3004
3005         /*
3006          *      We have finally decided to accept it.
3007          */
3008
3009         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3010                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3011                      NEIGH_UPDATE_F_OVERRIDE|
3012                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3013                                      NEIGH_UPDATE_F_ISROUTER)),
3014                      NDISC_REDIRECT, &ndopts);
3015
3016         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3017         if (!nrt)
3018                 goto out;
3019
3020         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3021         if (on_link)
3022                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3023
3024         nrt->rt6i_protocol = RTPROT_REDIRECT;
3025         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3026
3027         /* No need to remove rt from the exception table if rt is
3028          * a cached route because rt6_insert_exception() will
3029          * takes care of it
3030          */
3031         if (rt6_insert_exception(nrt, rt)) {
3032                 dst_release_immediate(&nrt->dst);
3033                 goto out;
3034         }
3035
3036         netevent.old = &rt->dst;
3037         netevent.new = &nrt->dst;
3038         netevent.daddr = &msg->dest;
3039         netevent.neigh = neigh;
3040         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3041
3042 out:
3043         neigh_release(neigh);
3044 }
3045
3046 /*
3047  *      Misc support functions
3048  */
3049
3050 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3051 {
3052         BUG_ON(from->dst.from);
3053
3054         rt->rt6i_flags &= ~RTF_EXPIRES;
3055         dst_hold(&from->dst);
3056         rt->dst.from = &from->dst;
3057         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3058 }
3059
3060 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3061 {
3062         rt->dst.input = ort->dst.input;
3063         rt->dst.output = ort->dst.output;
3064         rt->rt6i_dst = ort->rt6i_dst;
3065         rt->dst.error = ort->dst.error;
3066         rt->rt6i_idev = ort->rt6i_idev;
3067         if (rt->rt6i_idev)
3068                 in6_dev_hold(rt->rt6i_idev);
3069         rt->dst.lastuse = jiffies;
3070         rt->rt6i_gateway = ort->rt6i_gateway;
3071         rt->rt6i_flags = ort->rt6i_flags;
3072         rt6_set_from(rt, ort);
3073         rt->rt6i_metric = ort->rt6i_metric;
3074 #ifdef CONFIG_IPV6_SUBTREES
3075         rt->rt6i_src = ort->rt6i_src;
3076 #endif
3077         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3078         rt->rt6i_table = ort->rt6i_table;
3079         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3080 }
3081
3082 #ifdef CONFIG_IPV6_ROUTE_INFO
3083 static struct rt6_info *rt6_get_route_info(struct net *net,
3084                                            const struct in6_addr *prefix, int prefixlen,
3085                                            const struct in6_addr *gwaddr,
3086                                            struct net_device *dev)
3087 {
3088         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3089         int ifindex = dev->ifindex;
3090         struct fib6_node *fn;
3091         struct rt6_info *rt = NULL;
3092         struct fib6_table *table;
3093
3094         table = fib6_get_table(net, tb_id);
3095         if (!table)
3096                 return NULL;
3097
3098         rcu_read_lock();
3099         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3100         if (!fn)
3101                 goto out;
3102
3103         for_each_fib6_node_rt_rcu(fn) {
3104                 if (rt->dst.dev->ifindex != ifindex)
3105                         continue;
3106                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3107                         continue;
3108                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3109                         continue;
3110                 ip6_hold_safe(NULL, &rt, false);
3111                 break;
3112         }
3113 out:
3114         rcu_read_unlock();
3115         return rt;
3116 }
3117
3118 static struct rt6_info *rt6_add_route_info(struct net *net,
3119                                            const struct in6_addr *prefix, int prefixlen,
3120                                            const struct in6_addr *gwaddr,
3121                                            struct net_device *dev,
3122                                            unsigned int pref)
3123 {
3124         struct fib6_config cfg = {
3125                 .fc_metric      = IP6_RT_PRIO_USER,
3126                 .fc_ifindex     = dev->ifindex,
3127                 .fc_dst_len     = prefixlen,
3128                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3129                                   RTF_UP | RTF_PREF(pref),
3130                 .fc_protocol = RTPROT_RA,
3131                 .fc_nlinfo.portid = 0,
3132                 .fc_nlinfo.nlh = NULL,
3133                 .fc_nlinfo.nl_net = net,
3134         };
3135
3136         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3137         cfg.fc_dst = *prefix;
3138         cfg.fc_gateway = *gwaddr;
3139
3140         /* We should treat it as a default route if prefix length is 0. */
3141         if (!prefixlen)
3142                 cfg.fc_flags |= RTF_DEFAULT;
3143
3144         ip6_route_add(&cfg, NULL);
3145
3146         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3147 }
3148 #endif
3149
3150 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3151 {
3152         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3153         struct rt6_info *rt;
3154         struct fib6_table *table;
3155
3156         table = fib6_get_table(dev_net(dev), tb_id);
3157         if (!table)
3158                 return NULL;
3159
3160         rcu_read_lock();
3161         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3162                 if (dev == rt->dst.dev &&
3163                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3164                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3165                         break;
3166         }
3167         if (rt)
3168                 ip6_hold_safe(NULL, &rt, false);
3169         rcu_read_unlock();
3170         return rt;
3171 }
3172
3173 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3174                                      struct net_device *dev,
3175                                      unsigned int pref)
3176 {
3177         struct fib6_config cfg = {
3178                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3179                 .fc_metric      = IP6_RT_PRIO_USER,
3180                 .fc_ifindex     = dev->ifindex,
3181                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3182                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3183                 .fc_protocol = RTPROT_RA,
3184                 .fc_nlinfo.portid = 0,
3185                 .fc_nlinfo.nlh = NULL,
3186                 .fc_nlinfo.nl_net = dev_net(dev),
3187         };
3188
3189         cfg.fc_gateway = *gwaddr;
3190
3191         if (!ip6_route_add(&cfg, NULL)) {
3192                 struct fib6_table *table;
3193
3194                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3195                 if (table)
3196                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3197         }
3198
3199         return rt6_get_dflt_router(gwaddr, dev);
3200 }
3201
3202 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3203 {
3204         struct rt6_info *rt;
3205
3206 restart:
3207         rcu_read_lock();
3208         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3209                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3210                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3211                         if (dst_hold_safe(&rt->dst)) {
3212                                 rcu_read_unlock();
3213                                 ip6_del_rt(rt);
3214                         } else {
3215                                 rcu_read_unlock();
3216                         }
3217                         goto restart;
3218                 }
3219         }
3220         rcu_read_unlock();
3221
3222         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3223 }
3224
3225 void rt6_purge_dflt_routers(struct net *net)
3226 {
3227         struct fib6_table *table;
3228         struct hlist_head *head;
3229         unsigned int h;
3230
3231         rcu_read_lock();
3232
3233         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3234                 head = &net->ipv6.fib_table_hash[h];
3235                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3236                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3237                                 __rt6_purge_dflt_routers(table);
3238                 }
3239         }
3240
3241         rcu_read_unlock();
3242 }
3243
3244 static void rtmsg_to_fib6_config(struct net *net,
3245                                  struct in6_rtmsg *rtmsg,
3246                                  struct fib6_config *cfg)
3247 {
3248         memset(cfg, 0, sizeof(*cfg));
3249
3250         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3251                          : RT6_TABLE_MAIN;
3252         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3253         cfg->fc_metric = rtmsg->rtmsg_metric;
3254         cfg->fc_expires = rtmsg->rtmsg_info;
3255         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3256         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3257         cfg->fc_flags = rtmsg->rtmsg_flags;
3258
3259         cfg->fc_nlinfo.nl_net = net;
3260
3261         cfg->fc_dst = rtmsg->rtmsg_dst;
3262         cfg->fc_src = rtmsg->rtmsg_src;
3263         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3264 }
3265
3266 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3267 {
3268         struct fib6_config cfg;
3269         struct in6_rtmsg rtmsg;
3270         int err;
3271
3272         switch (cmd) {
3273         case SIOCADDRT:         /* Add a route */
3274         case SIOCDELRT:         /* Delete a route */
3275                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3276                         return -EPERM;
3277                 err = copy_from_user(&rtmsg, arg,
3278                                      sizeof(struct in6_rtmsg));
3279                 if (err)
3280                         return -EFAULT;
3281
3282                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3283
3284                 rtnl_lock();
3285                 switch (cmd) {
3286                 case SIOCADDRT:
3287                         err = ip6_route_add(&cfg, NULL);
3288                         break;
3289                 case SIOCDELRT:
3290                         err = ip6_route_del(&cfg, NULL);
3291                         break;
3292                 default:
3293                         err = -EINVAL;
3294                 }
3295                 rtnl_unlock();
3296
3297                 return err;
3298         }
3299
3300         return -EINVAL;
3301 }
3302
3303 /*
3304  *      Drop the packet on the floor
3305  */
3306
3307 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3308 {
3309         int type;
3310         struct dst_entry *dst = skb_dst(skb);
3311         switch (ipstats_mib_noroutes) {
3312         case IPSTATS_MIB_INNOROUTES:
3313                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3314                 if (type == IPV6_ADDR_ANY) {
3315                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3316                                       IPSTATS_MIB_INADDRERRORS);
3317                         break;
3318                 }
3319                 /* FALLTHROUGH */
3320         case IPSTATS_MIB_OUTNOROUTES:
3321                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3322                               ipstats_mib_noroutes);
3323                 break;
3324         }
3325         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3326         kfree_skb(skb);
3327         return 0;
3328 }
3329
3330 static int ip6_pkt_discard(struct sk_buff *skb)
3331 {
3332         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3333 }
3334
3335 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3336 {
3337         skb->dev = skb_dst(skb)->dev;
3338         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3339 }
3340
3341 static int ip6_pkt_prohibit(struct sk_buff *skb)
3342 {
3343         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3344 }
3345
3346 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3347 {
3348         skb->dev = skb_dst(skb)->dev;
3349         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3350 }
3351
3352 /*
3353  *      Allocate a dst for local (unicast / anycast) address.
3354  */
3355
3356 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3357                                     const struct in6_addr *addr,
3358                                     bool anycast)
3359 {
3360         u32 tb_id;
3361         struct net *net = dev_net(idev->dev);
3362         struct net_device *dev = idev->dev;
3363         struct rt6_info *rt;
3364
3365         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3366         if (!rt)
3367                 return ERR_PTR(-ENOMEM);
3368
3369         in6_dev_hold(idev);
3370
3371         rt->dst.flags |= DST_HOST;
3372         rt->dst.input = ip6_input;
3373         rt->dst.output = ip6_output;
3374         rt->rt6i_idev = idev;
3375
3376         rt->rt6i_protocol = RTPROT_KERNEL;
3377         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3378         if (anycast)
3379                 rt->rt6i_flags |= RTF_ANYCAST;
3380         else
3381                 rt->rt6i_flags |= RTF_LOCAL;
3382
3383         rt->rt6i_gateway  = *addr;
3384         rt->rt6i_dst.addr = *addr;
3385         rt->rt6i_dst.plen = 128;
3386         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3387         rt->rt6i_table = fib6_get_table(net, tb_id);
3388
3389         return rt;
3390 }
3391
/*
 * remove deleted ip from prefsrc entries
 *
 * Argument handed to fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
        /* only routes on this device are touched; NULL matches any device */
        struct net_device *dev;
        /* namespace whose ip6_null_entry is left alone */
        struct net *net;
        /* address compared against each route's rt6i_prefsrc.addr */
        struct in6_addr *addr;
};
3398
3399 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3400 {
3401         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3402         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3403         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3404
3405         if (((void *)rt->dst.dev == dev || !dev) &&
3406             rt != net->ipv6.ip6_null_entry &&
3407             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3408                 spin_lock_bh(&rt6_exception_lock);
3409                 /* remove prefsrc entry */
3410                 rt->rt6i_prefsrc.plen = 0;
3411                 /* need to update cache as well */
3412                 rt6_exceptions_remove_prefsrc(rt);
3413                 spin_unlock_bh(&rt6_exception_lock);
3414         }
3415         return 0;
3416 }
3417
3418 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3419 {
3420         struct net *net = dev_net(ifp->idev->dev);
3421         struct arg_dev_net_ip adni = {
3422                 .dev = ifp->idev->dev,
3423                 .net = net,
3424                 .addr = &ifp->addr,
3425         };
3426         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3427 }
3428
3429 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3430
3431 /* Remove routers and update dst entries when gateway turn into host. */
3432 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3433 {
3434         struct in6_addr *gateway = (struct in6_addr *)arg;
3435
3436         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3437             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3438                 return -1;
3439         }
3440
3441         /* Further clean up cached routes in exception table.
3442          * This is needed because cached route may have a different
3443          * gateway than its 'parent' in the case of an ip redirect.
3444          */
3445         rt6_exceptions_clean_tohost(rt, gateway);
3446
3447         return 0;
3448 }
3449
3450 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3451 {
3452         fib6_clean_all(net, fib6_clean_tohost, gateway);
3453 }
3454
/* Argument handed to fib6_ifdown() through fib6_clean_all(). */
struct arg_dev_net {
        /* device to match against rt->dst.dev; NULL matches any device */
        struct net_device *dev;
        /* namespace whose ip6_null_entry is never selected */
        struct net *net;
};
3459
3460 /* called with write lock held for table with rt */
3461 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3462 {
3463         const struct arg_dev_net *adn = arg;
3464         const struct net_device *dev = adn->dev;
3465
3466         if ((rt->dst.dev == dev || !dev) &&
3467             rt != adn->net->ipv6.ip6_null_entry &&
3468             (rt->rt6i_nsiblings == 0 ||
3469              (dev && netdev_unregistering(dev)) ||
3470              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3471                 return -1;
3472
3473         return 0;
3474 }
3475
3476 void rt6_ifdown(struct net *net, struct net_device *dev)
3477 {
3478         struct arg_dev_net adn = {
3479                 .dev = dev,
3480                 .net = net,
3481         };
3482
3483         fib6_clean_all(net, fib6_ifdown, &adn);
3484         if (dev)
3485                 rt6_uncached_list_flush_dev(net, dev);
3486 }
3487
/* Argument handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
        /* only routes whose dst.dev is this device are updated */
        struct net_device *dev;
        /* MTU value applied to matching routes */
        unsigned int mtu;
};
3492
3493 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3494 {
3495         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3496         struct inet6_dev *idev;
3497
3498         /* In IPv6 pmtu discovery is not optional,
3499            so that RTAX_MTU lock cannot disable it.
3500            We still use this lock to block changes
3501            caused by addrconf/ndisc.
3502         */
3503
3504         idev = __in6_dev_get(arg->dev);
3505         if (!idev)
3506                 return 0;
3507
3508         /* For administrative MTU increase, there is no way to discover
3509            IPv6 PMTU increase, so PMTU increase should be updated here.
3510            Since RFC 1981 doesn't include administrative MTU increase
3511            update PMTU increase is a MUST. (i.e. jumbo frame)
3512          */
3513         /*
3514            If new MTU is less than route PMTU, this new MTU will be the
3515            lowest MTU in the path, update the route PMTU to reflect PMTU
3516            decreases; if new MTU is greater than route PMTU, and the
3517            old MTU is the lowest MTU in the path, update the route PMTU
3518            to reflect the increase. In this case if the other nodes' MTU
3519            also have the lowest MTU, TOO BIG MESSAGE will be lead to
3520            PMTU discovery.
3521          */
3522         if (rt->dst.dev == arg->dev &&
3523             dst_metric_raw(&rt->dst, RTAX_MTU) &&
3524             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3525                 spin_lock_bh(&rt6_exception_lock);
3526                 if (dst_mtu(&rt->dst) >= arg->mtu ||
3527                     (dst_mtu(&rt->dst) < arg->mtu &&
3528                      dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3529                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3530                 }
3531                 rt6_exceptions_update_pmtu(rt, arg->mtu);
3532                 spin_unlock_bh(&rt6_exception_lock);
3533         }
3534         return 0;
3535 }
3536
3537 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3538 {
3539         struct rt6_mtu_change_arg arg = {
3540                 .dev = dev,
3541                 .mtu = mtu,
3542         };
3543
3544         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3545 }
3546
3547 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3548         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3549         [RTA_OIF]               = { .type = NLA_U32 },
3550         [RTA_IIF]               = { .type = NLA_U32 },
3551         [RTA_PRIORITY]          = { .type = NLA_U32 },
3552         [RTA_METRICS]           = { .type = NLA_NESTED },
3553         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3554         [RTA_PREF]              = { .type = NLA_U8 },
3555         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3556         [RTA_ENCAP]             = { .type = NLA_NESTED },
3557         [RTA_EXPIRES]           = { .type = NLA_U32 },
3558         [RTA_UID]               = { .type = NLA_U32 },
3559         [RTA_MARK]              = { .type = NLA_U32 },
3560 };
3561
3562 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3563                               struct fib6_config *cfg,
3564                               struct netlink_ext_ack *extack)
3565 {
3566         struct rtmsg *rtm;
3567         struct nlattr *tb[RTA_MAX+1];
3568         unsigned int pref;
3569         int err;
3570
3571         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3572                           NULL);
3573         if (err < 0)
3574                 goto errout;
3575
3576         err = -EINVAL;
3577         rtm = nlmsg_data(nlh);
3578         memset(cfg, 0, sizeof(*cfg));
3579
3580         cfg->fc_table = rtm->rtm_table;
3581         cfg->fc_dst_len = rtm->rtm_dst_len;
3582         cfg->fc_src_len = rtm->rtm_src_len;
3583         cfg->fc_flags = RTF_UP;
3584         cfg->fc_protocol = rtm->rtm_protocol;
3585         cfg->fc_type = rtm->rtm_type;
3586
3587         if (rtm->rtm_type == RTN_UNREACHABLE ||
3588             rtm->rtm_type == RTN_BLACKHOLE ||
3589             rtm->rtm_type == RTN_PROHIBIT ||
3590             rtm->rtm_type == RTN_THROW)
3591                 cfg->fc_flags |= RTF_REJECT;
3592
3593         if (rtm->rtm_type == RTN_LOCAL)
3594                 cfg->fc_flags |= RTF_LOCAL;
3595
3596         if (rtm->rtm_flags & RTM_F_CLONED)
3597                 cfg->fc_flags |= RTF_CACHE;
3598
3599         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3600         cfg->fc_nlinfo.nlh = nlh;
3601         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3602
3603         if (tb[RTA_GATEWAY]) {
3604                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3605                 cfg->fc_flags |= RTF_GATEWAY;
3606         }
3607
3608         if (tb[RTA_DST]) {
3609                 int plen = (rtm->rtm_dst_len + 7) >> 3;
3610
3611                 if (nla_len(tb[RTA_DST]) < plen)
3612                         goto errout;
3613
3614                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3615         }
3616
3617         if (tb[RTA_SRC]) {
3618                 int plen = (rtm->rtm_src_len + 7) >> 3;
3619
3620                 if (nla_len(tb[RTA_SRC]) < plen)
3621                         goto errout;
3622
3623                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3624         }
3625
3626         if (tb[RTA_PREFSRC])
3627                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3628
3629         if (tb[RTA_OIF])
3630                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3631
3632         if (tb[RTA_PRIORITY])
3633                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3634
3635         if (tb[RTA_METRICS]) {
3636                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3637                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3638         }
3639
3640         if (tb[RTA_TABLE])
3641                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3642
3643         if (tb[RTA_MULTIPATH]) {
3644                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3645                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3646
3647                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3648                                                      cfg->fc_mp_len, extack);
3649                 if (err < 0)
3650                         goto errout;
3651         }
3652
3653         if (tb[RTA_PREF]) {
3654                 pref = nla_get_u8(tb[RTA_PREF]);
3655                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3656                     pref != ICMPV6_ROUTER_PREF_HIGH)
3657                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3658                 cfg->fc_flags |= RTF_PREF(pref);
3659         }
3660
3661         if (tb[RTA_ENCAP])
3662                 cfg->fc_encap = tb[RTA_ENCAP];
3663
3664         if (tb[RTA_ENCAP_TYPE]) {
3665                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3666
3667                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3668                 if (err < 0)
3669                         goto errout;
3670         }
3671
3672         if (tb[RTA_EXPIRES]) {
3673                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3674
3675                 if (addrconf_finite_timeout(timeout)) {
3676                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3677                         cfg->fc_flags |= RTF_EXPIRES;
3678                 }
3679         }
3680
3681         err = 0;
3682 errout:
3683         return err;
3684 }
3685
/*
 * One parsed nexthop of a multipath request, queued on a local list by
 * ip6_route_info_append() until the routes are inserted.
 */
struct rt6_nh {
        /* route built for this nexthop; set to NULL once insertion was tried */
        struct rt6_info *rt6_info;
        /* copy of the per-nexthop config, reused to delete on rollback */
        struct fib6_config r_cfg;
        /* metrics converted from r_cfg by ip6_convert_metrics() */
        struct mx6_config mxc;
        /* linkage on the nexthop list */
        struct list_head next;
};
3692
3693 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3694 {
3695         struct rt6_nh *nh;
3696
3697         list_for_each_entry(nh, rt6_nh_list, next) {
3698                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3699                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3700                         nh->r_cfg.fc_ifindex);
3701         }
3702 }
3703
3704 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3705                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3706 {
3707         struct rt6_nh *nh;
3708         int err = -EEXIST;
3709
3710         list_for_each_entry(nh, rt6_nh_list, next) {
3711                 /* check if rt6_info already exists */
3712                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3713                         return err;
3714         }
3715
3716         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3717         if (!nh)
3718                 return -ENOMEM;
3719         nh->rt6_info = rt;
3720         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3721         if (err) {
3722                 kfree(nh);
3723                 return err;
3724         }
3725         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3726         list_add_tail(&nh->next, rt6_nh_list);
3727
3728         return 0;
3729 }
3730
3731 static void ip6_route_mpath_notify(struct rt6_info *rt,
3732                                    struct rt6_info *rt_last,
3733                                    struct nl_info *info,
3734                                    __u16 nlflags)
3735 {
3736         /* if this is an APPEND route, then rt points to the first route
3737          * inserted and rt_last points to last route inserted. Userspace
3738          * wants a consistent dump of the route which starts at the first
3739          * nexthop. Since sibling routes are always added at the end of
3740          * the list, find the first sibling of the last route appended
3741          */
3742         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3743                 rt = list_first_entry(&rt_last->rt6i_siblings,
3744                                       struct rt6_info,
3745                                       rt6i_siblings);
3746         }
3747
3748         if (rt)
3749                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3750 }
3751
3752 static int ip6_route_multipath_add(struct fib6_config *cfg,
3753                                    struct netlink_ext_ack *extack)
3754 {
3755         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3756         struct nl_info *info = &cfg->fc_nlinfo;
3757         struct fib6_config r_cfg;
3758         struct rtnexthop *rtnh;
3759         struct rt6_info *rt;
3760         struct rt6_nh *err_nh;
3761         struct rt6_nh *nh, *nh_safe;
3762         __u16 nlflags;
3763         int remaining;
3764         int attrlen;
3765         int err = 1;
3766         int nhn = 0;
3767         int replace = (cfg->fc_nlinfo.nlh &&
3768                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3769         LIST_HEAD(rt6_nh_list);
3770
3771         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3772         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3773                 nlflags |= NLM_F_APPEND;
3774
3775         remaining = cfg->fc_mp_len;
3776         rtnh = (struct rtnexthop *)cfg->fc_mp;
3777
3778         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3779          * rt6_info structs per nexthop
3780          */
3781         while (rtnh_ok(rtnh, remaining)) {
3782                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3783                 if (rtnh->rtnh_ifindex)
3784                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3785
3786                 attrlen = rtnh_attrlen(rtnh);
3787                 if (attrlen > 0) {
3788                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3789
3790                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3791                         if (nla) {
3792                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3793                                 r_cfg.fc_flags |= RTF_GATEWAY;
3794                         }
3795                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3796                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3797                         if (nla)
3798                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3799                 }
3800
3801                 rt = ip6_route_info_create(&r_cfg, extack);
3802                 if (IS_ERR(rt)) {
3803                         err = PTR_ERR(rt);
3804                         rt = NULL;
3805                         goto cleanup;
3806                 }
3807
3808                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3809                 if (err) {
3810                         dst_release_immediate(&rt->dst);
3811                         goto cleanup;
3812                 }
3813
3814                 rtnh = rtnh_next(rtnh, &remaining);
3815         }
3816
3817         /* for add and replace send one notification with all nexthops.
3818          * Skip the notification in fib6_add_rt2node and send one with
3819          * the full route when done
3820          */
3821         info->skip_notify = 1;
3822
3823         err_nh = NULL;
3824         list_for_each_entry(nh, &rt6_nh_list, next) {
3825                 rt_last = nh->rt6_info;
3826                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3827                 /* save reference to first route for notification */
3828                 if (!rt_notif && !err)
3829                         rt_notif = nh->rt6_info;
3830
3831                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3832                 nh->rt6_info = NULL;
3833                 if (err) {
3834                         if (replace && nhn)
3835                                 ip6_print_replace_route_err(&rt6_nh_list);
3836                         err_nh = nh;
3837                         goto add_errout;
3838                 }
3839
3840                 /* Because each route is added like a single route we remove
3841                  * these flags after the first nexthop: if there is a collision,
3842                  * we have already failed to add the first nexthop:
3843                  * fib6_add_rt2node() has rejected it; when replacing, old
3844                  * nexthops have been replaced by first new, the rest should
3845                  * be added to it.
3846                  */
3847                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3848                                                      NLM_F_REPLACE);
3849                 nhn++;
3850         }
3851
3852         /* success ... tell user about new route */
3853         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3854         goto cleanup;
3855
3856 add_errout:
3857         /* send notification for routes that were added so that
3858          * the delete notifications sent by ip6_route_del are
3859          * coherent
3860          */
3861         if (rt_notif)
3862                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3863
3864         /* Delete routes that were already added */
3865         list_for_each_entry(nh, &rt6_nh_list, next) {
3866                 if (err_nh == nh)
3867                         break;
3868                 ip6_route_del(&nh->r_cfg, extack);
3869         }
3870
3871 cleanup:
3872         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3873                 if (nh->rt6_info)
3874                         dst_release_immediate(&nh->rt6_info->dst);
3875                 kfree(nh->mxc.mx);
3876                 list_del(&nh->next);
3877                 kfree(nh);
3878         }
3879
3880         return err;
3881 }
3882
3883 static int ip6_route_multipath_del(struct fib6_config *cfg,
3884                                    struct netlink_ext_ack *extack)
3885 {
3886         struct fib6_config r_cfg;
3887         struct rtnexthop *rtnh;
3888         int remaining;
3889         int attrlen;
3890         int err = 1, last_err = 0;
3891
3892         remaining = cfg->fc_mp_len;
3893         rtnh = (struct rtnexthop *)cfg->fc_mp;
3894
3895         /* Parse a Multipath Entry */
3896         while (rtnh_ok(rtnh, remaining)) {
3897                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3898                 if (rtnh->rtnh_ifindex)
3899                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3900
3901                 attrlen = rtnh_attrlen(rtnh);
3902                 if (attrlen > 0) {
3903                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3904
3905                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3906                         if (nla) {
3907                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3908                                 r_cfg.fc_flags |= RTF_GATEWAY;
3909                         }
3910                 }
3911                 err = ip6_route_del(&r_cfg, extack);
3912                 if (err)
3913                         last_err = err;
3914
3915                 rtnh = rtnh_next(rtnh, &remaining);
3916         }
3917
3918         return last_err;
3919 }
3920
3921 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3922                               struct netlink_ext_ack *extack)
3923 {
3924         struct fib6_config cfg;
3925         int err;
3926
3927         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3928         if (err < 0)
3929                 return err;
3930
3931         if (cfg.fc_mp)
3932                 return ip6_route_multipath_del(&cfg, extack);
3933         else {
3934                 cfg.fc_delete_all_nh = 1;
3935                 return ip6_route_del(&cfg, extack);
3936         }
3937 }
3938
3939 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3940                               struct netlink_ext_ack *extack)
3941 {
3942         struct fib6_config cfg;
3943         int err;
3944
3945         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3946         if (err < 0)
3947                 return err;
3948
3949         if (cfg.fc_mp)
3950                 return ip6_route_multipath_add(&cfg, extack);
3951         else
3952                 return ip6_route_add(&cfg, extack);
3953 }
3954
3955 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3956 {
3957         int nexthop_len = 0;
3958
3959         if (rt->rt6i_nsiblings) {
3960                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3961                             + NLA_ALIGN(sizeof(struct rtnexthop))
3962                             + nla_total_size(16) /* RTA_GATEWAY */
3963                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3964
3965                 nexthop_len *= rt->rt6i_nsiblings;
3966         }
3967
3968         return NLMSG_ALIGN(sizeof(struct rtmsg))
3969                + nla_total_size(16) /* RTA_SRC */
3970                + nla_total_size(16) /* RTA_DST */
3971                + nla_total_size(16) /* RTA_GATEWAY */
3972                + nla_total_size(16) /* RTA_PREFSRC */
3973                + nla_total_size(4) /* RTA_TABLE */
3974                + nla_total_size(4) /* RTA_IIF */
3975                + nla_total_size(4) /* RTA_OIF */
3976                + nla_total_size(4) /* RTA_PRIORITY */
3977                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3978                + nla_total_size(sizeof(struct rta_cacheinfo))
3979                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3980                + nla_total_size(1) /* RTA_PREF */
3981                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3982                + nexthop_len;
3983 }
3984
3985 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3986                             unsigned int *flags, bool skip_oif)
3987 {
3988         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3989                 *flags |= RTNH_F_LINKDOWN;
3990                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3991                         *flags |= RTNH_F_DEAD;
3992         }
3993
3994         if (rt->rt6i_flags & RTF_GATEWAY) {
3995                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3996                         goto nla_put_failure;
3997         }
3998
3999         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4000                 *flags |= RTNH_F_OFFLOAD;
4001
4002         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4003         if (!skip_oif && rt->dst.dev &&
4004             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4005                 goto nla_put_failure;
4006
4007         if (rt->dst.lwtstate &&
4008             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4009                 goto nla_put_failure;
4010
4011         return 0;
4012
4013 nla_put_failure:
4014         return -EMSGSIZE;
4015 }
4016
4017 /* add multipath next hop */
4018 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4019 {
4020         struct rtnexthop *rtnh;
4021         unsigned int flags = 0;
4022
4023         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4024         if (!rtnh)
4025                 goto nla_put_failure;
4026
4027         rtnh->rtnh_hops = 0;
4028         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4029
4030         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4031                 goto nla_put_failure;
4032
4033         rtnh->rtnh_flags = flags;
4034
4035         /* length of rtnetlink header + attributes */
4036         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4037
4038         return 0;
4039
4040 nla_put_failure:
4041         return -EMSGSIZE;
4042 }
4043
4044 static int rt6_fill_node(struct net *net,
4045                          struct sk_buff *skb, struct rt6_info *rt,
4046                          struct in6_addr *dst, struct in6_addr *src,
4047                          int iif, int type, u32 portid, u32 seq,
4048                          unsigned int flags)
4049 {
4050         u32 metrics[RTAX_MAX];
4051         struct rtmsg *rtm;
4052         struct nlmsghdr *nlh;
4053         long expires;
4054         u32 table;
4055
4056         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4057         if (!nlh)
4058                 return -EMSGSIZE;
4059
4060         rtm = nlmsg_data(nlh);
4061         rtm->rtm_family = AF_INET6;
4062         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4063         rtm->rtm_src_len = rt->rt6i_src.plen;
4064         rtm->rtm_tos = 0;
4065         if (rt->rt6i_table)
4066                 table = rt->rt6i_table->tb6_id;
4067         else
4068                 table = RT6_TABLE_UNSPEC;
4069         rtm->rtm_table = table;
4070         if (nla_put_u32(skb, RTA_TABLE, table))
4071                 goto nla_put_failure;
4072         if (rt->rt6i_flags & RTF_REJECT) {
4073                 switch (rt->dst.error) {
4074                 case -EINVAL:
4075                         rtm->rtm_type = RTN_BLACKHOLE;
4076                         break;
4077                 case -EACCES:
4078                         rtm->rtm_type = RTN_PROHIBIT;
4079                         break;
4080                 case -EAGAIN:
4081                         rtm->rtm_type = RTN_THROW;
4082                         break;
4083                 default:
4084                         rtm->rtm_type = RTN_UNREACHABLE;
4085                         break;
4086                 }
4087         }
4088         else if (rt->rt6i_flags & RTF_LOCAL)
4089                 rtm->rtm_type = RTN_LOCAL;
4090         else if (rt->rt6i_flags & RTF_ANYCAST)
4091                 rtm->rtm_type = RTN_ANYCAST;
4092         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4093                 rtm->rtm_type = RTN_LOCAL;
4094         else
4095                 rtm->rtm_type = RTN_UNICAST;
4096         rtm->rtm_flags = 0;
4097         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4098         rtm->rtm_protocol = rt->rt6i_protocol;
4099
4100         if (rt->rt6i_flags & RTF_CACHE)
4101                 rtm->rtm_flags |= RTM_F_CLONED;
4102
4103         if (dst) {
4104                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4105                         goto nla_put_failure;
4106                 rtm->rtm_dst_len = 128;
4107         } else if (rtm->rtm_dst_len)
4108                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4109                         goto nla_put_failure;
4110 #ifdef CONFIG_IPV6_SUBTREES
4111         if (src) {
4112                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4113                         goto nla_put_failure;
4114                 rtm->rtm_src_len = 128;
4115         } else if (rtm->rtm_src_len &&
4116                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4117                 goto nla_put_failure;
4118 #endif
4119         if (iif) {
4120 #ifdef CONFIG_IPV6_MROUTE
4121                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4122                         int err = ip6mr_get_route(net, skb, rtm, portid);
4123
4124                         if (err == 0)
4125                                 return 0;
4126                         if (err < 0)
4127                                 goto nla_put_failure;
4128                 } else
4129 #endif
4130                         if (nla_put_u32(skb, RTA_IIF, iif))
4131                                 goto nla_put_failure;
4132         } else if (dst) {
4133                 struct in6_addr saddr_buf;
4134                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4135                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4136                         goto nla_put_failure;
4137         }
4138
4139         if (rt->rt6i_prefsrc.plen) {
4140                 struct in6_addr saddr_buf;
4141                 saddr_buf = rt->rt6i_prefsrc.addr;
4142                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4143                         goto nla_put_failure;
4144         }
4145
4146         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4147         if (rt->rt6i_pmtu)
4148                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4149         if (rtnetlink_put_metrics(skb, metrics) < 0)
4150                 goto nla_put_failure;
4151
4152         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4153                 goto nla_put_failure;
4154
4155         /* For multipath routes, walk the siblings list and add
4156          * each as a nexthop within RTA_MULTIPATH.
4157          */
4158         if (rt->rt6i_nsiblings) {
4159                 struct rt6_info *sibling, *next_sibling;
4160                 struct nlattr *mp;
4161
4162                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4163                 if (!mp)
4164                         goto nla_put_failure;
4165
4166                 if (rt6_add_nexthop(skb, rt) < 0)
4167                         goto nla_put_failure;
4168
4169                 list_for_each_entry_safe(sibling, next_sibling,
4170                                          &rt->rt6i_siblings, rt6i_siblings) {
4171                         if (rt6_add_nexthop(skb, sibling) < 0)
4172                                 goto nla_put_failure;
4173                 }
4174
4175                 nla_nest_end(skb, mp);
4176         } else {
4177                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4178                         goto nla_put_failure;
4179         }
4180
4181         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4182
4183         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4184                 goto nla_put_failure;
4185
4186         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4187                 goto nla_put_failure;
4188
4189
4190         nlmsg_end(skb, nlh);
4191         return 0;
4192
4193 nla_put_failure:
4194         nlmsg_cancel(skb, nlh);
4195         return -EMSGSIZE;
4196 }
4197
4198 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4199 {
4200         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4201         struct net *net = arg->net;
4202
4203         if (rt == net->ipv6.ip6_null_entry)
4204                 return 0;
4205
4206         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4207                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4208
4209                 /* user wants prefix routes only */
4210                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4211                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4212                         /* success since this is not a prefix route */
4213                         return 1;
4214                 }
4215         }
4216
4217         return rt6_fill_node(net,
4218                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4219                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4220                      NLM_F_MULTI);
4221 }
4222
4223 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4224                               struct netlink_ext_ack *extack)
4225 {
4226         struct net *net = sock_net(in_skb->sk);
4227         struct nlattr *tb[RTA_MAX+1];
4228         int err, iif = 0, oif = 0;
4229         struct dst_entry *dst;
4230         struct rt6_info *rt;
4231         struct sk_buff *skb;
4232         struct rtmsg *rtm;
4233         struct flowi6 fl6;
4234         bool fibmatch;
4235
4236         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4237                           extack);
4238         if (err < 0)
4239                 goto errout;
4240
4241         err = -EINVAL;
4242         memset(&fl6, 0, sizeof(fl6));
4243         rtm = nlmsg_data(nlh);
4244         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4245         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4246
4247         if (tb[RTA_SRC]) {
4248                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4249                         goto errout;
4250
4251                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4252         }
4253
4254         if (tb[RTA_DST]) {
4255                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4256                         goto errout;
4257
4258                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4259         }
4260
4261         if (tb[RTA_IIF])
4262                 iif = nla_get_u32(tb[RTA_IIF]);
4263
4264         if (tb[RTA_OIF])
4265                 oif = nla_get_u32(tb[RTA_OIF]);
4266
4267         if (tb[RTA_MARK])
4268                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4269
4270         if (tb[RTA_UID])
4271                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4272                                            nla_get_u32(tb[RTA_UID]));
4273         else
4274                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4275
4276         if (iif) {
4277                 struct net_device *dev;
4278                 int flags = 0;
4279
4280                 rcu_read_lock();
4281
4282                 dev = dev_get_by_index_rcu(net, iif);
4283                 if (!dev) {
4284                         rcu_read_unlock();
4285                         err = -ENODEV;
4286                         goto errout;
4287                 }
4288
4289                 fl6.flowi6_iif = iif;
4290
4291                 if (!ipv6_addr_any(&fl6.saddr))
4292                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4293
4294                 if (!fibmatch)
4295                         dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4296                 else
4297                         dst = ip6_route_lookup(net, &fl6, 0);
4298
4299                 rcu_read_unlock();
4300         } else {
4301                 fl6.flowi6_oif = oif;
4302
4303                 if (!fibmatch)
4304                         dst = ip6_route_output(net, NULL, &fl6);
4305                 else
4306                         dst = ip6_route_lookup(net, &fl6, 0);
4307         }
4308
4309
4310         rt = container_of(dst, struct rt6_info, dst);
4311         if (rt->dst.error) {
4312                 err = rt->dst.error;
4313                 ip6_rt_put(rt);
4314                 goto errout;
4315         }
4316
4317         if (rt == net->ipv6.ip6_null_entry) {
4318                 err = rt->dst.error;
4319                 ip6_rt_put(rt);
4320                 goto errout;
4321         }
4322
4323         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4324         if (!skb) {
4325                 ip6_rt_put(rt);
4326                 err = -ENOBUFS;
4327                 goto errout;
4328         }
4329
4330         skb_dst_set(skb, &rt->dst);
4331         if (fibmatch)
4332                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4333                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4334                                     nlh->nlmsg_seq, 0);
4335         else
4336                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4337                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4338                                     nlh->nlmsg_seq, 0);
4339         if (err < 0) {
4340                 kfree_skb(skb);
4341                 goto errout;
4342         }
4343
4344         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4345 errout:
4346         return err;
4347 }
4348
4349 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4350                      unsigned int nlm_flags)
4351 {
4352         struct sk_buff *skb;
4353         struct net *net = info->nl_net;
4354         u32 seq;
4355         int err;
4356
4357         err = -ENOBUFS;
4358         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4359
4360         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4361         if (!skb)
4362                 goto errout;
4363
4364         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4365                                 event, info->portid, seq, nlm_flags);
4366         if (err < 0) {
4367                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4368                 WARN_ON(err == -EMSGSIZE);
4369                 kfree_skb(skb);
4370                 goto errout;
4371         }
4372         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4373                     info->nlh, gfp_any());
4374         return;
4375 errout:
4376         if (err < 0)
4377                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4378 }
4379
4380 static int ip6_route_dev_notify(struct notifier_block *this,
4381                                 unsigned long event, void *ptr)
4382 {
4383         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4384         struct net *net = dev_net(dev);
4385
4386         if (!(dev->flags & IFF_LOOPBACK))
4387                 return NOTIFY_OK;
4388
4389         if (event == NETDEV_REGISTER) {
4390                 net->ipv6.ip6_null_entry->dst.dev = dev;
4391                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4392 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4393                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4394                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4395                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4396                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4397 #endif
4398          } else if (event == NETDEV_UNREGISTER &&
4399                     dev->reg_state != NETREG_UNREGISTERED) {
4400                 /* NETDEV_UNREGISTER could be fired for multiple times by
4401                  * netdev_wait_allrefs(). Make sure we only call this once.
4402                  */
4403                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4404 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4405                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4406                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4407 #endif
4408         }
4409
4410         return NOTIFY_OK;
4411 }
4412
4413 /*
4414  *      /proc
4415  */
4416
4417 #ifdef CONFIG_PROC_FS
4418
4419 static const struct file_operations ipv6_route_proc_fops = {
4420         .owner          = THIS_MODULE,
4421         .open           = ipv6_route_open,
4422         .read           = seq_read,
4423         .llseek         = seq_lseek,
4424         .release        = seq_release_net,
4425 };
4426
4427 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4428 {
4429         struct net *net = (struct net *)seq->private;
4430         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4431                    net->ipv6.rt6_stats->fib_nodes,
4432                    net->ipv6.rt6_stats->fib_route_nodes,
4433                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4434                    net->ipv6.rt6_stats->fib_rt_entries,
4435                    net->ipv6.rt6_stats->fib_rt_cache,
4436                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4437                    net->ipv6.rt6_stats->fib_discarded_routes);
4438
4439         return 0;
4440 }
4441
4442 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4443 {
4444         return single_open_net(inode, file, rt6_stats_seq_show);
4445 }
4446
4447 static const struct file_operations rt6_stats_seq_fops = {
4448         .owner   = THIS_MODULE,
4449         .open    = rt6_stats_seq_open,
4450         .read    = seq_read,
4451         .llseek  = seq_lseek,
4452         .release = single_release_net,
4453 };
4454 #endif  /* CONFIG_PROC_FS */
4455
4456 #ifdef CONFIG_SYSCTL
4457
4458 static
4459 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4460                               void __user *buffer, size_t *lenp, loff_t *ppos)
4461 {
4462         struct net *net;
4463         int delay;
4464         if (!write)
4465                 return -EINVAL;
4466
4467         net = (struct net *)ctl->extra1;
4468         delay = net->ipv6.sysctl.flush_delay;
4469         proc_dointvec(ctl, write, buffer, lenp, ppos);
4470         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4471         return 0;
4472 }
4473
4474 struct ctl_table ipv6_route_table_template[] = {
4475         {
4476                 .procname       =       "flush",
4477                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4478                 .maxlen         =       sizeof(int),
4479                 .mode           =       0200,
4480                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4481         },
4482         {
4483                 .procname       =       "gc_thresh",
4484                 .data           =       &ip6_dst_ops_template.gc_thresh,
4485                 .maxlen         =       sizeof(int),
4486                 .mode           =       0644,
4487                 .proc_handler   =       proc_dointvec,
4488         },
4489         {
4490                 .procname       =       "max_size",
4491                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4492                 .maxlen         =       sizeof(int),
4493                 .mode           =       0644,
4494                 .proc_handler   =       proc_dointvec,
4495         },
4496         {
4497                 .procname       =       "gc_min_interval",
4498                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4499                 .maxlen         =       sizeof(int),
4500                 .mode           =       0644,
4501                 .proc_handler   =       proc_dointvec_jiffies,
4502         },
4503         {
4504                 .procname       =       "gc_timeout",
4505                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4506                 .maxlen         =       sizeof(int),
4507                 .mode           =       0644,
4508                 .proc_handler   =       proc_dointvec_jiffies,
4509         },
4510         {
4511                 .procname       =       "gc_interval",
4512                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4513                 .maxlen         =       sizeof(int),
4514                 .mode           =       0644,
4515                 .proc_handler   =       proc_dointvec_jiffies,
4516         },
4517         {
4518                 .procname       =       "gc_elasticity",
4519                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4520                 .maxlen         =       sizeof(int),
4521                 .mode           =       0644,
4522                 .proc_handler   =       proc_dointvec,
4523         },
4524         {
4525                 .procname       =       "mtu_expires",
4526                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4527                 .maxlen         =       sizeof(int),
4528                 .mode           =       0644,
4529                 .proc_handler   =       proc_dointvec_jiffies,
4530         },
4531         {
4532                 .procname       =       "min_adv_mss",
4533                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4534                 .maxlen         =       sizeof(int),
4535                 .mode           =       0644,
4536                 .proc_handler   =       proc_dointvec,
4537         },
4538         {
4539                 .procname       =       "gc_min_interval_ms",
4540                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4541                 .maxlen         =       sizeof(int),
4542                 .mode           =       0644,
4543                 .proc_handler   =       proc_dointvec_ms_jiffies,
4544         },
4545         { }
4546 };
4547
4548 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4549 {
4550         struct ctl_table *table;
4551
4552         table = kmemdup(ipv6_route_table_template,
4553                         sizeof(ipv6_route_table_template),
4554                         GFP_KERNEL);
4555
4556         if (table) {
4557                 table[0].data = &net->ipv6.sysctl.flush_delay;
4558                 table[0].extra1 = net;
4559                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4560                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4561                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4562                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4563                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4564                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4565                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4566                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4567                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4568
4569                 /* Don't export sysctls to unprivileged users */
4570                 if (net->user_ns != &init_user_ns)
4571                         table[0].procname = NULL;
4572         }
4573
4574         return table;
4575 }
4576 #endif
4577
4578 static int __net_init ip6_route_net_init(struct net *net)
4579 {
4580         int ret = -ENOMEM;
4581
4582         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4583                sizeof(net->ipv6.ip6_dst_ops));
4584
4585         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4586                 goto out_ip6_dst_ops;
4587
4588         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4589                                            sizeof(*net->ipv6.ip6_null_entry),
4590                                            GFP_KERNEL);
4591         if (!net->ipv6.ip6_null_entry)
4592                 goto out_ip6_dst_entries;
4593         net->ipv6.ip6_null_entry->dst.path =
4594                 (struct dst_entry *)net->ipv6.ip6_null_entry;
4595         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4596         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4597                          ip6_template_metrics, true);
4598
4599 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4600         net->ipv6.fib6_has_custom_rules = false;
4601         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4602                                                sizeof(*net->ipv6.ip6_prohibit_entry),
4603                                                GFP_KERNEL);
4604         if (!net->ipv6.ip6_prohibit_entry)
4605                 goto out_ip6_null_entry;
4606         net->ipv6.ip6_prohibit_entry->dst.path =
4607                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4608         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4609         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4610                          ip6_template_metrics, true);
4611
4612         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4613                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
4614                                                GFP_KERNEL);
4615         if (!net->ipv6.ip6_blk_hole_entry)
4616                 goto out_ip6_prohibit_entry;
4617         net->ipv6.ip6_blk_hole_entry->dst.path =
4618                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4619         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4620         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4621                          ip6_template_metrics, true);
4622 #endif
4623
4624         net->ipv6.sysctl.flush_delay = 0;
4625         net->ipv6.sysctl.ip6_rt_max_size = 4096;
4626         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4627         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4628         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4629         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4630         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4631         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4632
4633         net->ipv6.ip6_rt_gc_expire = 30*HZ;
4634
4635         ret = 0;
4636 out:
4637         return ret;
4638
4639 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4640 out_ip6_prohibit_entry:
4641         kfree(net->ipv6.ip6_prohibit_entry);
4642 out_ip6_null_entry:
4643         kfree(net->ipv6.ip6_null_entry);
4644 #endif
4645 out_ip6_dst_entries:
4646         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4647 out_ip6_dst_ops:
4648         goto out;
4649 }
4650
4651 static void __net_exit ip6_route_net_exit(struct net *net)
4652 {
4653         kfree(net->ipv6.ip6_null_entry);
4654 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4655         kfree(net->ipv6.ip6_prohibit_entry);
4656         kfree(net->ipv6.ip6_blk_hole_entry);
4657 #endif
4658         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4659 }
4660
4661 static int __net_init ip6_route_net_init_late(struct net *net)
4662 {
4663 #ifdef CONFIG_PROC_FS
4664         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4665         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4666 #endif
4667         return 0;
4668 }
4669
4670 static void __net_exit ip6_route_net_exit_late(struct net *net)
4671 {
4672 #ifdef CONFIG_PROC_FS
4673         remove_proc_entry("ipv6_route", net->proc_net);
4674         remove_proc_entry("rt6_stats", net->proc_net);
4675 #endif
4676 }
4677
4678 static struct pernet_operations ip6_route_net_ops = {
4679         .init = ip6_route_net_init,
4680         .exit = ip6_route_net_exit,
4681 };
4682
4683 static int __net_init ipv6_inetpeer_init(struct net *net)
4684 {
4685         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4686
4687         if (!bp)
4688                 return -ENOMEM;
4689         inet_peer_base_init(bp);
4690         net->ipv6.peers = bp;
4691         return 0;
4692 }
4693
4694 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4695 {
4696         struct inet_peer_base *bp = net->ipv6.peers;
4697
4698         net->ipv6.peers = NULL;
4699         inetpeer_invalidate_tree(bp);
4700         kfree(bp);
4701 }
4702
4703 static struct pernet_operations ipv6_inetpeer_ops = {
4704         .init   =       ipv6_inetpeer_init,
4705         .exit   =       ipv6_inetpeer_exit,
4706 };
4707
4708 static struct pernet_operations ip6_route_net_late_ops = {
4709         .init = ip6_route_net_init_late,
4710         .exit = ip6_route_net_exit_late,
4711 };
4712
4713 static struct notifier_block ip6_route_dev_notifier = {
4714         .notifier_call = ip6_route_dev_notify,
4715         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4716 };
4717
4718 void __init ip6_route_init_special_entries(void)
4719 {
4720         /* Registering of the loopback is done before this portion of code,
4721          * the loopback reference in rt6_info will not be taken, do it
4722          * manually for init_net */
4723         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4724         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4725   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4726         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4727         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4728         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4729         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4730   #endif
4731 }
4732
4733 int __init ip6_route_init(void)
4734 {
4735         int ret;
4736         int cpu;
4737
4738         ret = -ENOMEM;
4739         ip6_dst_ops_template.kmem_cachep =
4740                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4741                                   SLAB_HWCACHE_ALIGN, NULL);
4742         if (!ip6_dst_ops_template.kmem_cachep)
4743                 goto out;
4744
4745         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4746         if (ret)
4747                 goto out_kmem_cache;
4748
4749         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4750         if (ret)
4751                 goto out_dst_entries;
4752
4753         ret = register_pernet_subsys(&ip6_route_net_ops);
4754         if (ret)
4755                 goto out_register_inetpeer;
4756
4757         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4758
4759         ret = fib6_init();
4760         if (ret)
4761                 goto out_register_subsys;
4762
4763         ret = xfrm6_init();
4764         if (ret)
4765                 goto out_fib6_init;
4766
4767         ret = fib6_rules_init();
4768         if (ret)
4769                 goto xfrm6_init;
4770
4771         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4772         if (ret)
4773                 goto fib6_rules_init;
4774
4775         ret = -ENOBUFS;
4776         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4777             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4778             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4779                             RTNL_FLAG_DOIT_UNLOCKED))
4780                 goto out_register_late_subsys;
4781
4782         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4783         if (ret)
4784                 goto out_register_late_subsys;
4785
4786         for_each_possible_cpu(cpu) {
4787                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4788
4789                 INIT_LIST_HEAD(&ul->head);
4790                 spin_lock_init(&ul->lock);
4791         }
4792
4793 out:
4794         return ret;
4795
4796 out_register_late_subsys:
4797         unregister_pernet_subsys(&ip6_route_net_late_ops);
4798 fib6_rules_init:
4799         fib6_rules_cleanup();
4800 xfrm6_init:
4801         xfrm6_fini();
4802 out_fib6_init:
4803         fib6_gc_cleanup();
4804 out_register_subsys:
4805         unregister_pernet_subsys(&ip6_route_net_ops);
4806 out_register_inetpeer:
4807         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4808 out_dst_entries:
4809         dst_entries_destroy(&ip6_dst_blackhole_ops);
4810 out_kmem_cache:
4811         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4812         goto out;
4813 }
4814
4815 void ip6_route_cleanup(void)
4816 {
4817         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4818         unregister_pernet_subsys(&ip6_route_net_late_ops);
4819         fib6_rules_cleanup();
4820         xfrm6_fini();
4821         fib6_gc_cleanup();
4822         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4823         unregister_pernet_subsys(&ip6_route_net_ops);
4824         dst_entries_destroy(&ip6_dst_blackhole_ops);
4825         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4826 }