Merge branch 'for-4.15-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj...
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Result codes of the neighbour reachability check used while scoring
 * routes.  Every failure code is negative so callers can test "< 0".
 */
enum rt6_nud_state {
        /* Route must not be used; find_match() skips it. */
        RT6_NUD_FAIL_HARD       = -3,
        /* Neighbour entry is in NUD_FAILED state (see rt6_check_neigh()). */
        RT6_NUD_FAIL_PROBE      = -2,
        /* No usable neighbour; find_match() scores it 0 and requests round-robin. */
        RT6_NUD_FAIL_DO_RR      = -1,
        /* Neighbour check passed or is not applicable to this route. */
        RT6_NUD_SUCCEED         = 1,
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Helpers used by rt6_route_rcv() to create or look up the route that
 * corresponds to a received route information option.
 */
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix,
                                           int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev,
                                           unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix,
                                           int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev);
#endif /* CONFIG_IPV6_ROUTE_INFO */
123
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(rt->dst.from);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* dst_ops->redirect handler for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
                                      struct sock *sk,
                                      struct sk_buff *skb)
{
        /* nothing to do */
}
281
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325         .rt6i_protocol  = RTPROT_KERNEL,
326         .rt6i_metric    = ~(u32) 0,
327         .rt6i_ref       = ATOMIC_INIT(1),
328 };
329
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340         .rt6i_protocol  = RTPROT_KERNEL,
341         .rt6i_metric    = ~(u32) 0,
342         .rt6i_ref       = ATOMIC_INIT(1),
343 };
344
345 #endif
346
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct dst_entry *from = dst->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         dst->from = NULL;
413         dst_release(from);
414 }
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                        rt6_check_expired((struct rt6_info *)rt->dst.from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         /* We might have already computed the hash for ICMPv6 errors. In such
461          * case it will always be non-zero. Otherwise now is the time to do it.
462          */
463         if (!fl6->mp_hash)
464                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465
466         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467         /* Don't change the route, if route_choosen == 0
468          * (siblings does not include ourself)
469          */
470         if (route_choosen)
471                 list_for_each_entry_safe(sibling, next_sibling,
472                                 &match->rt6i_siblings, rt6i_siblings) {
473                         route_choosen--;
474                         if (route_choosen == 0) {
475                                 struct inet6_dev *idev = sibling->rt6i_idev;
476
477                                 if (!netif_carrier_ok(sibling->dst.dev) &&
478                                     idev->cnf.ignore_routes_with_linkdown)
479                                         break;
480                                 if (rt6_score_route(sibling, oif, strict) < 0)
481                                         break;
482                                 match = sibling;
483                                 break;
484                         }
485                 }
486         return match;
487 }
488
489 /*
490  *      Route lookup. rcu_read_lock() should be held.
491  */
492
493 static inline struct rt6_info *rt6_device_match(struct net *net,
494                                                     struct rt6_info *rt,
495                                                     const struct in6_addr *saddr,
496                                                     int oif,
497                                                     int flags)
498 {
499         struct rt6_info *local = NULL;
500         struct rt6_info *sprt;
501
502         if (!oif && ipv6_addr_any(saddr))
503                 goto out;
504
505         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
506                 struct net_device *dev = sprt->dst.dev;
507
508                 if (oif) {
509                         if (dev->ifindex == oif)
510                                 return sprt;
511                         if (dev->flags & IFF_LOOPBACK) {
512                                 if (!sprt->rt6i_idev ||
513                                     sprt->rt6i_idev->dev->ifindex != oif) {
514                                         if (flags & RT6_LOOKUP_F_IFACE)
515                                                 continue;
516                                         if (local &&
517                                             local->rt6i_idev->dev->ifindex == oif)
518                                                 continue;
519                                 }
520                                 local = sprt;
521                         }
522                 } else {
523                         if (ipv6_chk_addr(net, saddr, dev,
524                                           flags & RT6_LOOKUP_F_IFACE))
525                                 return sprt;
526                 }
527         }
528
529         if (oif) {
530                 if (local)
531                         return local;
532
533                 if (flags & RT6_LOOKUP_F_IFACE)
534                         return net->ipv6.ip6_null_entry;
535         }
536 out:
537         return rt;
538 }
539
540 #ifdef CONFIG_IPV6_ROUTER_PREF
541 struct __rt6_probe_work {
542         struct work_struct work;
543         struct in6_addr target;
544         struct net_device *dev;
545 };
546
547 static void rt6_probe_deferred(struct work_struct *w)
548 {
549         struct in6_addr mcaddr;
550         struct __rt6_probe_work *work =
551                 container_of(w, struct __rt6_probe_work, work);
552
553         addrconf_addr_solict_mult(&work->target, &mcaddr);
554         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
555         dev_put(work->dev);
556         kfree(work);
557 }
558
559 static void rt6_probe(struct rt6_info *rt)
560 {
561         struct __rt6_probe_work *work;
562         struct neighbour *neigh;
563         /*
564          * Okay, this does not seem to be appropriate
565          * for now, however, we need to check if it
566          * is really so; aka Router Reachability Probing.
567          *
568          * Router Reachability Probe MUST be rate-limited
569          * to no more than one per minute.
570          */
571         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
572                 return;
573         rcu_read_lock_bh();
574         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
575         if (neigh) {
576                 if (neigh->nud_state & NUD_VALID)
577                         goto out;
578
579                 work = NULL;
580                 write_lock(&neigh->lock);
581                 if (!(neigh->nud_state & NUD_VALID) &&
582                     time_after(jiffies,
583                                neigh->updated +
584                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
585                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
586                         if (work)
587                                 __neigh_set_probe_once(neigh);
588                 }
589                 write_unlock(&neigh->lock);
590         } else {
591                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
592         }
593
594         if (work) {
595                 INIT_WORK(&work->work, rt6_probe_deferred);
596                 work->target = rt->rt6i_gateway;
597                 dev_hold(rt->dst.dev);
598                 work->dev = rt->dst.dev;
599                 schedule_work(&work->work);
600         }
601
602 out:
603         rcu_read_unlock_bh();
604 }
605 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
        /* nothing to do */
}
609 #endif
610
611 /*
612  * Default Router Selection (RFC 2461 6.3.6)
613  */
614 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
615 {
616         struct net_device *dev = rt->dst.dev;
617         if (!oif || dev->ifindex == oif)
618                 return 2;
619         if ((dev->flags & IFF_LOOPBACK) &&
620             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
621                 return 1;
622         return 0;
623 }
624
625 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
626 {
627         struct neighbour *neigh;
628         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
629
630         if (rt->rt6i_flags & RTF_NONEXTHOP ||
631             !(rt->rt6i_flags & RTF_GATEWAY))
632                 return RT6_NUD_SUCCEED;
633
634         rcu_read_lock_bh();
635         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
636         if (neigh) {
637                 read_lock(&neigh->lock);
638                 if (neigh->nud_state & NUD_VALID)
639                         ret = RT6_NUD_SUCCEED;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641                 else if (!(neigh->nud_state & NUD_FAILED))
642                         ret = RT6_NUD_SUCCEED;
643                 else
644                         ret = RT6_NUD_FAIL_PROBE;
645 #endif
646                 read_unlock(&neigh->lock);
647         } else {
648                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
649                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
650         }
651         rcu_read_unlock_bh();
652
653         return ret;
654 }
655
656 static int rt6_score_route(struct rt6_info *rt, int oif,
657                            int strict)
658 {
659         int m;
660
661         m = rt6_check_dev(rt, oif);
662         if (!m && (strict & RT6_LOOKUP_F_IFACE))
663                 return RT6_NUD_FAIL_HARD;
664 #ifdef CONFIG_IPV6_ROUTER_PREF
665         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
666 #endif
667         if (strict & RT6_LOOKUP_F_REACHABLE) {
668                 int n = rt6_check_neigh(rt);
669                 if (n < 0)
670                         return n;
671         }
672         return m;
673 }
674
675 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
676                                    int *mpri, struct rt6_info *match,
677                                    bool *do_rr)
678 {
679         int m;
680         bool match_do_rr = false;
681         struct inet6_dev *idev = rt->rt6i_idev;
682         struct net_device *dev = rt->dst.dev;
683
684         if (dev && !netif_carrier_ok(dev) &&
685             idev->cnf.ignore_routes_with_linkdown &&
686             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
687                 goto out;
688
689         if (rt6_check_expired(rt))
690                 goto out;
691
692         m = rt6_score_route(rt, oif, strict);
693         if (m == RT6_NUD_FAIL_DO_RR) {
694                 match_do_rr = true;
695                 m = 0; /* lowest valid score */
696         } else if (m == RT6_NUD_FAIL_HARD) {
697                 goto out;
698         }
699
700         if (strict & RT6_LOOKUP_F_REACHABLE)
701                 rt6_probe(rt);
702
703         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
704         if (m > *mpri) {
705                 *do_rr = match_do_rr;
706                 *mpri = m;
707                 match = rt;
708         }
709 out:
710         return match;
711 }
712
713 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
714                                      struct rt6_info *leaf,
715                                      struct rt6_info *rr_head,
716                                      u32 metric, int oif, int strict,
717                                      bool *do_rr)
718 {
719         struct rt6_info *rt, *match, *cont;
720         int mpri = -1;
721
722         match = NULL;
723         cont = NULL;
724         for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
725                 if (rt->rt6i_metric != metric) {
726                         cont = rt;
727                         break;
728                 }
729
730                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
731         }
732
733         for (rt = leaf; rt && rt != rr_head;
734              rt = rcu_dereference(rt->dst.rt6_next)) {
735                 if (rt->rt6i_metric != metric) {
736                         cont = rt;
737                         break;
738                 }
739
740                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
741         }
742
743         if (match || !cont)
744                 return match;
745
746         for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
747                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
748
749         return match;
750 }
751
752 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
753                                    int oif, int strict)
754 {
755         struct rt6_info *leaf = rcu_dereference(fn->leaf);
756         struct rt6_info *match, *rt0;
757         bool do_rr = false;
758         int key_plen;
759
760         if (!leaf || leaf == net->ipv6.ip6_null_entry)
761                 return net->ipv6.ip6_null_entry;
762
763         rt0 = rcu_dereference(fn->rr_ptr);
764         if (!rt0)
765                 rt0 = leaf;
766
767         /* Double check to make sure fn is not an intermediate node
768          * and fn->leaf does not points to its child's leaf
769          * (This might happen if all routes under fn are deleted from
770          * the tree and fib6_repair_tree() is called on the node.)
771          */
772         key_plen = rt0->rt6i_dst.plen;
773 #ifdef CONFIG_IPV6_SUBTREES
774         if (rt0->rt6i_src.plen)
775                 key_plen = rt0->rt6i_src.plen;
776 #endif
777         if (fn->fn_bit != key_plen)
778                 return net->ipv6.ip6_null_entry;
779
780         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
781                              &do_rr);
782
783         if (do_rr) {
784                 struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
785
786                 /* no entries matched; do round-robin */
787                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
788                         next = leaf;
789
790                 if (next != rt0) {
791                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
792                         /* make sure next is not being deleted from the tree */
793                         if (next->rt6i_node)
794                                 rcu_assign_pointer(fn->rr_ptr, next);
795                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
796                 }
797         }
798
799         return match ? match : net->ipv6.ip6_null_entry;
800 }
801
802 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
803 {
804         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
805 }
806
807 #ifdef CONFIG_IPV6_ROUTE_INFO
808 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
809                   const struct in6_addr *gwaddr)
810 {
811         struct net *net = dev_net(dev);
812         struct route_info *rinfo = (struct route_info *) opt;
813         struct in6_addr prefix_buf, *prefix;
814         unsigned int pref;
815         unsigned long lifetime;
816         struct rt6_info *rt;
817
818         if (len < sizeof(struct route_info)) {
819                 return -EINVAL;
820         }
821
822         /* Sanity check for prefix_len and length */
823         if (rinfo->length > 3) {
824                 return -EINVAL;
825         } else if (rinfo->prefix_len > 128) {
826                 return -EINVAL;
827         } else if (rinfo->prefix_len > 64) {
828                 if (rinfo->length < 2) {
829                         return -EINVAL;
830                 }
831         } else if (rinfo->prefix_len > 0) {
832                 if (rinfo->length < 1) {
833                         return -EINVAL;
834                 }
835         }
836
837         pref = rinfo->route_pref;
838         if (pref == ICMPV6_ROUTER_PREF_INVALID)
839                 return -EINVAL;
840
841         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
842
843         if (rinfo->length == 3)
844                 prefix = (struct in6_addr *)rinfo->prefix;
845         else {
846                 /* this function is safe */
847                 ipv6_addr_prefix(&prefix_buf,
848                                  (struct in6_addr *)rinfo->prefix,
849                                  rinfo->prefix_len);
850                 prefix = &prefix_buf;
851         }
852
853         if (rinfo->prefix_len == 0)
854                 rt = rt6_get_dflt_router(gwaddr, dev);
855         else
856                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
857                                         gwaddr, dev);
858
859         if (rt && !lifetime) {
860                 ip6_del_rt(rt);
861                 rt = NULL;
862         }
863
864         if (!rt && lifetime)
865                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
866                                         dev, pref);
867         else if (rt)
868                 rt->rt6i_flags = RTF_ROUTEINFO |
869                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
870
871         if (rt) {
872                 if (!addrconf_finite_timeout(lifetime))
873                         rt6_clean_expires(rt);
874                 else
875                         rt6_set_expires(rt, jiffies + HZ * lifetime);
876
877                 ip6_rt_put(rt);
878         }
879         return 0;
880 }
881 #endif
882
883 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
884                                         struct in6_addr *saddr)
885 {
886         struct fib6_node *pn, *sn;
887         while (1) {
888                 if (fn->fn_flags & RTN_TL_ROOT)
889                         return NULL;
890                 pn = rcu_dereference(fn->parent);
891                 sn = FIB6_SUBTREE(pn);
892                 if (sn && sn != fn)
893                         fn = fib6_lookup(sn, NULL, saddr);
894                 else
895                         fn = pn;
896                 if (fn->fn_flags & RTN_RTINFO)
897                         return fn;
898         }
899 }
900
901 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
902                           bool null_fallback)
903 {
904         struct rt6_info *rt = *prt;
905
906         if (dst_hold_safe(&rt->dst))
907                 return true;
908         if (null_fallback) {
909                 rt = net->ipv6.ip6_null_entry;
910                 dst_hold(&rt->dst);
911         } else {
912                 rt = NULL;
913         }
914         *prt = rt;
915         return false;
916 }
917
918 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
919                                              struct fib6_table *table,
920                                              struct flowi6 *fl6, int flags)
921 {
922         struct rt6_info *rt, *rt_cache;
923         struct fib6_node *fn;
924
925         rcu_read_lock();
926         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
927 restart:
928         rt = rcu_dereference(fn->leaf);
929         if (!rt) {
930                 rt = net->ipv6.ip6_null_entry;
931         } else {
932                 rt = rt6_device_match(net, rt, &fl6->saddr,
933                                       fl6->flowi6_oif, flags);
934                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
935                         rt = rt6_multipath_select(rt, fl6,
936                                                   fl6->flowi6_oif, flags);
937         }
938         if (rt == net->ipv6.ip6_null_entry) {
939                 fn = fib6_backtrack(fn, &fl6->saddr);
940                 if (fn)
941                         goto restart;
942         }
943         /* Search through exception table */
944         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
945         if (rt_cache)
946                 rt = rt_cache;
947
948         if (ip6_hold_safe(net, &rt, true))
949                 dst_use_noref(&rt->dst, jiffies);
950
951         rcu_read_unlock();
952
953         trace_fib6_table_lookup(net, rt, table, fl6);
954
955         return rt;
956
957 }
958
959 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
960                                     int flags)
961 {
962         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
963 }
964 EXPORT_SYMBOL_GPL(ip6_route_lookup);
965
966 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
967                             const struct in6_addr *saddr, int oif, int strict)
968 {
969         struct flowi6 fl6 = {
970                 .flowi6_oif = oif,
971                 .daddr = *daddr,
972         };
973         struct dst_entry *dst;
974         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
975
976         if (saddr) {
977                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
978                 flags |= RT6_LOOKUP_F_HAS_SADDR;
979         }
980
981         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
982         if (dst->error == 0)
983                 return (struct rt6_info *) dst;
984
985         dst_release(dst);
986
987         return NULL;
988 }
989 EXPORT_SYMBOL(rt6_lookup);
990
991 /* ip6_ins_rt is called with FREE table->tb6_lock.
992  * It takes new route entry, the addition fails by any reason the
993  * route is released.
994  * Caller must hold dst before calling it.
995  */
996
997 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
998                         struct mx6_config *mxc,
999                         struct netlink_ext_ack *extack)
1000 {
1001         int err;
1002         struct fib6_table *table;
1003
1004         table = rt->rt6i_table;
1005         spin_lock_bh(&table->tb6_lock);
1006         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1007         spin_unlock_bh(&table->tb6_lock);
1008
1009         return err;
1010 }
1011
1012 int ip6_ins_rt(struct rt6_info *rt)
1013 {
1014         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1015         struct mx6_config mxc = { .mx = NULL, };
1016
1017         /* Hold dst to account for the reference from the fib6 tree */
1018         dst_hold(&rt->dst);
1019         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1020 }
1021
1022 /* called with rcu_lock held */
1023 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1024 {
1025         struct net_device *dev = rt->dst.dev;
1026
1027         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1028                 /* for copies of local routes, dst->dev needs to be the
1029                  * device if it is a master device, the master device if
1030                  * device is enslaved, and the loopback as the default
1031                  */
1032                 if (netif_is_l3_slave(dev) &&
1033                     !rt6_need_strict(&rt->rt6i_dst.addr))
1034                         dev = l3mdev_master_dev_rcu(dev);
1035                 else if (!netif_is_l3_master(dev))
1036                         dev = dev_net(dev)->loopback_dev;
1037                 /* last case is netif_is_l3_master(dev) is true in which
1038                  * case we want dev returned to be dev
1039                  */
1040         }
1041
1042         return dev;
1043 }
1044
1045 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1046                                            const struct in6_addr *daddr,
1047                                            const struct in6_addr *saddr)
1048 {
1049         struct net_device *dev;
1050         struct rt6_info *rt;
1051
1052         /*
1053          *      Clone the route.
1054          */
1055
1056         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1057                 ort = (struct rt6_info *)ort->dst.from;
1058
1059         rcu_read_lock();
1060         dev = ip6_rt_get_dev_rcu(ort);
1061         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1062         rcu_read_unlock();
1063         if (!rt)
1064                 return NULL;
1065
1066         ip6_rt_copy_init(rt, ort);
1067         rt->rt6i_flags |= RTF_CACHE;
1068         rt->rt6i_metric = 0;
1069         rt->dst.flags |= DST_HOST;
1070         rt->rt6i_dst.addr = *daddr;
1071         rt->rt6i_dst.plen = 128;
1072
1073         if (!rt6_is_gw_or_nonexthop(ort)) {
1074                 if (ort->rt6i_dst.plen != 128 &&
1075                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1076                         rt->rt6i_flags |= RTF_ANYCAST;
1077 #ifdef CONFIG_IPV6_SUBTREES
1078                 if (rt->rt6i_src.plen && saddr) {
1079                         rt->rt6i_src.addr = *saddr;
1080                         rt->rt6i_src.plen = 128;
1081                 }
1082 #endif
1083         }
1084
1085         return rt;
1086 }
1087
1088 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1089 {
1090         struct net_device *dev;
1091         struct rt6_info *pcpu_rt;
1092
1093         rcu_read_lock();
1094         dev = ip6_rt_get_dev_rcu(rt);
1095         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1096         rcu_read_unlock();
1097         if (!pcpu_rt)
1098                 return NULL;
1099         ip6_rt_copy_init(pcpu_rt, rt);
1100         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1101         pcpu_rt->rt6i_flags |= RTF_PCPU;
1102         return pcpu_rt;
1103 }
1104
1105 /* It should be called with rcu_read_lock() acquired */
1106 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1107 {
1108         struct rt6_info *pcpu_rt, **p;
1109
1110         p = this_cpu_ptr(rt->rt6i_pcpu);
1111         pcpu_rt = *p;
1112
1113         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1114                 rt6_dst_from_metrics_check(pcpu_rt);
1115
1116         return pcpu_rt;
1117 }
1118
1119 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1120 {
1121         struct rt6_info *pcpu_rt, *prev, **p;
1122
1123         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1124         if (!pcpu_rt) {
1125                 struct net *net = dev_net(rt->dst.dev);
1126
1127                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1128                 return net->ipv6.ip6_null_entry;
1129         }
1130
1131         dst_hold(&pcpu_rt->dst);
1132         p = this_cpu_ptr(rt->rt6i_pcpu);
1133         prev = cmpxchg(p, NULL, pcpu_rt);
1134         BUG_ON(prev);
1135
1136         rt6_dst_from_metrics_check(pcpu_rt);
1137         return pcpu_rt;
1138 }
1139
1140 /* exception hash table implementation
1141  */
1142 static DEFINE_SPINLOCK(rt6_exception_lock);
1143
1144 /* Remove rt6_ex from hash table and free the memory
1145  * Caller must hold rt6_exception_lock
1146  */
1147 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1148                                  struct rt6_exception *rt6_ex)
1149 {
1150         struct net *net;
1151
1152         if (!bucket || !rt6_ex)
1153                 return;
1154
1155         net = dev_net(rt6_ex->rt6i->dst.dev);
1156         rt6_ex->rt6i->rt6i_node = NULL;
1157         hlist_del_rcu(&rt6_ex->hlist);
1158         rt6_release(rt6_ex->rt6i);
1159         kfree_rcu(rt6_ex, rcu);
1160         WARN_ON_ONCE(!bucket->depth);
1161         bucket->depth--;
1162         net->ipv6.rt6_stats->fib_rt_cache--;
1163 }
1164
1165 /* Remove oldest rt6_ex in bucket and free the memory
1166  * Caller must hold rt6_exception_lock
1167  */
1168 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1169 {
1170         struct rt6_exception *rt6_ex, *oldest = NULL;
1171
1172         if (!bucket)
1173                 return;
1174
1175         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1176                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1177                         oldest = rt6_ex;
1178         }
1179         rt6_remove_exception(bucket, oldest);
1180 }
1181
1182 static u32 rt6_exception_hash(const struct in6_addr *dst,
1183                               const struct in6_addr *src)
1184 {
1185         static u32 seed __read_mostly;
1186         u32 val;
1187
1188         net_get_random_once(&seed, sizeof(seed));
1189         val = jhash(dst, sizeof(*dst), seed);
1190
1191 #ifdef CONFIG_IPV6_SUBTREES
1192         if (src)
1193                 val = jhash(src, sizeof(*src), val);
1194 #endif
1195         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1196 }
1197
1198 /* Helper function to find the cached rt in the hash table
1199  * and update bucket pointer to point to the bucket for this
1200  * (daddr, saddr) pair
1201  * Caller must hold rt6_exception_lock
1202  */
1203 static struct rt6_exception *
1204 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1205                               const struct in6_addr *daddr,
1206                               const struct in6_addr *saddr)
1207 {
1208         struct rt6_exception *rt6_ex;
1209         u32 hval;
1210
1211         if (!(*bucket) || !daddr)
1212                 return NULL;
1213
1214         hval = rt6_exception_hash(daddr, saddr);
1215         *bucket += hval;
1216
1217         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1218                 struct rt6_info *rt6 = rt6_ex->rt6i;
1219                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1220
1221 #ifdef CONFIG_IPV6_SUBTREES
1222                 if (matched && saddr)
1223                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1224 #endif
1225                 if (matched)
1226                         return rt6_ex;
1227         }
1228         return NULL;
1229 }
1230
1231 /* Helper function to find the cached rt in the hash table
1232  * and update bucket pointer to point to the bucket for this
1233  * (daddr, saddr) pair
1234  * Caller must hold rcu_read_lock()
1235  */
1236 static struct rt6_exception *
1237 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1238                          const struct in6_addr *daddr,
1239                          const struct in6_addr *saddr)
1240 {
1241         struct rt6_exception *rt6_ex;
1242         u32 hval;
1243
1244         WARN_ON_ONCE(!rcu_read_lock_held());
1245
1246         if (!(*bucket) || !daddr)
1247                 return NULL;
1248
1249         hval = rt6_exception_hash(daddr, saddr);
1250         *bucket += hval;
1251
1252         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1253                 struct rt6_info *rt6 = rt6_ex->rt6i;
1254                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1255
1256 #ifdef CONFIG_IPV6_SUBTREES
1257                 if (matched && saddr)
1258                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1259 #endif
1260                 if (matched)
1261                         return rt6_ex;
1262         }
1263         return NULL;
1264 }
1265
1266 static int rt6_insert_exception(struct rt6_info *nrt,
1267                                 struct rt6_info *ort)
1268 {
1269         struct net *net = dev_net(ort->dst.dev);
1270         struct rt6_exception_bucket *bucket;
1271         struct in6_addr *src_key = NULL;
1272         struct rt6_exception *rt6_ex;
1273         int err = 0;
1274
1275         /* ort can't be a cache or pcpu route */
1276         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1277                 ort = (struct rt6_info *)ort->dst.from;
1278         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1279
1280         spin_lock_bh(&rt6_exception_lock);
1281
1282         if (ort->exception_bucket_flushed) {
1283                 err = -EINVAL;
1284                 goto out;
1285         }
1286
1287         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1288                                         lockdep_is_held(&rt6_exception_lock));
1289         if (!bucket) {
1290                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1291                                  GFP_ATOMIC);
1292                 if (!bucket) {
1293                         err = -ENOMEM;
1294                         goto out;
1295                 }
1296                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1297         }
1298
1299 #ifdef CONFIG_IPV6_SUBTREES
1300         /* rt6i_src.plen != 0 indicates ort is in subtree
1301          * and exception table is indexed by a hash of
1302          * both rt6i_dst and rt6i_src.
1303          * Otherwise, the exception table is indexed by
1304          * a hash of only rt6i_dst.
1305          */
1306         if (ort->rt6i_src.plen)
1307                 src_key = &nrt->rt6i_src.addr;
1308 #endif
1309
1310         /* Update rt6i_prefsrc as it could be changed
1311          * in rt6_remove_prefsrc()
1312          */
1313         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1314         /* rt6_mtu_change() might lower mtu on ort.
1315          * Only insert this exception route if its mtu
1316          * is less than ort's mtu value.
1317          */
1318         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1319                 err = -EINVAL;
1320                 goto out;
1321         }
1322
1323         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1324                                                src_key);
1325         if (rt6_ex)
1326                 rt6_remove_exception(bucket, rt6_ex);
1327
1328         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1329         if (!rt6_ex) {
1330                 err = -ENOMEM;
1331                 goto out;
1332         }
1333         rt6_ex->rt6i = nrt;
1334         rt6_ex->stamp = jiffies;
1335         atomic_inc(&nrt->rt6i_ref);
1336         nrt->rt6i_node = ort->rt6i_node;
1337         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1338         bucket->depth++;
1339         net->ipv6.rt6_stats->fib_rt_cache++;
1340
1341         if (bucket->depth > FIB6_MAX_DEPTH)
1342                 rt6_exception_remove_oldest(bucket);
1343
1344 out:
1345         spin_unlock_bh(&rt6_exception_lock);
1346
1347         /* Update fn->fn_sernum to invalidate all cached dst */
1348         if (!err) {
1349                 fib6_update_sernum(ort);
1350                 fib6_force_start_gc(net);
1351         }
1352
1353         return err;
1354 }
1355
1356 void rt6_flush_exceptions(struct rt6_info *rt)
1357 {
1358         struct rt6_exception_bucket *bucket;
1359         struct rt6_exception *rt6_ex;
1360         struct hlist_node *tmp;
1361         int i;
1362
1363         spin_lock_bh(&rt6_exception_lock);
1364         /* Prevent rt6_insert_exception() to recreate the bucket list */
1365         rt->exception_bucket_flushed = 1;
1366
1367         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1368                                     lockdep_is_held(&rt6_exception_lock));
1369         if (!bucket)
1370                 goto out;
1371
1372         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1373                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1374                         rt6_remove_exception(bucket, rt6_ex);
1375                 WARN_ON_ONCE(bucket->depth);
1376                 bucket++;
1377         }
1378
1379 out:
1380         spin_unlock_bh(&rt6_exception_lock);
1381 }
1382
1383 /* Find cached rt in the hash table inside passed in rt
1384  * Caller has to hold rcu_read_lock()
1385  */
1386 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1387                                            struct in6_addr *daddr,
1388                                            struct in6_addr *saddr)
1389 {
1390         struct rt6_exception_bucket *bucket;
1391         struct in6_addr *src_key = NULL;
1392         struct rt6_exception *rt6_ex;
1393         struct rt6_info *res = NULL;
1394
1395         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1396
1397 #ifdef CONFIG_IPV6_SUBTREES
1398         /* rt6i_src.plen != 0 indicates rt is in subtree
1399          * and exception table is indexed by a hash of
1400          * both rt6i_dst and rt6i_src.
1401          * Otherwise, the exception table is indexed by
1402          * a hash of only rt6i_dst.
1403          */
1404         if (rt->rt6i_src.plen)
1405                 src_key = saddr;
1406 #endif
1407         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1408
1409         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1410                 res = rt6_ex->rt6i;
1411
1412         return res;
1413 }
1414
1415 /* Remove the passed in cached rt from the hash table that contains it */
1416 int rt6_remove_exception_rt(struct rt6_info *rt)
1417 {
1418         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1419         struct rt6_exception_bucket *bucket;
1420         struct in6_addr *src_key = NULL;
1421         struct rt6_exception *rt6_ex;
1422         int err;
1423
1424         if (!from ||
1425             !(rt->rt6i_flags & RTF_CACHE))
1426                 return -EINVAL;
1427
1428         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1429                 return -ENOENT;
1430
1431         spin_lock_bh(&rt6_exception_lock);
1432         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1433                                     lockdep_is_held(&rt6_exception_lock));
1434 #ifdef CONFIG_IPV6_SUBTREES
1435         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1436          * and exception table is indexed by a hash of
1437          * both rt6i_dst and rt6i_src.
1438          * Otherwise, the exception table is indexed by
1439          * a hash of only rt6i_dst.
1440          */
1441         if (from->rt6i_src.plen)
1442                 src_key = &rt->rt6i_src.addr;
1443 #endif
1444         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1445                                                &rt->rt6i_dst.addr,
1446                                                src_key);
1447         if (rt6_ex) {
1448                 rt6_remove_exception(bucket, rt6_ex);
1449                 err = 0;
1450         } else {
1451                 err = -ENOENT;
1452         }
1453
1454         spin_unlock_bh(&rt6_exception_lock);
1455         return err;
1456 }
1457
1458 /* Find rt6_ex which contains the passed in rt cache and
1459  * refresh its stamp
1460  */
1461 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1462 {
1463         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1464         struct rt6_exception_bucket *bucket;
1465         struct in6_addr *src_key = NULL;
1466         struct rt6_exception *rt6_ex;
1467
1468         if (!from ||
1469             !(rt->rt6i_flags & RTF_CACHE))
1470                 return;
1471
1472         rcu_read_lock();
1473         bucket = rcu_dereference(from->rt6i_exception_bucket);
1474
1475 #ifdef CONFIG_IPV6_SUBTREES
1476         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1477          * and exception table is indexed by a hash of
1478          * both rt6i_dst and rt6i_src.
1479          * Otherwise, the exception table is indexed by
1480          * a hash of only rt6i_dst.
1481          */
1482         if (from->rt6i_src.plen)
1483                 src_key = &rt->rt6i_src.addr;
1484 #endif
1485         rt6_ex = __rt6_find_exception_rcu(&bucket,
1486                                           &rt->rt6i_dst.addr,
1487                                           src_key);
1488         if (rt6_ex)
1489                 rt6_ex->stamp = jiffies;
1490
1491         rcu_read_unlock();
1492 }
1493
1494 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1495 {
1496         struct rt6_exception_bucket *bucket;
1497         struct rt6_exception *rt6_ex;
1498         int i;
1499
1500         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1501                                         lockdep_is_held(&rt6_exception_lock));
1502
1503         if (bucket) {
1504                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1505                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1506                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1507                         }
1508                         bucket++;
1509                 }
1510         }
1511 }
1512
1513 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1514 {
1515         struct rt6_exception_bucket *bucket;
1516         struct rt6_exception *rt6_ex;
1517         int i;
1518
1519         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1520                                         lockdep_is_held(&rt6_exception_lock));
1521
1522         if (bucket) {
1523                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1524                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1525                                 struct rt6_info *entry = rt6_ex->rt6i;
1526                                 /* For RTF_CACHE with rt6i_pmtu == 0
1527                                  * (i.e. a redirected route),
1528                                  * the metrics of its rt->dst.from has already
1529                                  * been updated.
1530                                  */
1531                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1532                                         entry->rt6i_pmtu = mtu;
1533                         }
1534                         bucket++;
1535                 }
1536         }
1537 }
1538
1539 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1540
1541 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1542                                         struct in6_addr *gateway)
1543 {
1544         struct rt6_exception_bucket *bucket;
1545         struct rt6_exception *rt6_ex;
1546         struct hlist_node *tmp;
1547         int i;
1548
1549         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1550                 return;
1551
1552         spin_lock_bh(&rt6_exception_lock);
1553         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1554                                      lockdep_is_held(&rt6_exception_lock));
1555
1556         if (bucket) {
1557                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1558                         hlist_for_each_entry_safe(rt6_ex, tmp,
1559                                                   &bucket->chain, hlist) {
1560                                 struct rt6_info *entry = rt6_ex->rt6i;
1561
1562                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1563                                     RTF_CACHE_GATEWAY &&
1564                                     ipv6_addr_equal(gateway,
1565                                                     &entry->rt6i_gateway)) {
1566                                         rt6_remove_exception(bucket, rt6_ex);
1567                                 }
1568                         }
1569                         bucket++;
1570                 }
1571         }
1572
1573         spin_unlock_bh(&rt6_exception_lock);
1574 }
1575
1576 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1577                                       struct rt6_exception *rt6_ex,
1578                                       struct fib6_gc_args *gc_args,
1579                                       unsigned long now)
1580 {
1581         struct rt6_info *rt = rt6_ex->rt6i;
1582
1583         /* we are pruning and obsoleting aged-out and non gateway exceptions
1584          * even if others have still references to them, so that on next
1585          * dst_check() such references can be dropped.
1586          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1587          * expired, independently from their aging, as per RFC 8201 section 4
1588          */
1589         if (!(rt->rt6i_flags & RTF_EXPIRES) &&
1590             time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1591                 RT6_TRACE("aging clone %p\n", rt);
1592                 rt6_remove_exception(bucket, rt6_ex);
1593                 return;
1594         } else if (rt->rt6i_flags & RTF_GATEWAY) {
1595                 struct neighbour *neigh;
1596                 __u8 neigh_flags = 0;
1597
1598                 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1599                 if (neigh) {
1600                         neigh_flags = neigh->flags;
1601                         neigh_release(neigh);
1602                 }
1603                 if (!(neigh_flags & NTF_ROUTER)) {
1604                         RT6_TRACE("purging route %p via non-router but gateway\n",
1605                                   rt);
1606                         rt6_remove_exception(bucket, rt6_ex);
1607                         return;
1608                 }
1609         } else if (__rt6_check_expired(rt)) {
1610                 RT6_TRACE("purging expired route %p\n", rt);
1611                 rt6_remove_exception(bucket, rt6_ex);
1612                 return;
1613         }
1614         gc_args->more++;
1615 }
1616
1617 void rt6_age_exceptions(struct rt6_info *rt,
1618                         struct fib6_gc_args *gc_args,
1619                         unsigned long now)
1620 {
1621         struct rt6_exception_bucket *bucket;
1622         struct rt6_exception *rt6_ex;
1623         struct hlist_node *tmp;
1624         int i;
1625
1626         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1627                 return;
1628
1629         spin_lock_bh(&rt6_exception_lock);
1630         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1631                                     lockdep_is_held(&rt6_exception_lock));
1632
1633         if (bucket) {
1634                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1635                         hlist_for_each_entry_safe(rt6_ex, tmp,
1636                                                   &bucket->chain, hlist) {
1637                                 rt6_age_examine_exception(bucket, rt6_ex,
1638                                                           gc_args, now);
1639                         }
1640                         bucket++;
1641                 }
1642         }
1643         spin_unlock_bh(&rt6_exception_lock);
1644 }
1645
/*
 * Core policy-routing lookup shared by the input and output paths in this
 * file (ip6_pol_route_input()/ip6_pol_route_output() pass the flow's
 * iif/oif as @oif).
 *
 * Steps, as visible below:
 *   1. build the "strict" selection flags from @flags and from the
 *      devconf_all->forwarding setting;
 *   2. under rcu_read_lock(), find the fib6 node for the flow's
 *      daddr/saddr and pick a route with rt6_select() (plus
 *      rt6_multipath_select() when the route has siblings), backtracking
 *      through the tree and finally retrying without
 *      RT6_LOOKUP_F_REACHABLE when only the null entry was found;
 *   3. prefer a matching entry from the route's exception table;
 *   4. return, depending on the result: the null entry, the RTF_CACHE
 *      entry, a freshly allocated uncached clone (FLOWI_FLAG_KNOWN_NH on a
 *      non-gateway route), or a per-cpu copy.
 *
 * NOTE(review): every return path appears to hand back a dst the caller
 * must release; confirm against ip6_hold_safe()/rt6_get_pcpu_route().
 */
1646 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1647                                int oif, struct flowi6 *fl6, int flags)
1648 {
1649         struct fib6_node *fn, *saved_fn;
1650         struct rt6_info *rt, *rt_cache;
1651         int strict = 0;
1652
1653         strict |= flags & RT6_LOOKUP_F_IFACE;
1654         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* With forwarding disabled, first ask for reachable routers only. */
1655         if (net->ipv6.devconf_all->forwarding == 0)
1656                 strict |= RT6_LOOKUP_F_REACHABLE;
1657
1658         rcu_read_lock();
1659
1660         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
/* Remembered so the lookup can restart from here after backtracking. */
1661         saved_fn = fn;
1662
/* FLOWI_FLAG_SKIP_NH_OIF: select without an interface constraint. */
1663         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1664                 oif = 0;
1665
1666 redo_rt6_select:
1667         rt = rt6_select(net, fn, oif, strict);
1668         if (rt->rt6i_nsiblings)
1669                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1670         if (rt == net->ipv6.ip6_null_entry) {
1671                 fn = fib6_backtrack(fn, &fl6->saddr);
1672                 if (fn)
1673                         goto redo_rt6_select;
1674                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1675                         /* also consider unreachable route */
1676                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1677                         fn = saved_fn;
1678                         goto redo_rt6_select;
1679                 }
1680         }
1681
1682         /* Search through the exception table of the selected route */
1683         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1684         if (rt_cache)
1685                 rt = rt_cache;
1686
1687         if (rt == net->ipv6.ip6_null_entry) {
1688                 rcu_read_unlock();
1689                 dst_hold(&rt->dst);
1690                 trace_fib6_table_lookup(net, rt, table, fl6);
1691                 return rt;
1692         } else if (rt->rt6i_flags & RTF_CACHE) {
/* The usage stamp and metrics are refreshed only if the hold succeeded. */
1693                 if (ip6_hold_safe(net, &rt, true)) {
1694                         dst_use_noref(&rt->dst, jiffies);
1695                         rt6_dst_from_metrics_check(rt);
1696                 }
1697                 rcu_read_unlock();
1698                 trace_fib6_table_lookup(net, rt, table, fl6);
1699                 return rt;
1700         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1701                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1702                 /* Create a RTF_CACHE clone which will not be
1703                  * owned by the fib6 tree.  It is for the special case where
1704                  * the daddr in the skb during the neighbor look-up is different
1705                  * from the fl6->daddr used to look-up route here.
1706                  */
1707
1708                 struct rt6_info *uncached_rt;
1709
1710                 if (ip6_hold_safe(net, &rt, true)) {
1711                         dst_use_noref(&rt->dst, jiffies);
1712                 } else {
/* Hold failed: return whatever ip6_hold_safe() left in rt. */
1713                         rcu_read_unlock();
1714                         uncached_rt = rt;
1715                         goto uncached_rt_out;
1716                 }
1717                 rcu_read_unlock();
1718
/* The reference taken above keeps rt usable outside the RCU section. */
1719                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1720                 dst_release(&rt->dst);
1721
1722                 if (uncached_rt) {
1723                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1724                          * No need for another dst_hold()
1725                          */
1726                         rt6_uncached_list_add(uncached_rt);
1727                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1728                 } else {
/* Allocation failed: fall back to the (held) null entry. */
1729                         uncached_rt = net->ipv6.ip6_null_entry;
1730                         dst_hold(&uncached_rt->dst);
1731                 }
1732
1733 uncached_rt_out:
1734                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1735                 return uncached_rt;
1736
1737         } else {
1738                 /* Get a percpu copy */
1739
1740                 struct rt6_info *pcpu_rt;
1741
1742                 dst_use_noref(&rt->dst, jiffies);
/* BHs stay disabled while the per-cpu copy is read and, if missing, created. */
1743                 local_bh_disable();
1744                 pcpu_rt = rt6_get_pcpu_route(rt);
1745
1746                 if (!pcpu_rt) {
1747                         /* atomic_inc_not_zero() is needed when using rcu */
1748                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1749                                 /* No dst_hold() on rt is needed because grabbing
1750                                  * rt->rt6i_ref makes sure rt can't be released.
1751                                  */
1752                                 pcpu_rt = rt6_make_pcpu_route(rt);
1753                                 rt6_release(rt);
1754                         } else {
1755                                 /* rt is already removed from tree */
1756                                 pcpu_rt = net->ipv6.ip6_null_entry;
1757                                 dst_hold(&pcpu_rt->dst);
1758                         }
1759                 }
1760                 local_bh_enable();
1761                 rcu_read_unlock();
1762                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1763                 return pcpu_rt;
1764         }
1765 }
1766 EXPORT_SYMBOL_GPL(ip6_pol_route);
1767
1768 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1769                                             struct flowi6 *fl6, int flags)
1770 {
1771         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1772 }
1773
1774 struct dst_entry *ip6_route_input_lookup(struct net *net,
1775                                          struct net_device *dev,
1776                                          struct flowi6 *fl6, int flags)
1777 {
1778         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1779                 flags |= RT6_LOOKUP_F_IFACE;
1780
1781         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1782 }
1783 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1784
1785 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1786                                   struct flow_keys *keys)
1787 {
1788         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1789         const struct ipv6hdr *key_iph = outer_iph;
1790         const struct ipv6hdr *inner_iph;
1791         const struct icmp6hdr *icmph;
1792         struct ipv6hdr _inner_iph;
1793
1794         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1795                 goto out;
1796
1797         icmph = icmp6_hdr(skb);
1798         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1799             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1800             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1801             icmph->icmp6_type != ICMPV6_PARAMPROB)
1802                 goto out;
1803
1804         inner_iph = skb_header_pointer(skb,
1805                                        skb_transport_offset(skb) + sizeof(*icmph),
1806                                        sizeof(_inner_iph), &_inner_iph);
1807         if (!inner_iph)
1808                 goto out;
1809
1810         key_iph = inner_iph;
1811 out:
1812         memset(keys, 0, sizeof(*keys));
1813         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1814         keys->addrs.v6addrs.src = key_iph->saddr;
1815         keys->addrs.v6addrs.dst = key_iph->daddr;
1816         keys->tags.flow_label = ip6_flowinfo(key_iph);
1817         keys->basic.ip_proto = key_iph->nexthdr;
1818 }
1819
1820 /* if skb is set it will be used and fl6 can be NULL */
1821 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1822 {
1823         struct flow_keys hash_keys;
1824
1825         if (skb) {
1826                 ip6_multipath_l3_keys(skb, &hash_keys);
1827                 return flow_hash_from_keys(&hash_keys);
1828         }
1829
1830         return get_hash_from_flowi6(fl6);
1831 }
1832
1833 void ip6_route_input(struct sk_buff *skb)
1834 {
1835         const struct ipv6hdr *iph = ipv6_hdr(skb);
1836         struct net *net = dev_net(skb->dev);
1837         int flags = RT6_LOOKUP_F_HAS_SADDR;
1838         struct ip_tunnel_info *tun_info;
1839         struct flowi6 fl6 = {
1840                 .flowi6_iif = skb->dev->ifindex,
1841                 .daddr = iph->daddr,
1842                 .saddr = iph->saddr,
1843                 .flowlabel = ip6_flowinfo(iph),
1844                 .flowi6_mark = skb->mark,
1845                 .flowi6_proto = iph->nexthdr,
1846         };
1847
1848         tun_info = skb_tunnel_info(skb);
1849         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1850                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1851         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1852                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1853         skb_dst_drop(skb);
1854         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1855 }
1856
1857 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1858                                              struct flowi6 *fl6, int flags)
1859 {
1860         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1861 }
1862
1863 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1864                                          struct flowi6 *fl6, int flags)
1865 {
1866         bool any_src;
1867
1868         if (rt6_need_strict(&fl6->daddr)) {
1869                 struct dst_entry *dst;
1870
1871                 dst = l3mdev_link_scope_lookup(net, fl6);
1872                 if (dst)
1873                         return dst;
1874         }
1875
1876         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1877
1878         any_src = ipv6_addr_any(&fl6->saddr);
1879         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1880             (fl6->flowi6_oif && any_src))
1881                 flags |= RT6_LOOKUP_F_IFACE;
1882
1883         if (!any_src)
1884                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1885         else if (sk)
1886                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1887
1888         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1889 }
1890 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1891
1892 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1893 {
1894         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1895         struct net_device *loopback_dev = net->loopback_dev;
1896         struct dst_entry *new = NULL;
1897
1898         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1899                        DST_OBSOLETE_DEAD, 0);
1900         if (rt) {
1901                 rt6_info_init(rt);
1902                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1903
1904                 new = &rt->dst;
1905                 new->__use = 1;
1906                 new->input = dst_discard;
1907                 new->output = dst_discard_out;
1908
1909                 dst_copy_metrics(new, &ort->dst);
1910
1911                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1912                 rt->rt6i_gateway = ort->rt6i_gateway;
1913                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1914                 rt->rt6i_metric = 0;
1915
1916                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1917 #ifdef CONFIG_IPV6_SUBTREES
1918                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1919 #endif
1920         }
1921
1922         dst_release(dst_orig);
1923         return new ? new : ERR_PTR(-ENOMEM);
1924 }
1925
1926 /*
1927  *      Destination cache support functions
1928  */
1929
1930 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1931 {
1932         if (rt->dst.from &&
1933             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1934                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1935 }
1936
1937 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1938 {
1939         u32 rt_cookie = 0;
1940
1941         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1942                 return NULL;
1943
1944         if (rt6_check_expired(rt))
1945                 return NULL;
1946
1947         return &rt->dst;
1948 }
1949
1950 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1951 {
1952         if (!__rt6_check_expired(rt) &&
1953             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1954             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1955                 return &rt->dst;
1956         else
1957                 return NULL;
1958 }
1959
1960 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1961 {
1962         struct rt6_info *rt;
1963
1964         rt = (struct rt6_info *) dst;
1965
1966         /* All IPV6 dsts are created with ->obsolete set to the value
1967          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1968          * into this function always.
1969          */
1970
1971         rt6_dst_from_metrics_check(rt);
1972
1973         if (rt->rt6i_flags & RTF_PCPU ||
1974             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1975                 return rt6_dst_from_check(rt, cookie);
1976         else
1977                 return rt6_check(rt, cookie);
1978 }
1979
1980 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1981 {
1982         struct rt6_info *rt = (struct rt6_info *) dst;
1983
1984         if (rt) {
1985                 if (rt->rt6i_flags & RTF_CACHE) {
1986                         if (rt6_check_expired(rt)) {
1987                                 ip6_del_rt(rt);
1988                                 dst = NULL;
1989                         }
1990                 } else {
1991                         dst_release(dst);
1992                         dst = NULL;
1993                 }
1994         }
1995         return dst;
1996 }
1997
1998 static void ip6_link_failure(struct sk_buff *skb)
1999 {
2000         struct rt6_info *rt;
2001
2002         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2003
2004         rt = (struct rt6_info *) skb_dst(skb);
2005         if (rt) {
2006                 if (rt->rt6i_flags & RTF_CACHE) {
2007                         if (dst_hold_safe(&rt->dst))
2008                                 ip6_del_rt(rt);
2009                 } else {
2010                         struct fib6_node *fn;
2011
2012                         rcu_read_lock();
2013                         fn = rcu_dereference(rt->rt6i_node);
2014                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2015                                 fn->fn_sernum = -1;
2016                         rcu_read_unlock();
2017                 }
2018         }
2019 }
2020
2021 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2022 {
2023         struct net *net = dev_net(rt->dst.dev);
2024
2025         rt->rt6i_flags |= RTF_MODIFIED;
2026         rt->rt6i_pmtu = mtu;
2027         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2028 }
2029
2030 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2031 {
2032         return !(rt->rt6i_flags & RTF_CACHE) &&
2033                 (rt->rt6i_flags & RTF_PCPU ||
2034                  rcu_access_pointer(rt->rt6i_node));
2035 }
2036
2037 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2038                                  const struct ipv6hdr *iph, u32 mtu)
2039 {
2040         const struct in6_addr *daddr, *saddr;
2041         struct rt6_info *rt6 = (struct rt6_info *)dst;
2042
2043         if (rt6->rt6i_flags & RTF_LOCAL)
2044                 return;
2045
2046         if (dst_metric_locked(dst, RTAX_MTU))
2047                 return;
2048
2049         if (iph) {
2050                 daddr = &iph->daddr;
2051                 saddr = &iph->saddr;
2052         } else if (sk) {
2053                 daddr = &sk->sk_v6_daddr;
2054                 saddr = &inet6_sk(sk)->saddr;
2055         } else {
2056                 daddr = NULL;
2057                 saddr = NULL;
2058         }
2059         dst_confirm_neigh(dst, daddr);
2060         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2061         if (mtu >= dst_mtu(dst))
2062                 return;
2063
2064         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2065                 rt6_do_update_pmtu(rt6, mtu);
2066                 /* update rt6_ex->stamp for cache */
2067                 if (rt6->rt6i_flags & RTF_CACHE)
2068                         rt6_update_exception_stamp_rt(rt6);
2069         } else if (daddr) {
2070                 struct rt6_info *nrt6;
2071
2072                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2073                 if (nrt6) {
2074                         rt6_do_update_pmtu(nrt6, mtu);
2075                         if (rt6_insert_exception(nrt6, rt6))
2076                                 dst_release_immediate(&nrt6->dst);
2077                 }
2078         }
2079 }
2080
2081 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2082                                struct sk_buff *skb, u32 mtu)
2083 {
2084         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2085 }
2086
2087 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2088                      int oif, u32 mark, kuid_t uid)
2089 {
2090         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2091         struct dst_entry *dst;
2092         struct flowi6 fl6;
2093
2094         memset(&fl6, 0, sizeof(fl6));
2095         fl6.flowi6_oif = oif;
2096         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2097         fl6.daddr = iph->daddr;
2098         fl6.saddr = iph->saddr;
2099         fl6.flowlabel = ip6_flowinfo(iph);
2100         fl6.flowi6_uid = uid;
2101
2102         dst = ip6_route_output(net, NULL, &fl6);
2103         if (!dst->error)
2104                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2105         dst_release(dst);
2106 }
2107 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2108
2109 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2110 {
2111         struct dst_entry *dst;
2112
2113         ip6_update_pmtu(skb, sock_net(sk), mtu,
2114                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2115
2116         dst = __sk_dst_get(sk);
2117         if (!dst || !dst->obsolete ||
2118             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2119                 return;
2120
2121         bh_lock_sock(sk);
2122         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2123                 ip6_datagram_dst_update(sk, false);
2124         bh_unlock_sock(sk);
2125 }
2126 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2127
2128 /* Handle redirects */
/* Lookup key used while handling redirects: a flowi6 extended with the
 * address of the gateway the redirect came from.
 *
 * __ip6_route_redirect() receives a struct flowi6 pointer and casts it
 * back to this type, so fl6 must stay the first member.
 */
2129 struct ip6rd_flowi {
2130         struct flowi6 fl6;
2131         struct in6_addr gateway;
2132 };
2133
2134 static struct rt6_info *__ip6_route_redirect(struct net *net,
2135                                              struct fib6_table *table,
2136                                              struct flowi6 *fl6,
2137                                              int flags)
2138 {
2139         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2140         struct rt6_info *rt, *rt_cache;
2141         struct fib6_node *fn;
2142
2143         /* Get the "current" route for this destination and
2144          * check if the redirect has come from appropriate router.
2145          *
2146          * RFC 4861 specifies that redirects should only be
2147          * accepted if they come from the nexthop to the target.
2148          * Due to the way the routes are chosen, this notion
2149          * is a bit fuzzy and one might need to check all possible
2150          * routes.
2151          */
2152
2153         rcu_read_lock();
2154         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2155 restart:
2156         for_each_fib6_node_rt_rcu(fn) {
2157                 if (rt6_check_expired(rt))
2158                         continue;
2159                 if (rt->dst.error)
2160                         break;
2161                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2162                         continue;
2163                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2164                         continue;
2165                 /* rt_cache's gateway might be different from its 'parent'
2166                  * in the case of an ip redirect.
2167                  * So we keep searching in the exception table if the gateway
2168                  * is different.
2169                  */
2170                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2171                         rt_cache = rt6_find_cached_rt(rt,
2172                                                       &fl6->daddr,
2173                                                       &fl6->saddr);
2174                         if (rt_cache &&
2175                             ipv6_addr_equal(&rdfl->gateway,
2176                                             &rt_cache->rt6i_gateway)) {
2177                                 rt = rt_cache;
2178                                 break;
2179                         }
2180                         continue;
2181                 }
2182                 break;
2183         }
2184
2185         if (!rt)
2186                 rt = net->ipv6.ip6_null_entry;
2187         else if (rt->dst.error) {
2188                 rt = net->ipv6.ip6_null_entry;
2189                 goto out;
2190         }
2191
2192         if (rt == net->ipv6.ip6_null_entry) {
2193                 fn = fib6_backtrack(fn, &fl6->saddr);
2194                 if (fn)
2195                         goto restart;
2196         }
2197
2198 out:
2199         ip6_hold_safe(net, &rt, true);
2200
2201         rcu_read_unlock();
2202
2203         trace_fib6_table_lookup(net, rt, table, fl6);
2204         return rt;
2205 };
2206
2207 static struct dst_entry *ip6_route_redirect(struct net *net,
2208                                         const struct flowi6 *fl6,
2209                                         const struct in6_addr *gateway)
2210 {
2211         int flags = RT6_LOOKUP_F_HAS_SADDR;
2212         struct ip6rd_flowi rdfl;
2213
2214         rdfl.fl6 = *fl6;
2215         rdfl.gateway = *gateway;
2216
2217         return fib6_rule_lookup(net, &rdfl.fl6,
2218                                 flags, __ip6_route_redirect);
2219 }
2220
2221 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2222                   kuid_t uid)
2223 {
2224         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2225         struct dst_entry *dst;
2226         struct flowi6 fl6;
2227
2228         memset(&fl6, 0, sizeof(fl6));
2229         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2230         fl6.flowi6_oif = oif;
2231         fl6.flowi6_mark = mark;
2232         fl6.daddr = iph->daddr;
2233         fl6.saddr = iph->saddr;
2234         fl6.flowlabel = ip6_flowinfo(iph);
2235         fl6.flowi6_uid = uid;
2236
2237         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2238         rt6_do_redirect(dst, NULL, skb);
2239         dst_release(dst);
2240 }
2241 EXPORT_SYMBOL_GPL(ip6_redirect);
2242
2243 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2244                             u32 mark)
2245 {
2246         const struct ipv6hdr *iph = ipv6_hdr(skb);
2247         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2248         struct dst_entry *dst;
2249         struct flowi6 fl6;
2250
2251         memset(&fl6, 0, sizeof(fl6));
2252         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2253         fl6.flowi6_oif = oif;
2254         fl6.flowi6_mark = mark;
2255         fl6.daddr = msg->dest;
2256         fl6.saddr = iph->daddr;
2257         fl6.flowi6_uid = sock_net_uid(net, NULL);
2258
2259         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2260         rt6_do_redirect(dst, NULL, skb);
2261         dst_release(dst);
2262 }
2263
2264 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2265 {
2266         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2267                      sk->sk_uid);
2268 }
2269 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2270
2271 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2272 {
2273         struct net_device *dev = dst->dev;
2274         unsigned int mtu = dst_mtu(dst);
2275         struct net *net = dev_net(dev);
2276
2277         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2278
2279         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2280                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2281
2282         /*
2283          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2284          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2285          * IPV6_MAXPLEN is also valid and means: "any MSS,
2286          * rely only on pmtu discovery"
2287          */
2288         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2289                 mtu = IPV6_MAXPLEN;
2290         return mtu;
2291 }
2292
2293 static unsigned int ip6_mtu(const struct dst_entry *dst)
2294 {
2295         const struct rt6_info *rt = (const struct rt6_info *)dst;
2296         unsigned int mtu = rt->rt6i_pmtu;
2297         struct inet6_dev *idev;
2298
2299         if (mtu)
2300                 goto out;
2301
2302         mtu = dst_metric_raw(dst, RTAX_MTU);
2303         if (mtu)
2304                 goto out;
2305
2306         mtu = IPV6_MIN_MTU;
2307
2308         rcu_read_lock();
2309         idev = __in6_dev_get(dst->dev);
2310         if (idev)
2311                 mtu = idev->cnf.mtu6;
2312         rcu_read_unlock();
2313
2314 out:
2315         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2316
2317         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2318 }
2319
2320 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2321                                   struct flowi6 *fl6)
2322 {
2323         struct dst_entry *dst;
2324         struct rt6_info *rt;
2325         struct inet6_dev *idev = in6_dev_get(dev);
2326         struct net *net = dev_net(dev);
2327
2328         if (unlikely(!idev))
2329                 return ERR_PTR(-ENODEV);
2330
2331         rt = ip6_dst_alloc(net, dev, 0);
2332         if (unlikely(!rt)) {
2333                 in6_dev_put(idev);
2334                 dst = ERR_PTR(-ENOMEM);
2335                 goto out;
2336         }
2337
2338         rt->dst.flags |= DST_HOST;
2339         rt->dst.input = ip6_input;
2340         rt->dst.output  = ip6_output;
2341         rt->rt6i_gateway  = fl6->daddr;
2342         rt->rt6i_dst.addr = fl6->daddr;
2343         rt->rt6i_dst.plen = 128;
2344         rt->rt6i_idev     = idev;
2345         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2346
2347         /* Add this dst into uncached_list so that rt6_ifdown() can
2348          * do proper release of the net_device
2349          */
2350         rt6_uncached_list_add(rt);
2351         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2352
2353         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2354
2355 out:
2356         return dst;
2357 }
2358
2359 static int ip6_dst_gc(struct dst_ops *ops)
2360 {
2361         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2362         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2363         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2364         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2365         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2366         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2367         int entries;
2368
2369         entries = dst_entries_get_fast(ops);
2370         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2371             entries <= rt_max_size)
2372                 goto out;
2373
2374         net->ipv6.ip6_rt_gc_expire++;
2375         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2376         entries = dst_entries_get_slow(ops);
2377         if (entries < ops->gc_thresh)
2378                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2379 out:
2380         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2381         return entries > rt_max_size;
2382 }
2383
2384 static int ip6_convert_metrics(struct mx6_config *mxc,
2385                                const struct fib6_config *cfg)
2386 {
2387         struct net *net = cfg->fc_nlinfo.nl_net;
2388         bool ecn_ca = false;
2389         struct nlattr *nla;
2390         int remaining;
2391         u32 *mp;
2392
2393         if (!cfg->fc_mx)
2394                 return 0;
2395
2396         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2397         if (unlikely(!mp))
2398                 return -ENOMEM;
2399
2400         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2401                 int type = nla_type(nla);
2402                 u32 val;
2403
2404                 if (!type)
2405                         continue;
2406                 if (unlikely(type > RTAX_MAX))
2407                         goto err;
2408
2409                 if (type == RTAX_CC_ALGO) {
2410                         char tmp[TCP_CA_NAME_MAX];
2411
2412                         nla_strlcpy(tmp, nla, sizeof(tmp));
2413                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2414                         if (val == TCP_CA_UNSPEC)
2415                                 goto err;
2416                 } else {
2417                         val = nla_get_u32(nla);
2418                 }
2419                 if (type == RTAX_HOPLIMIT && val > 255)
2420                         val = 255;
2421                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2422                         goto err;
2423
2424                 mp[type - 1] = val;
2425                 __set_bit(type - 1, mxc->mx_valid);
2426         }
2427
2428         if (ecn_ca) {
2429                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2430                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2431         }
2432
2433         mxc->mx = mp;
2434         return 0;
2435  err:
2436         kfree(mp);
2437         return -EINVAL;
2438 }
2439
2440 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2441                                             struct fib6_config *cfg,
2442                                             const struct in6_addr *gw_addr)
2443 {
2444         struct flowi6 fl6 = {
2445                 .flowi6_oif = cfg->fc_ifindex,
2446                 .daddr = *gw_addr,
2447                 .saddr = cfg->fc_prefsrc,
2448         };
2449         struct fib6_table *table;
2450         struct rt6_info *rt;
2451         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2452
2453         table = fib6_get_table(net, cfg->fc_table);
2454         if (!table)
2455                 return NULL;
2456
2457         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2458                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2459
2460         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2461
2462         /* if table lookup failed, fall back to full lookup */
2463         if (rt == net->ipv6.ip6_null_entry) {
2464                 ip6_rt_put(rt);
2465                 rt = NULL;
2466         }
2467
2468         return rt;
2469 }
2470
2471 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2472                                               struct netlink_ext_ack *extack)
2473 {
2474         struct net *net = cfg->fc_nlinfo.nl_net;
2475         struct rt6_info *rt = NULL;
2476         struct net_device *dev = NULL;
2477         struct inet6_dev *idev = NULL;
2478         struct fib6_table *table;
2479         int addr_type;
2480         int err = -EINVAL;
2481
2482         /* RTF_PCPU is an internal flag; can not be set by userspace */
2483         if (cfg->fc_flags & RTF_PCPU) {
2484                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2485                 goto out;
2486         }
2487
2488         /* RTF_CACHE is an internal flag; can not be set by userspace */
2489         if (cfg->fc_flags & RTF_CACHE) {
2490                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2491                 goto out;
2492         }
2493
2494         if (cfg->fc_dst_len > 128) {
2495                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2496                 goto out;
2497         }
2498         if (cfg->fc_src_len > 128) {
2499                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2500                 goto out;
2501         }
2502 #ifndef CONFIG_IPV6_SUBTREES
2503         if (cfg->fc_src_len) {
2504                 NL_SET_ERR_MSG(extack,
2505                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2506                 goto out;
2507         }
2508 #endif
2509         if (cfg->fc_ifindex) {
2510                 err = -ENODEV;
2511                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2512                 if (!dev)
2513                         goto out;
2514                 idev = in6_dev_get(dev);
2515                 if (!idev)
2516                         goto out;
2517         }
2518
2519         if (cfg->fc_metric == 0)
2520                 cfg->fc_metric = IP6_RT_PRIO_USER;
2521
2522         err = -ENOBUFS;
2523         if (cfg->fc_nlinfo.nlh &&
2524             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2525                 table = fib6_get_table(net, cfg->fc_table);
2526                 if (!table) {
2527                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2528                         table = fib6_new_table(net, cfg->fc_table);
2529                 }
2530         } else {
2531                 table = fib6_new_table(net, cfg->fc_table);
2532         }
2533
2534         if (!table)
2535                 goto out;
2536
2537         rt = ip6_dst_alloc(net, NULL,
2538                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2539
2540         if (!rt) {
2541                 err = -ENOMEM;
2542                 goto out;
2543         }
2544
2545         if (cfg->fc_flags & RTF_EXPIRES)
2546                 rt6_set_expires(rt, jiffies +
2547                                 clock_t_to_jiffies(cfg->fc_expires));
2548         else
2549                 rt6_clean_expires(rt);
2550
2551         if (cfg->fc_protocol == RTPROT_UNSPEC)
2552                 cfg->fc_protocol = RTPROT_BOOT;
2553         rt->rt6i_protocol = cfg->fc_protocol;
2554
2555         addr_type = ipv6_addr_type(&cfg->fc_dst);
2556
2557         if (addr_type & IPV6_ADDR_MULTICAST)
2558                 rt->dst.input = ip6_mc_input;
2559         else if (cfg->fc_flags & RTF_LOCAL)
2560                 rt->dst.input = ip6_input;
2561         else
2562                 rt->dst.input = ip6_forward;
2563
2564         rt->dst.output = ip6_output;
2565
2566         if (cfg->fc_encap) {
2567                 struct lwtunnel_state *lwtstate;
2568
2569                 err = lwtunnel_build_state(cfg->fc_encap_type,
2570                                            cfg->fc_encap, AF_INET6, cfg,
2571                                            &lwtstate, extack);
2572                 if (err)
2573                         goto out;
2574                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2575                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2576                         rt->dst.lwtstate->orig_output = rt->dst.output;
2577                         rt->dst.output = lwtunnel_output;
2578                 }
2579                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2580                         rt->dst.lwtstate->orig_input = rt->dst.input;
2581                         rt->dst.input = lwtunnel_input;
2582                 }
2583         }
2584
2585         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2586         rt->rt6i_dst.plen = cfg->fc_dst_len;
2587         if (rt->rt6i_dst.plen == 128)
2588                 rt->dst.flags |= DST_HOST;
2589
2590 #ifdef CONFIG_IPV6_SUBTREES
2591         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2592         rt->rt6i_src.plen = cfg->fc_src_len;
2593 #endif
2594
2595         rt->rt6i_metric = cfg->fc_metric;
2596
2597         /* We cannot add true routes via loopback here,
2598            they would result in kernel looping; promote them to reject routes
2599          */
2600         if ((cfg->fc_flags & RTF_REJECT) ||
2601             (dev && (dev->flags & IFF_LOOPBACK) &&
2602              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2603              !(cfg->fc_flags & RTF_LOCAL))) {
2604                 /* hold loopback dev/idev if we haven't done so. */
2605                 if (dev != net->loopback_dev) {
2606                         if (dev) {
2607                                 dev_put(dev);
2608                                 in6_dev_put(idev);
2609                         }
2610                         dev = net->loopback_dev;
2611                         dev_hold(dev);
2612                         idev = in6_dev_get(dev);
2613                         if (!idev) {
2614                                 err = -ENODEV;
2615                                 goto out;
2616                         }
2617                 }
2618                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2619                 switch (cfg->fc_type) {
2620                 case RTN_BLACKHOLE:
2621                         rt->dst.error = -EINVAL;
2622                         rt->dst.output = dst_discard_out;
2623                         rt->dst.input = dst_discard;
2624                         break;
2625                 case RTN_PROHIBIT:
2626                         rt->dst.error = -EACCES;
2627                         rt->dst.output = ip6_pkt_prohibit_out;
2628                         rt->dst.input = ip6_pkt_prohibit;
2629                         break;
2630                 case RTN_THROW:
2631                 case RTN_UNREACHABLE:
2632                 default:
2633                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2634                                         : (cfg->fc_type == RTN_UNREACHABLE)
2635                                         ? -EHOSTUNREACH : -ENETUNREACH;
2636                         rt->dst.output = ip6_pkt_discard_out;
2637                         rt->dst.input = ip6_pkt_discard;
2638                         break;
2639                 }
2640                 goto install_route;
2641         }
2642
2643         if (cfg->fc_flags & RTF_GATEWAY) {
2644                 const struct in6_addr *gw_addr;
2645                 int gwa_type;
2646
2647                 gw_addr = &cfg->fc_gateway;
2648                 gwa_type = ipv6_addr_type(gw_addr);
2649
2650                 /* if gw_addr is local we will fail to detect this in case
2651                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2652                  * will return already-added prefix route via interface that
2653                  * prefix route was assigned to, which might be non-loopback.
2654                  */
2655                 err = -EINVAL;
2656                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2657                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2658                                             dev : NULL, 0, 0)) {
2659                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2660                         goto out;
2661                 }
2662                 rt->rt6i_gateway = *gw_addr;
2663
2664                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2665                         struct rt6_info *grt = NULL;
2666
2667                         /* IPv6 strictly inhibits using not link-local
2668                            addresses as nexthop address.
2669                            Otherwise, router will not able to send redirects.
2670                            It is very good, but in some (rare!) circumstances
2671                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2672                            some exceptions. --ANK
2673                            We allow IPv4-mapped nexthops to support RFC4798-type
2674                            addressing
2675                          */
2676                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2677                                           IPV6_ADDR_MAPPED))) {
2678                                 NL_SET_ERR_MSG(extack,
2679                                                "Invalid gateway address");
2680                                 goto out;
2681                         }
2682
2683                         if (cfg->fc_table) {
2684                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2685
2686                                 if (grt) {
2687                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2688                                             (dev && dev != grt->dst.dev)) {
2689                                                 ip6_rt_put(grt);
2690                                                 grt = NULL;
2691                                         }
2692                                 }
2693                         }
2694
2695                         if (!grt)
2696                                 grt = rt6_lookup(net, gw_addr, NULL,
2697                                                  cfg->fc_ifindex, 1);
2698
2699                         err = -EHOSTUNREACH;
2700                         if (!grt)
2701                                 goto out;
2702                         if (dev) {
2703                                 if (dev != grt->dst.dev) {
2704                                         ip6_rt_put(grt);
2705                                         goto out;
2706                                 }
2707                         } else {
2708                                 dev = grt->dst.dev;
2709                                 idev = grt->rt6i_idev;
2710                                 dev_hold(dev);
2711                                 in6_dev_hold(grt->rt6i_idev);
2712                         }
2713                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2714                                 err = 0;
2715                         ip6_rt_put(grt);
2716
2717                         if (err)
2718                                 goto out;
2719                 }
2720                 err = -EINVAL;
2721                 if (!dev) {
2722                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2723                         goto out;
2724                 } else if (dev->flags & IFF_LOOPBACK) {
2725                         NL_SET_ERR_MSG(extack,
2726                                        "Egress device can not be loopback device for this route");
2727                         goto out;
2728                 }
2729         }
2730
2731         err = -ENODEV;
2732         if (!dev)
2733                 goto out;
2734
2735         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2736                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2737                         NL_SET_ERR_MSG(extack, "Invalid source address");
2738                         err = -EINVAL;
2739                         goto out;
2740                 }
2741                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2742                 rt->rt6i_prefsrc.plen = 128;
2743         } else
2744                 rt->rt6i_prefsrc.plen = 0;
2745
2746         rt->rt6i_flags = cfg->fc_flags;
2747
2748 install_route:
2749         rt->dst.dev = dev;
2750         rt->rt6i_idev = idev;
2751         rt->rt6i_table = table;
2752
2753         cfg->fc_nlinfo.nl_net = dev_net(dev);
2754
2755         return rt;
2756 out:
2757         if (dev)
2758                 dev_put(dev);
2759         if (idev)
2760                 in6_dev_put(idev);
2761         if (rt)
2762                 dst_release_immediate(&rt->dst);
2763
2764         return ERR_PTR(err);
2765 }
2766
2767 int ip6_route_add(struct fib6_config *cfg,
2768                   struct netlink_ext_ack *extack)
2769 {
2770         struct mx6_config mxc = { .mx = NULL, };
2771         struct rt6_info *rt;
2772         int err;
2773
2774         rt = ip6_route_info_create(cfg, extack);
2775         if (IS_ERR(rt)) {
2776                 err = PTR_ERR(rt);
2777                 rt = NULL;
2778                 goto out;
2779         }
2780
2781         err = ip6_convert_metrics(&mxc, cfg);
2782         if (err)
2783                 goto out;
2784
2785         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2786
2787         kfree(mxc.mx);
2788
2789         return err;
2790 out:
2791         if (rt)
2792                 dst_release_immediate(&rt->dst);
2793
2794         return err;
2795 }
2796
2797 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2798 {
2799         int err;
2800         struct fib6_table *table;
2801         struct net *net = dev_net(rt->dst.dev);
2802
2803         if (rt == net->ipv6.ip6_null_entry) {
2804                 err = -ENOENT;
2805                 goto out;
2806         }
2807
2808         table = rt->rt6i_table;
2809         spin_lock_bh(&table->tb6_lock);
2810         err = fib6_del(rt, info);
2811         spin_unlock_bh(&table->tb6_lock);
2812
2813 out:
2814         ip6_rt_put(rt);
2815         return err;
2816 }
2817
2818 int ip6_del_rt(struct rt6_info *rt)
2819 {
2820         struct nl_info info = {
2821                 .nl_net = dev_net(rt->dst.dev),
2822         };
2823         return __ip6_del_rt(rt, &info);
2824 }
2825
2826 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2827 {
2828         struct nl_info *info = &cfg->fc_nlinfo;
2829         struct net *net = info->nl_net;
2830         struct sk_buff *skb = NULL;
2831         struct fib6_table *table;
2832         int err = -ENOENT;
2833
2834         if (rt == net->ipv6.ip6_null_entry)
2835                 goto out_put;
2836         table = rt->rt6i_table;
2837         spin_lock_bh(&table->tb6_lock);
2838
2839         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2840                 struct rt6_info *sibling, *next_sibling;
2841
2842                 /* prefer to send a single notification with all hops */
2843                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2844                 if (skb) {
2845                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2846
2847                         if (rt6_fill_node(net, skb, rt,
2848                                           NULL, NULL, 0, RTM_DELROUTE,
2849                                           info->portid, seq, 0) < 0) {
2850                                 kfree_skb(skb);
2851                                 skb = NULL;
2852                         } else
2853                                 info->skip_notify = 1;
2854                 }
2855
2856                 list_for_each_entry_safe(sibling, next_sibling,
2857                                          &rt->rt6i_siblings,
2858                                          rt6i_siblings) {
2859                         err = fib6_del(sibling, info);
2860                         if (err)
2861                                 goto out_unlock;
2862                 }
2863         }
2864
2865         err = fib6_del(rt, info);
2866 out_unlock:
2867         spin_unlock_bh(&table->tb6_lock);
2868 out_put:
2869         ip6_rt_put(rt);
2870
2871         if (skb) {
2872                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2873                             info->nlh, gfp_any());
2874         }
2875         return err;
2876 }
2877
2878 static int ip6_route_del(struct fib6_config *cfg,
2879                          struct netlink_ext_ack *extack)
2880 {
2881         struct rt6_info *rt, *rt_cache;
2882         struct fib6_table *table;
2883         struct fib6_node *fn;
2884         int err = -ESRCH;
2885
2886         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2887         if (!table) {
2888                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2889                 return err;
2890         }
2891
2892         rcu_read_lock();
2893
2894         fn = fib6_locate(&table->tb6_root,
2895                          &cfg->fc_dst, cfg->fc_dst_len,
2896                          &cfg->fc_src, cfg->fc_src_len,
2897                          !(cfg->fc_flags & RTF_CACHE));
2898
2899         if (fn) {
2900                 for_each_fib6_node_rt_rcu(fn) {
2901                         if (cfg->fc_flags & RTF_CACHE) {
2902                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2903                                                               &cfg->fc_src);
2904                                 if (!rt_cache)
2905                                         continue;
2906                                 rt = rt_cache;
2907                         }
2908                         if (cfg->fc_ifindex &&
2909                             (!rt->dst.dev ||
2910                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2911                                 continue;
2912                         if (cfg->fc_flags & RTF_GATEWAY &&
2913                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2914                                 continue;
2915                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2916                                 continue;
2917                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2918                                 continue;
2919                         if (!dst_hold_safe(&rt->dst))
2920                                 break;
2921                         rcu_read_unlock();
2922
2923                         /* if gateway was specified only delete the one hop */
2924                         if (cfg->fc_flags & RTF_GATEWAY)
2925                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2926
2927                         return __ip6_del_rt_siblings(rt, cfg);
2928                 }
2929         }
2930         rcu_read_unlock();
2931
2932         return err;
2933 }
2934
/*
 * Handle a received redirect message carried in @skb for the route @dst.
 *
 * The message is validated (length, non-multicast destination, target
 * either equal to the destination or link-local unicast, receiving
 * interface configured to accept redirects, well-formed ND options, and
 * @dst not being a reject route).  If accepted, the neighbour entry for
 * the target is updated, a cache route for the destination is allocated
 * from @dst and inserted as an exception, and NETEVENT_REDIRECT notifiers
 * are called.  @sk is not used in this function.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Bytes left for options after the fixed redirect header. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/*
	 * target == destination means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Ignore redirects when forwarding or when accept_redirects is off. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* Optional target link-layer address; NULL when the option is absent. */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* Look up the neighbour entry for the target (last argument is 1). */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	/* The ISROUTER flags are only passed when the target is not on-link. */
	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	/* An on-link destination gets no RTF_GATEWAY flag. */
	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() takes care
	 * of it.
	 */
	if (rt6_insert_exception(nrt, rt)) {
		/* Insertion failed: release the new cache route. */
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	/* Drop the neighbour reference obtained from __neigh_lookup(). */
	neigh_release(neigh);
}
3052
3053 /*
3054  *      Misc support functions
3055  */
3056
3057 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3058 {
3059         BUG_ON(from->dst.from);
3060
3061         rt->rt6i_flags &= ~RTF_EXPIRES;
3062         dst_hold(&from->dst);
3063         rt->dst.from = &from->dst;
3064         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3065 }
3066
3067 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3068 {
3069         rt->dst.input = ort->dst.input;
3070         rt->dst.output = ort->dst.output;
3071         rt->rt6i_dst = ort->rt6i_dst;
3072         rt->dst.error = ort->dst.error;
3073         rt->rt6i_idev = ort->rt6i_idev;
3074         if (rt->rt6i_idev)
3075                 in6_dev_hold(rt->rt6i_idev);
3076         rt->dst.lastuse = jiffies;
3077         rt->rt6i_gateway = ort->rt6i_gateway;
3078         rt->rt6i_flags = ort->rt6i_flags;
3079         rt6_set_from(rt, ort);
3080         rt->rt6i_metric = ort->rt6i_metric;
3081 #ifdef CONFIG_IPV6_SUBTREES
3082         rt->rt6i_src = ort->rt6i_src;
3083 #endif
3084         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3085         rt->rt6i_table = ort->rt6i_table;
3086         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3087 }
3088
3089 #ifdef CONFIG_IPV6_ROUTE_INFO
3090 static struct rt6_info *rt6_get_route_info(struct net *net,
3091                                            const struct in6_addr *prefix, int prefixlen,
3092                                            const struct in6_addr *gwaddr,
3093                                            struct net_device *dev)
3094 {
3095         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3096         int ifindex = dev->ifindex;
3097         struct fib6_node *fn;
3098         struct rt6_info *rt = NULL;
3099         struct fib6_table *table;
3100
3101         table = fib6_get_table(net, tb_id);
3102         if (!table)
3103                 return NULL;
3104
3105         rcu_read_lock();
3106         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3107         if (!fn)
3108                 goto out;
3109
3110         for_each_fib6_node_rt_rcu(fn) {
3111                 if (rt->dst.dev->ifindex != ifindex)
3112                         continue;
3113                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3114                         continue;
3115                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3116                         continue;
3117                 ip6_hold_safe(NULL, &rt, false);
3118                 break;
3119         }
3120 out:
3121         rcu_read_unlock();
3122         return rt;
3123 }
3124
3125 static struct rt6_info *rt6_add_route_info(struct net *net,
3126                                            const struct in6_addr *prefix, int prefixlen,
3127                                            const struct in6_addr *gwaddr,
3128                                            struct net_device *dev,
3129                                            unsigned int pref)
3130 {
3131         struct fib6_config cfg = {
3132                 .fc_metric      = IP6_RT_PRIO_USER,
3133                 .fc_ifindex     = dev->ifindex,
3134                 .fc_dst_len     = prefixlen,
3135                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3136                                   RTF_UP | RTF_PREF(pref),
3137                 .fc_protocol = RTPROT_RA,
3138                 .fc_nlinfo.portid = 0,
3139                 .fc_nlinfo.nlh = NULL,
3140                 .fc_nlinfo.nl_net = net,
3141         };
3142
3143         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3144         cfg.fc_dst = *prefix;
3145         cfg.fc_gateway = *gwaddr;
3146
3147         /* We should treat it as a default route if prefix length is 0. */
3148         if (!prefixlen)
3149                 cfg.fc_flags |= RTF_DEFAULT;
3150
3151         ip6_route_add(&cfg, NULL);
3152
3153         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3154 }
3155 #endif
3156
3157 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3158 {
3159         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3160         struct rt6_info *rt;
3161         struct fib6_table *table;
3162
3163         table = fib6_get_table(dev_net(dev), tb_id);
3164         if (!table)
3165                 return NULL;
3166
3167         rcu_read_lock();
3168         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3169                 if (dev == rt->dst.dev &&
3170                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3171                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3172                         break;
3173         }
3174         if (rt)
3175                 ip6_hold_safe(NULL, &rt, false);
3176         rcu_read_unlock();
3177         return rt;
3178 }
3179
3180 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3181                                      struct net_device *dev,
3182                                      unsigned int pref)
3183 {
3184         struct fib6_config cfg = {
3185                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3186                 .fc_metric      = IP6_RT_PRIO_USER,
3187                 .fc_ifindex     = dev->ifindex,
3188                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3189                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3190                 .fc_protocol = RTPROT_RA,
3191                 .fc_nlinfo.portid = 0,
3192                 .fc_nlinfo.nlh = NULL,
3193                 .fc_nlinfo.nl_net = dev_net(dev),
3194         };
3195
3196         cfg.fc_gateway = *gwaddr;
3197
3198         if (!ip6_route_add(&cfg, NULL)) {
3199                 struct fib6_table *table;
3200
3201                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3202                 if (table)
3203                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3204         }
3205
3206         return rt6_get_dflt_router(gwaddr, dev);
3207 }
3208
3209 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3210 {
3211         struct rt6_info *rt;
3212
3213 restart:
3214         rcu_read_lock();
3215         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3216                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3217                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3218                         if (dst_hold_safe(&rt->dst)) {
3219                                 rcu_read_unlock();
3220                                 ip6_del_rt(rt);
3221                         } else {
3222                                 rcu_read_unlock();
3223                         }
3224                         goto restart;
3225                 }
3226         }
3227         rcu_read_unlock();
3228
3229         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3230 }
3231
3232 void rt6_purge_dflt_routers(struct net *net)
3233 {
3234         struct fib6_table *table;
3235         struct hlist_head *head;
3236         unsigned int h;
3237
3238         rcu_read_lock();
3239
3240         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3241                 head = &net->ipv6.fib_table_hash[h];
3242                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3243                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3244                                 __rt6_purge_dflt_routers(table);
3245                 }
3246         }
3247
3248         rcu_read_unlock();
3249 }
3250
3251 static void rtmsg_to_fib6_config(struct net *net,
3252                                  struct in6_rtmsg *rtmsg,
3253                                  struct fib6_config *cfg)
3254 {
3255         memset(cfg, 0, sizeof(*cfg));
3256
3257         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3258                          : RT6_TABLE_MAIN;
3259         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3260         cfg->fc_metric = rtmsg->rtmsg_metric;
3261         cfg->fc_expires = rtmsg->rtmsg_info;
3262         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3263         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3264         cfg->fc_flags = rtmsg->rtmsg_flags;
3265
3266         cfg->fc_nlinfo.nl_net = net;
3267
3268         cfg->fc_dst = rtmsg->rtmsg_dst;
3269         cfg->fc_src = rtmsg->rtmsg_src;
3270         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3271 }
3272
3273 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3274 {
3275         struct fib6_config cfg;
3276         struct in6_rtmsg rtmsg;
3277         int err;
3278
3279         switch (cmd) {
3280         case SIOCADDRT:         /* Add a route */
3281         case SIOCDELRT:         /* Delete a route */
3282                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3283                         return -EPERM;
3284                 err = copy_from_user(&rtmsg, arg,
3285                                      sizeof(struct in6_rtmsg));
3286                 if (err)
3287                         return -EFAULT;
3288
3289                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3290
3291                 rtnl_lock();
3292                 switch (cmd) {
3293                 case SIOCADDRT:
3294                         err = ip6_route_add(&cfg, NULL);
3295                         break;
3296                 case SIOCDELRT:
3297                         err = ip6_route_del(&cfg, NULL);
3298                         break;
3299                 default:
3300                         err = -EINVAL;
3301                 }
3302                 rtnl_unlock();
3303
3304                 return err;
3305         }
3306
3307         return -EINVAL;
3308 }
3309
3310 /*
3311  *      Drop the packet on the floor
3312  */
3313
3314 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3315 {
3316         int type;
3317         struct dst_entry *dst = skb_dst(skb);
3318         switch (ipstats_mib_noroutes) {
3319         case IPSTATS_MIB_INNOROUTES:
3320                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3321                 if (type == IPV6_ADDR_ANY) {
3322                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3323                                       IPSTATS_MIB_INADDRERRORS);
3324                         break;
3325                 }
3326                 /* FALLTHROUGH */
3327         case IPSTATS_MIB_OUTNOROUTES:
3328                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3329                               ipstats_mib_noroutes);
3330                 break;
3331         }
3332         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3333         kfree_skb(skb);
3334         return 0;
3335 }
3336
3337 static int ip6_pkt_discard(struct sk_buff *skb)
3338 {
3339         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3340 }
3341
3342 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3343 {
3344         skb->dev = skb_dst(skb)->dev;
3345         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3346 }
3347
3348 static int ip6_pkt_prohibit(struct sk_buff *skb)
3349 {
3350         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3351 }
3352
3353 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3354 {
3355         skb->dev = skb_dst(skb)->dev;
3356         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3357 }
3358
3359 /*
3360  *      Allocate a dst for local (unicast / anycast) address.
3361  */
3362
3363 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3364                                     const struct in6_addr *addr,
3365                                     bool anycast)
3366 {
3367         u32 tb_id;
3368         struct net *net = dev_net(idev->dev);
3369         struct net_device *dev = idev->dev;
3370         struct rt6_info *rt;
3371
3372         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3373         if (!rt)
3374                 return ERR_PTR(-ENOMEM);
3375
3376         in6_dev_hold(idev);
3377
3378         rt->dst.flags |= DST_HOST;
3379         rt->dst.input = ip6_input;
3380         rt->dst.output = ip6_output;
3381         rt->rt6i_idev = idev;
3382
3383         rt->rt6i_protocol = RTPROT_KERNEL;
3384         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3385         if (anycast)
3386                 rt->rt6i_flags |= RTF_ANYCAST;
3387         else
3388                 rt->rt6i_flags |= RTF_LOCAL;
3389
3390         rt->rt6i_gateway  = *addr;
3391         rt->rt6i_dst.addr = *addr;
3392         rt->rt6i_dst.plen = 128;
3393         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3394         rt->rt6i_table = fib6_get_table(net, tb_id);
3395
3396         return rt;
3397 }
3398
/*
 * Remove a deleted IP address from the prefsrc of routes.
 *
 * Argument bundle handed to the fib6_remove_prefsrc() tree walker.
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* only touch routes on this device; NULL matches any */
	struct net *net;		/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;		/* address compared against each route's prefsrc */
};
3405
3406 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3407 {
3408         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3409         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3410         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3411
3412         if (((void *)rt->dst.dev == dev || !dev) &&
3413             rt != net->ipv6.ip6_null_entry &&
3414             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3415                 spin_lock_bh(&rt6_exception_lock);
3416                 /* remove prefsrc entry */
3417                 rt->rt6i_prefsrc.plen = 0;
3418                 /* need to update cache as well */
3419                 rt6_exceptions_remove_prefsrc(rt);
3420                 spin_unlock_bh(&rt6_exception_lock);
3421         }
3422         return 0;
3423 }
3424
3425 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3426 {
3427         struct net *net = dev_net(ifp->idev->dev);
3428         struct arg_dev_net_ip adni = {
3429                 .dev = ifp->idev->dev,
3430                 .net = net,
3431                 .addr = &ifp->addr,
3432         };
3433         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3434 }
3435
3436 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3437
3438 /* Remove routers and update dst entries when gateway turn into host. */
3439 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3440 {
3441         struct in6_addr *gateway = (struct in6_addr *)arg;
3442
3443         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3444             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3445                 return -1;
3446         }
3447
3448         /* Further clean up cached routes in exception table.
3449          * This is needed because cached route may have a different
3450          * gateway than its 'parent' in the case of an ip redirect.
3451          */
3452         rt6_exceptions_clean_tohost(rt, gateway);
3453
3454         return 0;
3455 }
3456
3457 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3458 {
3459         fib6_clean_all(net, fib6_clean_tohost, gateway);
3460 }
3461
/* Argument bundle handed to the fib6_ifdown() tree walker. */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL matches any device */
	struct net *net;		/* namespace whose ip6_null_entry is skipped */
};
3466
3467 /* called with write lock held for table with rt */
3468 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3469 {
3470         const struct arg_dev_net *adn = arg;
3471         const struct net_device *dev = adn->dev;
3472
3473         if ((rt->dst.dev == dev || !dev) &&
3474             rt != adn->net->ipv6.ip6_null_entry &&
3475             (rt->rt6i_nsiblings == 0 ||
3476              (dev && netdev_unregistering(dev)) ||
3477              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3478                 return -1;
3479
3480         return 0;
3481 }
3482
3483 void rt6_ifdown(struct net *net, struct net_device *dev)
3484 {
3485         struct arg_dev_net adn = {
3486                 .dev = dev,
3487                 .net = net,
3488         };
3489
3490         fib6_clean_all(net, fib6_ifdown, &adn);
3491         if (dev)
3492                 rt6_uncached_list_flush_dev(net, dev);
3493 }
3494
/* Argument bundle handed to the rt6_mtu_change_route() tree walker. */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
3499
3500 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3501 {
3502         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3503         struct inet6_dev *idev;
3504
3505         /* In IPv6 pmtu discovery is not optional,
3506            so that RTAX_MTU lock cannot disable it.
3507            We still use this lock to block changes
3508            caused by addrconf/ndisc.
3509         */
3510
3511         idev = __in6_dev_get(arg->dev);
3512         if (!idev)
3513                 return 0;
3514
3515         /* For administrative MTU increase, there is no way to discover
3516            IPv6 PMTU increase, so PMTU increase should be updated here.
3517            Since RFC 1981 doesn't include administrative MTU increase
3518            update PMTU increase is a MUST. (i.e. jumbo frame)
3519          */
3520         /*
3521            If new MTU is less than route PMTU, this new MTU will be the
3522            lowest MTU in the path, update the route PMTU to reflect PMTU
3523            decreases; if new MTU is greater than route PMTU, and the
3524            old MTU is the lowest MTU in the path, update the route PMTU
3525            to reflect the increase. In this case if the other nodes' MTU
3526            also have the lowest MTU, TOO BIG MESSAGE will be lead to
3527            PMTU discovery.
3528          */
3529         if (rt->dst.dev == arg->dev &&
3530             dst_metric_raw(&rt->dst, RTAX_MTU) &&
3531             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3532                 spin_lock_bh(&rt6_exception_lock);
3533                 if (dst_mtu(&rt->dst) >= arg->mtu ||
3534                     (dst_mtu(&rt->dst) < arg->mtu &&
3535                      dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3536                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3537                 }
3538                 rt6_exceptions_update_pmtu(rt, arg->mtu);
3539                 spin_unlock_bh(&rt6_exception_lock);
3540         }
3541         return 0;
3542 }
3543
3544 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3545 {
3546         struct rt6_mtu_change_arg arg = {
3547                 .dev = dev,
3548                 .mtu = mtu,
3549         };
3550
3551         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3552 }
3553
3554 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3555         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3556         [RTA_OIF]               = { .type = NLA_U32 },
3557         [RTA_IIF]               = { .type = NLA_U32 },
3558         [RTA_PRIORITY]          = { .type = NLA_U32 },
3559         [RTA_METRICS]           = { .type = NLA_NESTED },
3560         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3561         [RTA_PREF]              = { .type = NLA_U8 },
3562         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3563         [RTA_ENCAP]             = { .type = NLA_NESTED },
3564         [RTA_EXPIRES]           = { .type = NLA_U32 },
3565         [RTA_UID]               = { .type = NLA_U32 },
3566         [RTA_MARK]              = { .type = NLA_U32 },
3567 };
3568
3569 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3570                               struct fib6_config *cfg,
3571                               struct netlink_ext_ack *extack)
3572 {
3573         struct rtmsg *rtm;
3574         struct nlattr *tb[RTA_MAX+1];
3575         unsigned int pref;
3576         int err;
3577
3578         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3579                           NULL);
3580         if (err < 0)
3581                 goto errout;
3582
3583         err = -EINVAL;
3584         rtm = nlmsg_data(nlh);
3585         memset(cfg, 0, sizeof(*cfg));
3586
3587         cfg->fc_table = rtm->rtm_table;
3588         cfg->fc_dst_len = rtm->rtm_dst_len;
3589         cfg->fc_src_len = rtm->rtm_src_len;
3590         cfg->fc_flags = RTF_UP;
3591         cfg->fc_protocol = rtm->rtm_protocol;
3592         cfg->fc_type = rtm->rtm_type;
3593
3594         if (rtm->rtm_type == RTN_UNREACHABLE ||
3595             rtm->rtm_type == RTN_BLACKHOLE ||
3596             rtm->rtm_type == RTN_PROHIBIT ||
3597             rtm->rtm_type == RTN_THROW)
3598                 cfg->fc_flags |= RTF_REJECT;
3599
3600         if (rtm->rtm_type == RTN_LOCAL)
3601                 cfg->fc_flags |= RTF_LOCAL;
3602
3603         if (rtm->rtm_flags & RTM_F_CLONED)
3604                 cfg->fc_flags |= RTF_CACHE;
3605
3606         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3607         cfg->fc_nlinfo.nlh = nlh;
3608         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3609
3610         if (tb[RTA_GATEWAY]) {
3611                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3612                 cfg->fc_flags |= RTF_GATEWAY;
3613         }
3614
3615         if (tb[RTA_DST]) {
3616                 int plen = (rtm->rtm_dst_len + 7) >> 3;
3617
3618                 if (nla_len(tb[RTA_DST]) < plen)
3619                         goto errout;
3620
3621                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3622         }
3623
3624         if (tb[RTA_SRC]) {
3625                 int plen = (rtm->rtm_src_len + 7) >> 3;
3626
3627                 if (nla_len(tb[RTA_SRC]) < plen)
3628                         goto errout;
3629
3630                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3631         }
3632
3633         if (tb[RTA_PREFSRC])
3634                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3635
3636         if (tb[RTA_OIF])
3637                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3638
3639         if (tb[RTA_PRIORITY])
3640                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3641
3642         if (tb[RTA_METRICS]) {
3643                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3644                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3645         }
3646
3647         if (tb[RTA_TABLE])
3648                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3649
3650         if (tb[RTA_MULTIPATH]) {
3651                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3652                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3653
3654                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3655                                                      cfg->fc_mp_len, extack);
3656                 if (err < 0)
3657                         goto errout;
3658         }
3659
3660         if (tb[RTA_PREF]) {
3661                 pref = nla_get_u8(tb[RTA_PREF]);
3662                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3663                     pref != ICMPV6_ROUTER_PREF_HIGH)
3664                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3665                 cfg->fc_flags |= RTF_PREF(pref);
3666         }
3667
3668         if (tb[RTA_ENCAP])
3669                 cfg->fc_encap = tb[RTA_ENCAP];
3670
3671         if (tb[RTA_ENCAP_TYPE]) {
3672                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3673
3674                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3675                 if (err < 0)
3676                         goto errout;
3677         }
3678
3679         if (tb[RTA_EXPIRES]) {
3680                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3681
3682                 if (addrconf_finite_timeout(timeout)) {
3683                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3684                         cfg->fc_flags |= RTF_EXPIRES;
3685                 }
3686         }
3687
3688         err = 0;
3689 errout:
3690         return err;
3691 }
3692
3693 struct rt6_nh {
3694         struct rt6_info *rt6_info;
3695         struct fib6_config r_cfg;
3696         struct mx6_config mxc;
3697         struct list_head next;
3698 };
3699
3700 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3701 {
3702         struct rt6_nh *nh;
3703
3704         list_for_each_entry(nh, rt6_nh_list, next) {
3705                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3706                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3707                         nh->r_cfg.fc_ifindex);
3708         }
3709 }
3710
3711 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3712                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3713 {
3714         struct rt6_nh *nh;
3715         int err = -EEXIST;
3716
3717         list_for_each_entry(nh, rt6_nh_list, next) {
3718                 /* check if rt6_info already exists */
3719                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3720                         return err;
3721         }
3722
3723         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3724         if (!nh)
3725                 return -ENOMEM;
3726         nh->rt6_info = rt;
3727         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3728         if (err) {
3729                 kfree(nh);
3730                 return err;
3731         }
3732         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3733         list_add_tail(&nh->next, rt6_nh_list);
3734
3735         return 0;
3736 }
3737
3738 static void ip6_route_mpath_notify(struct rt6_info *rt,
3739                                    struct rt6_info *rt_last,
3740                                    struct nl_info *info,
3741                                    __u16 nlflags)
3742 {
3743         /* if this is an APPEND route, then rt points to the first route
3744          * inserted and rt_last points to last route inserted. Userspace
3745          * wants a consistent dump of the route which starts at the first
3746          * nexthop. Since sibling routes are always added at the end of
3747          * the list, find the first sibling of the last route appended
3748          */
3749         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3750                 rt = list_first_entry(&rt_last->rt6i_siblings,
3751                                       struct rt6_info,
3752                                       rt6i_siblings);
3753         }
3754
3755         if (rt)
3756                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3757 }
3758
3759 static int ip6_route_multipath_add(struct fib6_config *cfg,
3760                                    struct netlink_ext_ack *extack)
3761 {
3762         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3763         struct nl_info *info = &cfg->fc_nlinfo;
3764         struct fib6_config r_cfg;
3765         struct rtnexthop *rtnh;
3766         struct rt6_info *rt;
3767         struct rt6_nh *err_nh;
3768         struct rt6_nh *nh, *nh_safe;
3769         __u16 nlflags;
3770         int remaining;
3771         int attrlen;
3772         int err = 1;
3773         int nhn = 0;
3774         int replace = (cfg->fc_nlinfo.nlh &&
3775                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3776         LIST_HEAD(rt6_nh_list);
3777
3778         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3779         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3780                 nlflags |= NLM_F_APPEND;
3781
3782         remaining = cfg->fc_mp_len;
3783         rtnh = (struct rtnexthop *)cfg->fc_mp;
3784
3785         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3786          * rt6_info structs per nexthop
3787          */
3788         while (rtnh_ok(rtnh, remaining)) {
3789                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3790                 if (rtnh->rtnh_ifindex)
3791                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3792
3793                 attrlen = rtnh_attrlen(rtnh);
3794                 if (attrlen > 0) {
3795                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3796
3797                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3798                         if (nla) {
3799                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3800                                 r_cfg.fc_flags |= RTF_GATEWAY;
3801                         }
3802                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3803                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3804                         if (nla)
3805                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3806                 }
3807
3808                 rt = ip6_route_info_create(&r_cfg, extack);
3809                 if (IS_ERR(rt)) {
3810                         err = PTR_ERR(rt);
3811                         rt = NULL;
3812                         goto cleanup;
3813                 }
3814
3815                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3816                 if (err) {
3817                         dst_release_immediate(&rt->dst);
3818                         goto cleanup;
3819                 }
3820
3821                 rtnh = rtnh_next(rtnh, &remaining);
3822         }
3823
3824         /* for add and replace send one notification with all nexthops.
3825          * Skip the notification in fib6_add_rt2node and send one with
3826          * the full route when done
3827          */
3828         info->skip_notify = 1;
3829
3830         err_nh = NULL;
3831         list_for_each_entry(nh, &rt6_nh_list, next) {
3832                 rt_last = nh->rt6_info;
3833                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3834                 /* save reference to first route for notification */
3835                 if (!rt_notif && !err)
3836                         rt_notif = nh->rt6_info;
3837
3838                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3839                 nh->rt6_info = NULL;
3840                 if (err) {
3841                         if (replace && nhn)
3842                                 ip6_print_replace_route_err(&rt6_nh_list);
3843                         err_nh = nh;
3844                         goto add_errout;
3845                 }
3846
3847                 /* Because each route is added like a single route we remove
3848                  * these flags after the first nexthop: if there is a collision,
3849                  * we have already failed to add the first nexthop:
3850                  * fib6_add_rt2node() has rejected it; when replacing, old
3851                  * nexthops have been replaced by first new, the rest should
3852                  * be added to it.
3853                  */
3854                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3855                                                      NLM_F_REPLACE);
3856                 nhn++;
3857         }
3858
3859         /* success ... tell user about new route */
3860         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3861         goto cleanup;
3862
3863 add_errout:
3864         /* send notification for routes that were added so that
3865          * the delete notifications sent by ip6_route_del are
3866          * coherent
3867          */
3868         if (rt_notif)
3869                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3870
3871         /* Delete routes that were already added */
3872         list_for_each_entry(nh, &rt6_nh_list, next) {
3873                 if (err_nh == nh)
3874                         break;
3875                 ip6_route_del(&nh->r_cfg, extack);
3876         }
3877
3878 cleanup:
3879         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3880                 if (nh->rt6_info)
3881                         dst_release_immediate(&nh->rt6_info->dst);
3882                 kfree(nh->mxc.mx);
3883                 list_del(&nh->next);
3884                 kfree(nh);
3885         }
3886
3887         return err;
3888 }
3889
3890 static int ip6_route_multipath_del(struct fib6_config *cfg,
3891                                    struct netlink_ext_ack *extack)
3892 {
3893         struct fib6_config r_cfg;
3894         struct rtnexthop *rtnh;
3895         int remaining;
3896         int attrlen;
3897         int err = 1, last_err = 0;
3898
3899         remaining = cfg->fc_mp_len;
3900         rtnh = (struct rtnexthop *)cfg->fc_mp;
3901
3902         /* Parse a Multipath Entry */
3903         while (rtnh_ok(rtnh, remaining)) {
3904                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3905                 if (rtnh->rtnh_ifindex)
3906                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3907
3908                 attrlen = rtnh_attrlen(rtnh);
3909                 if (attrlen > 0) {
3910                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3911
3912                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3913                         if (nla) {
3914                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3915                                 r_cfg.fc_flags |= RTF_GATEWAY;
3916                         }
3917                 }
3918                 err = ip6_route_del(&r_cfg, extack);
3919                 if (err)
3920                         last_err = err;
3921
3922                 rtnh = rtnh_next(rtnh, &remaining);
3923         }
3924
3925         return last_err;
3926 }
3927
3928 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3929                               struct netlink_ext_ack *extack)
3930 {
3931         struct fib6_config cfg;
3932         int err;
3933
3934         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3935         if (err < 0)
3936                 return err;
3937
3938         if (cfg.fc_mp)
3939                 return ip6_route_multipath_del(&cfg, extack);
3940         else {
3941                 cfg.fc_delete_all_nh = 1;
3942                 return ip6_route_del(&cfg, extack);
3943         }
3944 }
3945
3946 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3947                               struct netlink_ext_ack *extack)
3948 {
3949         struct fib6_config cfg;
3950         int err;
3951
3952         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3953         if (err < 0)
3954                 return err;
3955
3956         if (cfg.fc_mp)
3957                 return ip6_route_multipath_add(&cfg, extack);
3958         else
3959                 return ip6_route_add(&cfg, extack);
3960 }
3961
3962 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3963 {
3964         int nexthop_len = 0;
3965
3966         if (rt->rt6i_nsiblings) {
3967                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3968                             + NLA_ALIGN(sizeof(struct rtnexthop))
3969                             + nla_total_size(16) /* RTA_GATEWAY */
3970                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3971
3972                 nexthop_len *= rt->rt6i_nsiblings;
3973         }
3974
3975         return NLMSG_ALIGN(sizeof(struct rtmsg))
3976                + nla_total_size(16) /* RTA_SRC */
3977                + nla_total_size(16) /* RTA_DST */
3978                + nla_total_size(16) /* RTA_GATEWAY */
3979                + nla_total_size(16) /* RTA_PREFSRC */
3980                + nla_total_size(4) /* RTA_TABLE */
3981                + nla_total_size(4) /* RTA_IIF */
3982                + nla_total_size(4) /* RTA_OIF */
3983                + nla_total_size(4) /* RTA_PRIORITY */
3984                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3985                + nla_total_size(sizeof(struct rta_cacheinfo))
3986                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3987                + nla_total_size(1) /* RTA_PREF */
3988                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3989                + nexthop_len;
3990 }
3991
3992 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3993                             unsigned int *flags, bool skip_oif)
3994 {
3995         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3996                 *flags |= RTNH_F_LINKDOWN;
3997                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3998                         *flags |= RTNH_F_DEAD;
3999         }
4000
4001         if (rt->rt6i_flags & RTF_GATEWAY) {
4002                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4003                         goto nla_put_failure;
4004         }
4005
4006         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4007                 *flags |= RTNH_F_OFFLOAD;
4008
4009         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4010         if (!skip_oif && rt->dst.dev &&
4011             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4012                 goto nla_put_failure;
4013
4014         if (rt->dst.lwtstate &&
4015             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4016                 goto nla_put_failure;
4017
4018         return 0;
4019
4020 nla_put_failure:
4021         return -EMSGSIZE;
4022 }
4023
4024 /* add multipath next hop */
4025 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4026 {
4027         struct rtnexthop *rtnh;
4028         unsigned int flags = 0;
4029
4030         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4031         if (!rtnh)
4032                 goto nla_put_failure;
4033
4034         rtnh->rtnh_hops = 0;
4035         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4036
4037         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4038                 goto nla_put_failure;
4039
4040         rtnh->rtnh_flags = flags;
4041
4042         /* length of rtnetlink header + attributes */
4043         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4044
4045         return 0;
4046
4047 nla_put_failure:
4048         return -EMSGSIZE;
4049 }
4050
4051 static int rt6_fill_node(struct net *net,
4052                          struct sk_buff *skb, struct rt6_info *rt,
4053                          struct in6_addr *dst, struct in6_addr *src,
4054                          int iif, int type, u32 portid, u32 seq,
4055                          unsigned int flags)
4056 {
4057         u32 metrics[RTAX_MAX];
4058         struct rtmsg *rtm;
4059         struct nlmsghdr *nlh;
4060         long expires;
4061         u32 table;
4062
4063         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4064         if (!nlh)
4065                 return -EMSGSIZE;
4066
4067         rtm = nlmsg_data(nlh);
4068         rtm->rtm_family = AF_INET6;
4069         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4070         rtm->rtm_src_len = rt->rt6i_src.plen;
4071         rtm->rtm_tos = 0;
4072         if (rt->rt6i_table)
4073                 table = rt->rt6i_table->tb6_id;
4074         else
4075                 table = RT6_TABLE_UNSPEC;
4076         rtm->rtm_table = table;
4077         if (nla_put_u32(skb, RTA_TABLE, table))
4078                 goto nla_put_failure;
4079         if (rt->rt6i_flags & RTF_REJECT) {
4080                 switch (rt->dst.error) {
4081                 case -EINVAL:
4082                         rtm->rtm_type = RTN_BLACKHOLE;
4083                         break;
4084                 case -EACCES:
4085                         rtm->rtm_type = RTN_PROHIBIT;
4086                         break;
4087                 case -EAGAIN:
4088                         rtm->rtm_type = RTN_THROW;
4089                         break;
4090                 default:
4091                         rtm->rtm_type = RTN_UNREACHABLE;
4092                         break;
4093                 }
4094         }
4095         else if (rt->rt6i_flags & RTF_LOCAL)
4096                 rtm->rtm_type = RTN_LOCAL;
4097         else if (rt->rt6i_flags & RTF_ANYCAST)
4098                 rtm->rtm_type = RTN_ANYCAST;
4099         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4100                 rtm->rtm_type = RTN_LOCAL;
4101         else
4102                 rtm->rtm_type = RTN_UNICAST;
4103         rtm->rtm_flags = 0;
4104         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4105         rtm->rtm_protocol = rt->rt6i_protocol;
4106
4107         if (rt->rt6i_flags & RTF_CACHE)
4108                 rtm->rtm_flags |= RTM_F_CLONED;
4109
4110         if (dst) {
4111                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4112                         goto nla_put_failure;
4113                 rtm->rtm_dst_len = 128;
4114         } else if (rtm->rtm_dst_len)
4115                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4116                         goto nla_put_failure;
4117 #ifdef CONFIG_IPV6_SUBTREES
4118         if (src) {
4119                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4120                         goto nla_put_failure;
4121                 rtm->rtm_src_len = 128;
4122         } else if (rtm->rtm_src_len &&
4123                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4124                 goto nla_put_failure;
4125 #endif
4126         if (iif) {
4127 #ifdef CONFIG_IPV6_MROUTE
4128                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4129                         int err = ip6mr_get_route(net, skb, rtm, portid);
4130
4131                         if (err == 0)
4132                                 return 0;
4133                         if (err < 0)
4134                                 goto nla_put_failure;
4135                 } else
4136 #endif
4137                         if (nla_put_u32(skb, RTA_IIF, iif))
4138                                 goto nla_put_failure;
4139         } else if (dst) {
4140                 struct in6_addr saddr_buf;
4141                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4142                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4143                         goto nla_put_failure;
4144         }
4145
4146         if (rt->rt6i_prefsrc.plen) {
4147                 struct in6_addr saddr_buf;
4148                 saddr_buf = rt->rt6i_prefsrc.addr;
4149                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4150                         goto nla_put_failure;
4151         }
4152
4153         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4154         if (rt->rt6i_pmtu)
4155                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4156         if (rtnetlink_put_metrics(skb, metrics) < 0)
4157                 goto nla_put_failure;
4158
4159         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4160                 goto nla_put_failure;
4161
4162         /* For multipath routes, walk the siblings list and add
4163          * each as a nexthop within RTA_MULTIPATH.
4164          */
4165         if (rt->rt6i_nsiblings) {
4166                 struct rt6_info *sibling, *next_sibling;
4167                 struct nlattr *mp;
4168
4169                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4170                 if (!mp)
4171                         goto nla_put_failure;
4172
4173                 if (rt6_add_nexthop(skb, rt) < 0)
4174                         goto nla_put_failure;
4175
4176                 list_for_each_entry_safe(sibling, next_sibling,
4177                                          &rt->rt6i_siblings, rt6i_siblings) {
4178                         if (rt6_add_nexthop(skb, sibling) < 0)
4179                                 goto nla_put_failure;
4180                 }
4181
4182                 nla_nest_end(skb, mp);
4183         } else {
4184                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4185                         goto nla_put_failure;
4186         }
4187
4188         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4189
4190         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4191                 goto nla_put_failure;
4192
4193         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4194                 goto nla_put_failure;
4195
4196
4197         nlmsg_end(skb, nlh);
4198         return 0;
4199
4200 nla_put_failure:
4201         nlmsg_cancel(skb, nlh);
4202         return -EMSGSIZE;
4203 }
4204
4205 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4206 {
4207         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4208         struct net *net = arg->net;
4209
4210         if (rt == net->ipv6.ip6_null_entry)
4211                 return 0;
4212
4213         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4214                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4215
4216                 /* user wants prefix routes only */
4217                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4218                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4219                         /* success since this is not a prefix route */
4220                         return 1;
4221                 }
4222         }
4223
4224         return rt6_fill_node(net,
4225                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4226                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4227                      NLM_F_MULTI);
4228 }
4229
4230 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4231                               struct netlink_ext_ack *extack)
4232 {
4233         struct net *net = sock_net(in_skb->sk);
4234         struct nlattr *tb[RTA_MAX+1];
4235         int err, iif = 0, oif = 0;
4236         struct dst_entry *dst;
4237         struct rt6_info *rt;
4238         struct sk_buff *skb;
4239         struct rtmsg *rtm;
4240         struct flowi6 fl6;
4241         bool fibmatch;
4242
4243         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4244                           extack);
4245         if (err < 0)
4246                 goto errout;
4247
4248         err = -EINVAL;
4249         memset(&fl6, 0, sizeof(fl6));
4250         rtm = nlmsg_data(nlh);
4251         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4252         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4253
4254         if (tb[RTA_SRC]) {
4255                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4256                         goto errout;
4257
4258                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4259         }
4260
4261         if (tb[RTA_DST]) {
4262                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4263                         goto errout;
4264
4265                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4266         }
4267
4268         if (tb[RTA_IIF])
4269                 iif = nla_get_u32(tb[RTA_IIF]);
4270
4271         if (tb[RTA_OIF])
4272                 oif = nla_get_u32(tb[RTA_OIF]);
4273
4274         if (tb[RTA_MARK])
4275                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4276
4277         if (tb[RTA_UID])
4278                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4279                                            nla_get_u32(tb[RTA_UID]));
4280         else
4281                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4282
4283         if (iif) {
4284                 struct net_device *dev;
4285                 int flags = 0;
4286
4287                 rcu_read_lock();
4288
4289                 dev = dev_get_by_index_rcu(net, iif);
4290                 if (!dev) {
4291                         rcu_read_unlock();
4292                         err = -ENODEV;
4293                         goto errout;
4294                 }
4295
4296                 fl6.flowi6_iif = iif;
4297
4298                 if (!ipv6_addr_any(&fl6.saddr))
4299                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4300
4301                 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4302
4303                 rcu_read_unlock();
4304         } else {
4305                 fl6.flowi6_oif = oif;
4306
4307                 dst = ip6_route_output(net, NULL, &fl6);
4308         }
4309
4310
4311         rt = container_of(dst, struct rt6_info, dst);
4312         if (rt->dst.error) {
4313                 err = rt->dst.error;
4314                 ip6_rt_put(rt);
4315                 goto errout;
4316         }
4317
4318         if (rt == net->ipv6.ip6_null_entry) {
4319                 err = rt->dst.error;
4320                 ip6_rt_put(rt);
4321                 goto errout;
4322         }
4323
4324         if (fibmatch && rt->dst.from) {
4325                 struct rt6_info *ort = container_of(rt->dst.from,
4326                                                     struct rt6_info, dst);
4327
4328                 dst_hold(&ort->dst);
4329                 ip6_rt_put(rt);
4330                 rt = ort;
4331         }
4332
4333         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4334         if (!skb) {
4335                 ip6_rt_put(rt);
4336                 err = -ENOBUFS;
4337                 goto errout;
4338         }
4339
4340         skb_dst_set(skb, &rt->dst);
4341         if (fibmatch)
4342                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4343                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4344                                     nlh->nlmsg_seq, 0);
4345         else
4346                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4347                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4348                                     nlh->nlmsg_seq, 0);
4349         if (err < 0) {
4350                 kfree_skb(skb);
4351                 goto errout;
4352         }
4353
4354         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4355 errout:
4356         return err;
4357 }
4358
4359 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4360                      unsigned int nlm_flags)
4361 {
4362         struct sk_buff *skb;
4363         struct net *net = info->nl_net;
4364         u32 seq;
4365         int err;
4366
4367         err = -ENOBUFS;
4368         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4369
4370         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4371         if (!skb)
4372                 goto errout;
4373
4374         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4375                                 event, info->portid, seq, nlm_flags);
4376         if (err < 0) {
4377                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4378                 WARN_ON(err == -EMSGSIZE);
4379                 kfree_skb(skb);
4380                 goto errout;
4381         }
4382         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4383                     info->nlh, gfp_any());
4384         return;
4385 errout:
4386         if (err < 0)
4387                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4388 }
4389
4390 static int ip6_route_dev_notify(struct notifier_block *this,
4391                                 unsigned long event, void *ptr)
4392 {
4393         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4394         struct net *net = dev_net(dev);
4395
4396         if (!(dev->flags & IFF_LOOPBACK))
4397                 return NOTIFY_OK;
4398
4399         if (event == NETDEV_REGISTER) {
4400                 net->ipv6.ip6_null_entry->dst.dev = dev;
4401                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4402 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4403                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4404                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4405                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4406                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4407 #endif
4408          } else if (event == NETDEV_UNREGISTER &&
4409                     dev->reg_state != NETREG_UNREGISTERED) {
4410                 /* NETDEV_UNREGISTER could be fired for multiple times by
4411                  * netdev_wait_allrefs(). Make sure we only call this once.
4412                  */
4413                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4414 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4415                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4416                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4417 #endif
4418         }
4419
4420         return NOTIFY_OK;
4421 }
4422
4423 /*
4424  *      /proc
4425  */
4426
4427 #ifdef CONFIG_PROC_FS
4428
4429 static const struct file_operations ipv6_route_proc_fops = {
4430         .owner          = THIS_MODULE,
4431         .open           = ipv6_route_open,
4432         .read           = seq_read,
4433         .llseek         = seq_lseek,
4434         .release        = seq_release_net,
4435 };
4436
4437 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4438 {
4439         struct net *net = (struct net *)seq->private;
4440         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4441                    net->ipv6.rt6_stats->fib_nodes,
4442                    net->ipv6.rt6_stats->fib_route_nodes,
4443                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4444                    net->ipv6.rt6_stats->fib_rt_entries,
4445                    net->ipv6.rt6_stats->fib_rt_cache,
4446                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4447                    net->ipv6.rt6_stats->fib_discarded_routes);
4448
4449         return 0;
4450 }
4451
4452 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4453 {
4454         return single_open_net(inode, file, rt6_stats_seq_show);
4455 }
4456
4457 static const struct file_operations rt6_stats_seq_fops = {
4458         .owner   = THIS_MODULE,
4459         .open    = rt6_stats_seq_open,
4460         .read    = seq_read,
4461         .llseek  = seq_lseek,
4462         .release = single_release_net,
4463 };
4464 #endif  /* CONFIG_PROC_FS */
4465
4466 #ifdef CONFIG_SYSCTL
4467
4468 static
4469 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4470                               void __user *buffer, size_t *lenp, loff_t *ppos)
4471 {
4472         struct net *net;
4473         int delay;
4474         if (!write)
4475                 return -EINVAL;
4476
4477         net = (struct net *)ctl->extra1;
4478         delay = net->ipv6.sysctl.flush_delay;
4479         proc_dointvec(ctl, write, buffer, lenp, ppos);
4480         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4481         return 0;
4482 }
4483
4484 struct ctl_table ipv6_route_table_template[] = {
4485         {
4486                 .procname       =       "flush",
4487                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4488                 .maxlen         =       sizeof(int),
4489                 .mode           =       0200,
4490                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4491         },
4492         {
4493                 .procname       =       "gc_thresh",
4494                 .data           =       &ip6_dst_ops_template.gc_thresh,
4495                 .maxlen         =       sizeof(int),
4496                 .mode           =       0644,
4497                 .proc_handler   =       proc_dointvec,
4498         },
4499         {
4500                 .procname       =       "max_size",
4501                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4502                 .maxlen         =       sizeof(int),
4503                 .mode           =       0644,
4504                 .proc_handler   =       proc_dointvec,
4505         },
4506         {
4507                 .procname       =       "gc_min_interval",
4508                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4509                 .maxlen         =       sizeof(int),
4510                 .mode           =       0644,
4511                 .proc_handler   =       proc_dointvec_jiffies,
4512         },
4513         {
4514                 .procname       =       "gc_timeout",
4515                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4516                 .maxlen         =       sizeof(int),
4517                 .mode           =       0644,
4518                 .proc_handler   =       proc_dointvec_jiffies,
4519         },
4520         {
4521                 .procname       =       "gc_interval",
4522                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4523                 .maxlen         =       sizeof(int),
4524                 .mode           =       0644,
4525                 .proc_handler   =       proc_dointvec_jiffies,
4526         },
4527         {
4528                 .procname       =       "gc_elasticity",
4529                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4530                 .maxlen         =       sizeof(int),
4531                 .mode           =       0644,
4532                 .proc_handler   =       proc_dointvec,
4533         },
4534         {
4535                 .procname       =       "mtu_expires",
4536                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4537                 .maxlen         =       sizeof(int),
4538                 .mode           =       0644,
4539                 .proc_handler   =       proc_dointvec_jiffies,
4540         },
4541         {
4542                 .procname       =       "min_adv_mss",
4543                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4544                 .maxlen         =       sizeof(int),
4545                 .mode           =       0644,
4546                 .proc_handler   =       proc_dointvec,
4547         },
4548         {
4549                 .procname       =       "gc_min_interval_ms",
4550                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4551                 .maxlen         =       sizeof(int),
4552                 .mode           =       0644,
4553                 .proc_handler   =       proc_dointvec_ms_jiffies,
4554         },
4555         { }
4556 };
4557
4558 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4559 {
4560         struct ctl_table *table;
4561
4562         table = kmemdup(ipv6_route_table_template,
4563                         sizeof(ipv6_route_table_template),
4564                         GFP_KERNEL);
4565
4566         if (table) {
4567                 table[0].data = &net->ipv6.sysctl.flush_delay;
4568                 table[0].extra1 = net;
4569                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4570                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4571                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4572                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4573                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4574                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4575                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4576                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4577                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4578
4579                 /* Don't export sysctls to unprivileged users */
4580                 if (net->user_ns != &init_user_ns)
4581                         table[0].procname = NULL;
4582         }
4583
4584         return table;
4585 }
4586 #endif
4587
4588 static int __net_init ip6_route_net_init(struct net *net)
4589 {
4590         int ret = -ENOMEM;
4591
4592         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4593                sizeof(net->ipv6.ip6_dst_ops));
4594
4595         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4596                 goto out_ip6_dst_ops;
4597
4598         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4599                                            sizeof(*net->ipv6.ip6_null_entry),
4600                                            GFP_KERNEL);
4601         if (!net->ipv6.ip6_null_entry)
4602                 goto out_ip6_dst_entries;
4603         net->ipv6.ip6_null_entry->dst.path =
4604                 (struct dst_entry *)net->ipv6.ip6_null_entry;
4605         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4606         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4607                          ip6_template_metrics, true);
4608
4609 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4610         net->ipv6.fib6_has_custom_rules = false;
4611         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4612                                                sizeof(*net->ipv6.ip6_prohibit_entry),
4613                                                GFP_KERNEL);
4614         if (!net->ipv6.ip6_prohibit_entry)
4615                 goto out_ip6_null_entry;
4616         net->ipv6.ip6_prohibit_entry->dst.path =
4617                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4618         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4619         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4620                          ip6_template_metrics, true);
4621
4622         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4623                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
4624                                                GFP_KERNEL);
4625         if (!net->ipv6.ip6_blk_hole_entry)
4626                 goto out_ip6_prohibit_entry;
4627         net->ipv6.ip6_blk_hole_entry->dst.path =
4628                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4629         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4630         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4631                          ip6_template_metrics, true);
4632 #endif
4633
4634         net->ipv6.sysctl.flush_delay = 0;
4635         net->ipv6.sysctl.ip6_rt_max_size = 4096;
4636         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4637         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4638         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4639         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4640         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4641         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4642
4643         net->ipv6.ip6_rt_gc_expire = 30*HZ;
4644
4645         ret = 0;
4646 out:
4647         return ret;
4648
4649 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4650 out_ip6_prohibit_entry:
4651         kfree(net->ipv6.ip6_prohibit_entry);
4652 out_ip6_null_entry:
4653         kfree(net->ipv6.ip6_null_entry);
4654 #endif
4655 out_ip6_dst_entries:
4656         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4657 out_ip6_dst_ops:
4658         goto out;
4659 }
4660
4661 static void __net_exit ip6_route_net_exit(struct net *net)
4662 {
4663         kfree(net->ipv6.ip6_null_entry);
4664 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4665         kfree(net->ipv6.ip6_prohibit_entry);
4666         kfree(net->ipv6.ip6_blk_hole_entry);
4667 #endif
4668         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4669 }
4670
4671 static int __net_init ip6_route_net_init_late(struct net *net)
4672 {
4673 #ifdef CONFIG_PROC_FS
4674         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4675         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4676 #endif
4677         return 0;
4678 }
4679
4680 static void __net_exit ip6_route_net_exit_late(struct net *net)
4681 {
4682 #ifdef CONFIG_PROC_FS
4683         remove_proc_entry("ipv6_route", net->proc_net);
4684         remove_proc_entry("rt6_stats", net->proc_net);
4685 #endif
4686 }
4687
4688 static struct pernet_operations ip6_route_net_ops = {
4689         .init = ip6_route_net_init,
4690         .exit = ip6_route_net_exit,
4691 };
4692
4693 static int __net_init ipv6_inetpeer_init(struct net *net)
4694 {
4695         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4696
4697         if (!bp)
4698                 return -ENOMEM;
4699         inet_peer_base_init(bp);
4700         net->ipv6.peers = bp;
4701         return 0;
4702 }
4703
4704 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4705 {
4706         struct inet_peer_base *bp = net->ipv6.peers;
4707
4708         net->ipv6.peers = NULL;
4709         inetpeer_invalidate_tree(bp);
4710         kfree(bp);
4711 }
4712
4713 static struct pernet_operations ipv6_inetpeer_ops = {
4714         .init   =       ipv6_inetpeer_init,
4715         .exit   =       ipv6_inetpeer_exit,
4716 };
4717
4718 static struct pernet_operations ip6_route_net_late_ops = {
4719         .init = ip6_route_net_init_late,
4720         .exit = ip6_route_net_exit_late,
4721 };
4722
4723 static struct notifier_block ip6_route_dev_notifier = {
4724         .notifier_call = ip6_route_dev_notify,
4725         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4726 };
4727
4728 void __init ip6_route_init_special_entries(void)
4729 {
4730         /* Registering of the loopback is done before this portion of code,
4731          * the loopback reference in rt6_info will not be taken, do it
4732          * manually for init_net */
4733         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4734         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4735   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4736         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4737         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4738         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4739         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4740   #endif
4741 }
4742
4743 int __init ip6_route_init(void)
4744 {
4745         int ret;
4746         int cpu;
4747
4748         ret = -ENOMEM;
4749         ip6_dst_ops_template.kmem_cachep =
4750                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4751                                   SLAB_HWCACHE_ALIGN, NULL);
4752         if (!ip6_dst_ops_template.kmem_cachep)
4753                 goto out;
4754
4755         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4756         if (ret)
4757                 goto out_kmem_cache;
4758
4759         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4760         if (ret)
4761                 goto out_dst_entries;
4762
4763         ret = register_pernet_subsys(&ip6_route_net_ops);
4764         if (ret)
4765                 goto out_register_inetpeer;
4766
4767         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4768
4769         ret = fib6_init();
4770         if (ret)
4771                 goto out_register_subsys;
4772
4773         ret = xfrm6_init();
4774         if (ret)
4775                 goto out_fib6_init;
4776
4777         ret = fib6_rules_init();
4778         if (ret)
4779                 goto xfrm6_init;
4780
4781         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4782         if (ret)
4783                 goto fib6_rules_init;
4784
4785         ret = -ENOBUFS;
4786         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4787             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4788             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4789                             RTNL_FLAG_DOIT_UNLOCKED))
4790                 goto out_register_late_subsys;
4791
4792         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4793         if (ret)
4794                 goto out_register_late_subsys;
4795
4796         for_each_possible_cpu(cpu) {
4797                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4798
4799                 INIT_LIST_HEAD(&ul->head);
4800                 spin_lock_init(&ul->lock);
4801         }
4802
4803 out:
4804         return ret;
4805
4806 out_register_late_subsys:
4807         unregister_pernet_subsys(&ip6_route_net_late_ops);
4808 fib6_rules_init:
4809         fib6_rules_cleanup();
4810 xfrm6_init:
4811         xfrm6_fini();
4812 out_fib6_init:
4813         fib6_gc_cleanup();
4814 out_register_subsys:
4815         unregister_pernet_subsys(&ip6_route_net_ops);
4816 out_register_inetpeer:
4817         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4818 out_dst_entries:
4819         dst_entries_destroy(&ip6_dst_blackhole_ops);
4820 out_kmem_cache:
4821         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4822         goto out;
4823 }
4824
4825 void ip6_route_cleanup(void)
4826 {
4827         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4828         unregister_pernet_subsys(&ip6_route_net_late_ops);
4829         fib6_rules_cleanup();
4830         xfrm6_fini();
4831         fib6_gc_cleanup();
4832         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4833         unregister_pernet_subsys(&ip6_route_net_ops);
4834         dst_entries_destroy(&ip6_dst_blackhole_ops);
4835         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4836 }