Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
80 enum rt6_nud_state {
81         RT6_NUD_FAIL_HARD = -3,
82         RT6_NUD_FAIL_PROBE = -2,
83         RT6_NUD_FAIL_DO_RR = -1,
84         RT6_NUD_SUCCEED = 1
85 };
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
128 struct uncached_list {
129         spinlock_t              lock;
130         struct list_head        head;
131 };
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
/* Add an uncached (not fib-owned) rt6_info to this CPU's uncached list so
 * it can be found and re-homed if its device disappears
 * (see rt6_uncached_list_flush_dev()).
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	/* remember which per-CPU list we joined so _del() locks the right one */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
145
/* Remove @rt from the per-CPU uncached list it was added to, if any.
 * Safe to call on an rt that was never added (list_empty() check).
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		/* lock the list recorded at add time, not this CPU's list */
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
158
/* @dev is going away: walk every CPU's uncached list and re-point any
 * rt6_info still referencing @dev (device and/or inet6_dev) at the netns
 * loopback device, transferring the reference counts accordingly.
 * Nothing to do when @dev is the loopback device itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* swap the net_device reference over to loopback */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
/* Look up (and create on demand) the neighbour entry for the next hop of
 * a route via @gw on @dev.  Returns NULL on creation failure rather than
 * an ERR_PTR, so callers only need a NULL check.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	/* no entry yet: create one in the IPv6 ND table */
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
217
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
219                                               struct sk_buff *skb,
220                                               const void *daddr)
221 {
222         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
223
224         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 }
226
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
228 {
229         struct net_device *dev = dst->dev;
230         struct rt6_info *rt = (struct rt6_info *)dst;
231
232         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233         if (!daddr)
234                 return;
235         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
236                 return;
237         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
238                 return;
239         __ipv6_confirm_neigh(dev, daddr);
240 }
241
/* dst_ops for ordinary IPv6 routes; copied into each netns
 * (net->ipv6.ip6_dst_ops).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* Blackhole dsts deliberately ignore PMTU updates: no-op handler. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* Blackhole dsts deliberately ignore redirects: no-op handler. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

/* dst_ops for blackhole dsts: no gc/ifdown, and PMTU/redirect events are
 * swallowed by the no-op handlers above.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
289
/* Metrics template for the special entries below: only the hop-limit slot,
 * left at 0 ("unset").
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* Template for the per-netns "null" fib6 entry: lowest possible priority
 * (metric ~0), rejects with RTN_UNREACHABLE.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

/* dst-level counterpart of the null entry: discards traffic and reports
 * -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
314
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for "prohibit" routes: discard and report -EACCES
 * (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Template for "blackhole" routes: silently drop via dst_discard. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
342
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* zero every rt6_info field that follows the embedded dst_entry */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
350
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353                                int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt) {
359                 rt6_info_init(rt);
360                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
361         }
362
363         return rt;
364 }
365 EXPORT_SYMBOL(ip6_dst_alloc);
366
367 static void ip6_dst_destroy(struct dst_entry *dst)
368 {
369         struct rt6_info *rt = (struct rt6_info *)dst;
370         struct fib6_info *from;
371         struct inet6_dev *idev;
372
373         ip_dst_metrics_put(dst);
374         rt6_uncached_list_del(rt);
375
376         idev = rt->rt6i_idev;
377         if (idev) {
378                 rt->rt6i_idev = NULL;
379                 in6_dev_put(idev);
380         }
381
382         rcu_read_lock();
383         from = rcu_dereference(rt->from);
384         rcu_assign_pointer(rt->from, NULL);
385         fib6_info_release(from);
386         rcu_read_unlock();
387 }
388
/* dst_ops->ifdown: @dev is going down, so re-point this dst's inet6_dev
 * reference at the netns loopback device (keeping a valid idev while
 * letting @dev's idev refcount drop).
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
/* Like __rt6_check_expired(), but a route without its own expiry is also
 * considered expired when it has been invalidated (dst.obsolete changed
 * from DST_OBSOLETE_FORCE_CHK) or when its originating fib6_info expired.
 * Caller must hold rcu_read_lock() (rt->from is rcu_dereference()d).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
429
/* Choose one nexthop out of an ECMP group by flow hash: each sibling owns
 * the hash slice up to its nh_upper_bound.  Falls back to @match when the
 * selected sibling fails rt6_score_route().  Called under rcu_read_lock.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* the first member of the group owns the lowest hash slice */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* hash slice matched but the nexthop scored bad: keep @match */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
/* Walk the sibling chain starting at @rt and return the first live entry
 * matching the requested output interface (@oif) or source address
 * (@saddr).  Falls back to fib6_null_entry when a strict interface match
 * was required but none was found, or when @rt itself is dead.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	/* nothing to match against: take @rt as long as it is alive */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	/* strict interface match requested but not satisfied */
	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
500
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation request: rt6_probe() runs in contexts
 * where the NS cannot be sent directly, so it queues one of these.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;	/* held; released by the worker */
};

/* Workqueue callback: send a neighbour solicitation for @work->target and
 * drop the device reference taken by rt6_probe().
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		/* neighbour already reachable: nothing to probe */
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* recheck state under the neighbour lock; rate-limit by
		 * neigh->updated + rtr_probe_interval
		 */
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		/* no neighbour entry yet: rate-limit via rt->last_probe */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released in rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Router reachability probing compiled out: no-op. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
/* Classify the reachability of @rt's next hop for router selection:
 * SUCCEED for non-gateway routes or (probably) reachable neighbours,
 * FAIL_PROBE / FAIL_DO_RR otherwise (see enum rt6_nud_state).
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	/* no next hop to validate */
	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preference, anything not FAILED still counts */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* unknown neighbour: acceptable with router pref (it will be
		 * probed), otherwise ask the caller to round-robin
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
623
/* Score @rt for default-router selection.  Higher is better; negative
 * return values are the RT6_NUD_FAIL_* codes from rt6_check_neigh().
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	/* 2 if the route can use @oif, else 0 */
	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* fold the router-preference bits in above the device score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
641
642 /* called with rc_read_lock held */
643 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
644 {
645         const struct net_device *dev = fib6_info_nh_dev(f6i);
646         bool rc = false;
647
648         if (dev) {
649                 const struct inet6_dev *idev = __in6_dev_get(dev);
650
651                 rc = !!idev->cnf.ignore_routes_with_linkdown;
652         }
653
654         return rc;
655 }
656
/* Score @rt and return whichever of (@rt, @match) scores higher, tracking
 * the best score in *@mpri.  Sets *@do_rr when the chosen entry asked for
 * round-robin (unknown neighbour without router preference).  Dead,
 * link-down (when not ignored) and expired entries are skipped.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
695
/* Find the best route among the entries of @fn with metric @metric,
 * scanning round-robin style: first from @rr_head to the end of the
 * equal-metric run, then from @leaf up to @rr_head.  If nothing matched
 * and a different-metric continuation (@cont) exists, scan that too.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* no usable route with @metric: fall through to worse metrics */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
734
/* Select the best route from fib6 node @fn, starting the scan at the
 * node's round-robin pointer and advancing it when find_rr_leaf() asked
 * for round-robin.  Returns fib6_null_entry when nothing usable exists.
 * Called under rcu_read_lock; takes tb6_lock only to advance rr_ptr.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
784
785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
786 {
787         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 }
789
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (@opt, @len bytes) received in a
 * Router Advertisement from @gwaddr on @dev: add, refresh or delete the
 * corresponding RTF_ROUTEINFO route.  Returns 0 on success or -EINVAL
 * for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 8 prefix bytes */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		/* option carries a full 128-bit prefix: use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* refresh the preference bits on the existing route */
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the lookup/creation reference */
		fib6_info_release(rt);
	}
	return 0;
}
#endif
865
866 /*
867  *      Misc support functions
868  */
869
870 /* called with rcu_lock held */
871 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
872 {
873         struct net_device *dev = rt->fib6_nh.nh_dev;
874
875         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
876                 /* for copies of local routes, dst->dev needs to be the
877                  * device if it is a master device, the master device if
878                  * device is enslaved, and the loopback as the default
879                  */
880                 if (netif_is_l3_slave(dev) &&
881                     !rt6_need_strict(&rt->fib6_dst.addr))
882                         dev = l3mdev_master_dev_rcu(dev);
883                 else if (!netif_is_l3_master(dev))
884                         dev = dev_net(dev)->loopback_dev;
885                 /* last case is netif_is_l3_master(dev) is true in which
886                  * case we want dev returned to be dev
887                  */
888         }
889
890         return dev;
891 }
892
/* Map an RTN_* route type to the dst error a matching packet should
 * see.  Types that deliver or forward normally map to 0; reject-style
 * types map to a negative errno (consumed via ip6_rt_type_to_error()).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST] = 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
907
/* Return the dst error code for a fib6 route type (see fib6_prop table) */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
912
913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
914 {
915         unsigned short flags = 0;
916
917         if (rt->dst_nocount)
918                 flags |= DST_NOCOUNT;
919         if (rt->dst_nopolicy)
920                 flags |= DST_NOPOLICY;
921         if (rt->dst_host)
922                 flags |= DST_HOST;
923
924         return flags;
925 }
926
927 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
928 {
929         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
930
931         switch (ort->fib6_type) {
932         case RTN_BLACKHOLE:
933                 rt->dst.output = dst_discard_out;
934                 rt->dst.input = dst_discard;
935                 break;
936         case RTN_PROHIBIT:
937                 rt->dst.output = ip6_pkt_prohibit_out;
938                 rt->dst.input = ip6_pkt_prohibit;
939                 break;
940         case RTN_THROW:
941         case RTN_UNREACHABLE:
942         default:
943                 rt->dst.output = ip6_pkt_discard_out;
944                 rt->dst.input = ip6_pkt_discard;
945                 break;
946         }
947 }
948
/* Initialize the dst handlers (input/output/error) and lwtunnel state
 * of @rt from its parent fib6_info @ort.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	/* reject routes get error-reporting handlers instead */
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	/* inherit lightweight-tunnel state; lwtunnel_set_redirect() may
	 * override the input/output handlers just set above
	 */
	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
974
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	/* expiry is governed by @from once the back-pointer is set */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
982
/* Caller must already hold reference to @ort */
/* Copy the routing-relevant fields of fib6_info @ort into the freshly
 * allocated rt6_info @rt (dst handlers, destination, idev, gateway,
 * flags, metrics and - with subtrees - the source prefix).
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* takes a reference on the inet6_dev; dropped when rt is freed */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
999
/* Walk back up the fib tree from @fn until a node carrying route info
 * (RTN_RTINFO) is found, descending into a parent's source subtree
 * when one exists.  Returns NULL once the table root is reached.
 * Caller must hold rcu_read_lock() (parent pointers are RCU-protected).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		/* re-do the source lookup in the parent's subtree unless
		 * we just came from that subtree
		 */
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1017
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1019 {
1020         struct rt6_info *rt = *prt;
1021
1022         if (dst_hold_safe(&rt->dst))
1023                 return true;
1024         if (net) {
1025                 rt = net->ipv6.ip6_null_entry;
1026                 dst_hold(&rt->dst);
1027         } else {
1028                 rt = NULL;
1029         }
1030         *prt = rt;
1031         return false;
1032 }
1033
1034 /* called with rcu_lock held */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1036 {
1037         unsigned short flags = fib6_info_dst_flags(rt);
1038         struct net_device *dev = rt->fib6_nh.nh_dev;
1039         struct rt6_info *nrt;
1040
1041         if (!fib6_info_hold_safe(rt))
1042                 return NULL;
1043
1044         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1045         if (nrt)
1046                 ip6_rt_copy_init(nrt, rt);
1047         else
1048                 fib6_info_release(rt);
1049
1050         return nrt;
1051 }
1052
/* Policy-rule lookup backend: resolve @fl6 in @table and return a held
 * rt6_info.  Never returns NULL - the netns null entry is handed back
 * when no route (or no memory) is available.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* the caller asked to skip nexthop oif matching */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* only spread over siblings when no oif was requested */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		/* nothing usable at this node: climb the tree and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* no cached clone yet: create one; fall back to the
		 * null entry if allocation fails
		 */
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}
1108
/* Exported front-end: run @fl6 through the fib rules framework with
 * ip6_pol_route_lookup() as the per-table resolver.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1115
1116 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1117                             const struct in6_addr *saddr, int oif,
1118                             const struct sk_buff *skb, int strict)
1119 {
1120         struct flowi6 fl6 = {
1121                 .flowi6_oif = oif,
1122                 .daddr = *daddr,
1123         };
1124         struct dst_entry *dst;
1125         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1126
1127         if (saddr) {
1128                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1129                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1130         }
1131
1132         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1133         if (dst->error == 0)
1134                 return (struct rt6_info *) dst;
1135
1136         dst_release(dst);
1137
1138         return NULL;
1139 }
1140 EXPORT_SYMBOL(rt6_lookup);
1141
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold a dst reference before calling it.
 */
1147
1148 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1149                         struct netlink_ext_ack *extack)
1150 {
1151         int err;
1152         struct fib6_table *table;
1153
1154         table = rt->fib6_table;
1155         spin_lock_bh(&table->tb6_lock);
1156         err = fib6_add(&table->tb6_root, rt, info, extack);
1157         spin_unlock_bh(&table->tb6_lock);
1158
1159         return err;
1160 }
1161
/* Convenience wrapper around __ip6_ins_rt() with only the netns set
 * in the netlink info and no extended ack.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1168
/* Clone fib6_info @ort into a new RTF_CACHE host route for @daddr
 * (and, with subtrees, @saddr).  Returns NULL if @ort is going away
 * or the dst allocation fails.  Caller must hold rcu_read_lock()
 * (ip6_rt_get_dev_rcu() requires it).
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		/* drop the reference taken above */
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* the clone is a /128 host route regardless of ort's prefix */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr matching a non-host on-link prefix exactly is an
		 * anycast address
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1210
/* Allocate a per-cpu RTF_PCPU clone of @rt.  Takes a reference on @rt
 * (released again on allocation failure).  Returns NULL if @rt is
 * going away or the dst cannot be allocated.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* rcu section only spans the device resolution */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1232
1233 /* It should be called with rcu_read_lock() acquired */
1234 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1235 {
1236         struct rt6_info *pcpu_rt, **p;
1237
1238         p = this_cpu_ptr(rt->rt6i_pcpu);
1239         pcpu_rt = *p;
1240
1241         if (pcpu_rt)
1242                 ip6_hold_safe(NULL, &pcpu_rt);
1243
1244         return pcpu_rt;
1245 }
1246
/* Create and publish the per-cpu clone of @rt for this cpu.  On
 * allocation failure the held null entry is returned instead.  The
 * caller must guarantee the per-cpu slot is currently empty - the
 * cmpxchg() from NULL is asserted with BUG_ON().
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* one reference for the per-cpu slot, on top of the allocation's */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1265
1266 /* exception hash table implementation
1267  */
1268 static DEFINE_SPINLOCK(rt6_exception_lock);
1269
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	/* tolerate NULL so callers (e.g. remove_oldest on an empty
	 * chain) need not check first
	 */
	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* RCU readers may still walk the chain; actual freeing waits
	 * for a grace period
	 */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1300
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	/* pick the entry with the oldest stamp (jiffies of last use) */
	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	/* rt6_remove_exception() is a no-op when the chain was empty */
	rt6_remove_exception(bucket, oldest);
}
1317
/* Hash a (dst, src) address pair into one of the
 * 1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT exception buckets.
 *
 * NOTE(review): jhash keyed by a boot-time random seed is not designed
 * to resist hash-flooding by attacker-chosen addresses; later upstream
 * kernels switched this to siphash (needs <linux/siphash.h>) - worth
 * confirming whether that hardening should be backported here.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1333
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket from the array base to the hashed slot; the
	 * caller sees (and may insert into) the updated bucket
	 */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1366
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * Lockless twin of __rt6_find_exception_spinlock(): walks the chain
 * with hlist_for_each_entry_rcu() instead of relying on the lock.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket from the array base to the hashed slot */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1401
1402 static unsigned int fib6_mtu(const struct fib6_info *rt)
1403 {
1404         unsigned int mtu;
1405
1406         if (rt->fib6_pmtu) {
1407                 mtu = rt->fib6_pmtu;
1408         } else {
1409                 struct net_device *dev = fib6_info_nh_dev(rt);
1410                 struct inet6_dev *idev;
1411
1412                 rcu_read_lock();
1413                 idev = __in6_dev_get(dev);
1414                 mtu = idev->cnf.mtu6;
1415                 rcu_read_unlock();
1416         }
1417
1418         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1419
1420         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1421 }
1422
/* Insert cached route @nrt as an exception hanging off fib entry @ort.
 * Returns 0 on success or a negative errno.  On success the table
 * sernum is bumped so stale cached dsts get revalidated.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() already tore this table down */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	/* lazily allocate the bucket array on first insertion */
	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace an existing exception for the same (daddr, saddr);
	 * note this also moves 'bucket' to the hashed slot
	 */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* bound chain length by evicting the least recently used entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1502
/* Drop every cached exception route hanging off @rt and mark the table
 * flushed so rt6_insert_exception() cannot repopulate it afterwards.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe variant: entries are unlinked while iterating */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1529
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached rt6_info for (daddr, saddr), or NULL if there is
 * none or the cached entry has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1561
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	/* NOTE(review): rcu_dereference() here relies on the caller
	 * holding rcu_read_lock() - verify at the call sites.
	 */
	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1605
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * The stamp records the last use (in jiffies) and drives LRU eviction
 * in rt6_exception_remove_oldest() and aging in rt6_age_exceptions().
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1642
1643 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1644                                          struct rt6_info *rt, int mtu)
1645 {
1646         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1647          * lowest MTU in the path: always allow updating the route PMTU to
1648          * reflect PMTU decreases.
1649          *
1650          * If the new MTU is higher, and the route PMTU is equal to the local
1651          * MTU, this means the old MTU is the lowest in the path, so allow
1652          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1653          * handle this.
1654          */
1655
1656         if (dst_mtu(&rt->dst) >= mtu)
1657                 return true;
1658
1659         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1660                 return true;
1661
1662         return false;
1663 }
1664
/* Propagate an MTU change to all cached exception routes of @rt when
 * rt6_mtu_change_route_allowed() permits it.
 * Caller must hold rt6_exception_lock (see rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1693
/* a cached clone that was learned via a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1695
/* Remove all cached exception routes of @rt that were learned via
 * @gateway (both RTF_GATEWAY and RTF_CACHE set and matching gateway
 * address).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries are unlinked in-loop */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1730
/* Examine one exception entry during garbage collection and remove it
 * when aged out, expired, or its gateway neighbour is no longer a
 * router; otherwise bump gc_args->more so GC keeps running.
 * Caller must hold rt6_exception_lock (rt6_remove_exception requires
 * it); the neighbour lookup is the _noref variant - presumably the GC
 * path also runs under RCU/BH protection (TODO confirm at call site).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* the gateway stopped advertising itself as a router */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1774
/* Walk @rt's whole exception table and garbage-collect each entry via
 * rt6_age_examine_exception().  Takes rcu_read_lock_bh() (needed by the
 * noref neighbour lookup inside the examine helper) and
 * rt6_exception_lock (needed to unlink entries).
 */
void rt6_age_exceptions(struct fib6_info *rt,
                        struct fib6_gc_args *gc_args,
                        unsigned long now)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        /* Cheap unlocked peek: nothing to age without an exception table. */
        if (!rcu_access_pointer(rt->rt6i_exception_bucket))
                return;

        rcu_read_lock_bh();
        spin_lock(&rt6_exception_lock);
        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                    lockdep_is_held(&rt6_exception_lock));

        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                rt6_age_examine_exception(bucket, rt6_ex,
                                                          gc_args, now);
                        }
                        bucket++;
                }
        }
        spin_unlock(&rt6_exception_lock);
        rcu_read_unlock_bh();
}
1805
/* Look up the best fib6 entry for @fl6 in @table.
 *
 * Must be called with the RCU read lock held; the returned fib6_info
 * is only valid inside that RCU section unless the caller takes a
 * reference.
 *
 * On a miss the lookup backtracks up the fib trie; if nothing is found
 * and RT6_LOOKUP_F_REACHABLE was set, the lookup is retried once from
 * the original node without that flag so unreachable routes are also
 * considered.  Returns net->ipv6.fib6_null_entry when no route matches.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
                                    int oif, struct flowi6 *fl6, int strict)
{
        struct fib6_node *fn, *saved_fn;
        struct fib6_info *f6i;

        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        saved_fn = fn;

        /* Caller asked to ignore the nexthop's output device. */
        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                oif = 0;

redo_rt6_select:
        f6i = rt6_select(net, fn, oif, strict);
        if (f6i == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto redo_rt6_select;
                else if (strict & RT6_LOOKUP_F_REACHABLE) {
                        /* also consider unreachable route */
                        strict &= ~RT6_LOOKUP_F_REACHABLE;
                        fn = saved_fn;
                        goto redo_rt6_select;
                }
        }

        trace_fib6_table_lookup(net, f6i, table, fl6);

        return f6i;
}
1837
/* Policy-routing workhorse shared by the input and output paths.
 *
 * Resolves @fl6 in @table to a rt6_info the caller may use outside RCU:
 * every successful return path takes its own reference (dst_hold,
 * dst_use_noref, or the refcount taken at allocation time).
 *
 * Resolution order after the fib lookup (and optional multipath
 * selection): 1) a cached exception-table clone, 2) for KNOWN_NH
 * flows without a gateway, a fresh uncached RTF_CACHE clone, 3) the
 * per-cpu copy of the fib entry (created on demand).
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags)
{
        struct fib6_info *f6i;
        struct rt6_info *rt;
        int strict = 0;

        strict |= flags & RT6_LOOKUP_F_IFACE;
        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
        /* Hosts (forwarding disabled) prefer reachable routers, per the
         * default-router selection rules.
         */
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;

        rcu_read_lock();

        f6i = fib6_table_lookup(net, table, oif, fl6, strict);
        if (f6i->fib6_nsiblings)
                f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

        if (f6i == net->ipv6.fib6_null_entry) {
                rt = net->ipv6.ip6_null_entry;
                rcu_read_unlock();
                dst_hold(&rt->dst);
                return rt;
        }

        /*Search through exception table */
        rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
        if (rt) {
                /* ip6_hold_safe() falls back to ip6_null_entry (already
                 * held) when the cached dst's refcount hit zero.
                 */
                if (ip6_hold_safe(net, &rt))
                        dst_use_noref(&rt->dst, jiffies);

                rcu_read_unlock();
                return rt;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                            !(f6i->fib6_flags & RTF_GATEWAY))) {
                /* Create a RTF_CACHE clone which will not be
                 * owned by the fib6 tree.  It is for the special case where
                 * the daddr in the skb during the neighbor look-up is different
                 * from the fl6->daddr used to look-up route here.
                 */
                struct rt6_info *uncached_rt;

                uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

                rcu_read_unlock();

                if (uncached_rt) {
                        /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
                         * No need for another dst_hold()
                         */
                        rt6_uncached_list_add(uncached_rt);
                        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
                } else {
                        uncached_rt = net->ipv6.ip6_null_entry;
                        dst_hold(&uncached_rt->dst);
                }

                return uncached_rt;
        } else {
                /* Get a percpu copy */

                struct rt6_info *pcpu_rt;

                /* BH must stay disabled while touching the per-cpu slot. */
                local_bh_disable();
                pcpu_rt = rt6_get_pcpu_route(f6i);

                if (!pcpu_rt)
                        pcpu_rt = rt6_make_pcpu_route(net, f6i);

                local_bh_enable();
                rcu_read_unlock();

                return pcpu_rt;
        }
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1915
1916 static struct rt6_info *ip6_pol_route_input(struct net *net,
1917                                             struct fib6_table *table,
1918                                             struct flowi6 *fl6,
1919                                             const struct sk_buff *skb,
1920                                             int flags)
1921 {
1922         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1923 }
1924
1925 struct dst_entry *ip6_route_input_lookup(struct net *net,
1926                                          struct net_device *dev,
1927                                          struct flowi6 *fl6,
1928                                          const struct sk_buff *skb,
1929                                          int flags)
1930 {
1931         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1932                 flags |= RT6_LOOKUP_F_IFACE;
1933
1934         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1935 }
1936 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1937
/* Fill @keys with the L3 fields used for multipath hashing of @skb.
 *
 * For ICMPv6 error messages the hash must be computed over the
 * *embedded* (offending) header so the error follows the same path as
 * the flow it refers to; in that case any pre-dissected @flkeys are
 * discarded.  For all other packets, @flkeys (when non-NULL) provides
 * the already-dissected fields and is copied verbatim.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
                                  struct flow_keys *keys,
                                  struct flow_keys *flkeys)
{
        const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
        const struct ipv6hdr *key_iph = outer_iph;
        struct flow_keys *_flkeys = flkeys;
        const struct ipv6hdr *inner_iph;
        const struct icmp6hdr *icmph;
        struct ipv6hdr _inner_iph;
        struct icmp6hdr _icmph;

        if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
                goto out;

        icmph = skb_header_pointer(skb, skb_transport_offset(skb),
                                   sizeof(_icmph), &_icmph);
        if (!icmph)
                goto out;

        /* Only ICMPv6 *error* types embed the offending packet. */
        if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
            icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
            icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
            icmph->icmp6_type != ICMPV6_PARAMPROB)
                goto out;

        inner_iph = skb_header_pointer(skb,
                                       skb_transport_offset(skb) + sizeof(*icmph),
                                       sizeof(_inner_iph), &_inner_iph);
        if (!inner_iph)
                goto out;

        /* Hash on the embedded header; pre-dissected keys describe the
         * outer packet and no longer apply.
         */
        key_iph = inner_iph;
        _flkeys = NULL;
out:
        if (_flkeys) {
                keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
                keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
                keys->tags.flow_label = _flkeys->tags.flow_label;
                keys->basic.ip_proto = _flkeys->basic.ip_proto;
        } else {
                keys->addrs.v6addrs.src = key_iph->saddr;
                keys->addrs.v6addrs.dst = key_iph->daddr;
                keys->tags.flow_label = ip6_flowlabel(key_iph);
                keys->basic.ip_proto = key_iph->nexthdr;
        }
}
1985
/* Compute the multipath hash for a flow.
 *
 * If @skb is set it is used (and @fl6 can be NULL); otherwise @fl6
 * supplies the flow fields.  Policy 0 hashes on L3 fields only;
 * policy 1 hashes on L3 + L4 ports.  The result is shifted right by
 * one so the hash is never negative when stored in signed fields.
 *
 * NOTE(review): the switch has no default case, so hash_keys would be
 * read uninitialized for an unknown policy value — presumably the
 * sysctl limits the policy to 0..1 here; confirm against the sysctl
 * table before extending.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
                       const struct sk_buff *skb, struct flow_keys *flkeys)
{
        struct flow_keys hash_keys;
        u32 mhash;

        switch (ip6_multipath_hash_policy(net)) {
        case 0:
                /* L3-only hash: addresses, flow label, protocol. */
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (skb) {
                        ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
                } else {
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                break;
        case 1:
                /* L4 hash: addresses, ports, protocol. */
                if (skb) {
                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
                        struct flow_keys keys;

                        /* short-circuit if we already have L4 hash present */
                        if (skb->l4_hash)
                                return skb_get_hash_raw(skb) >> 1;

                        memset(&hash_keys, 0, sizeof(hash_keys));

                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
                        hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
                        hash_keys.ports.src = flkeys->ports.src;
                        hash_keys.ports.dst = flkeys->ports.dst;
                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                } else {
                        memset(&hash_keys, 0, sizeof(hash_keys));
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.ports.src = fl6->fl6_sport;
                        hash_keys.ports.dst = fl6->fl6_dport;
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                break;
        }
        mhash = flow_hash_from_keys(&hash_keys);

        return mhash >> 1;
}
2042
/* Route an incoming skb: build a flowi6 from its IPv6 header, run the
 * input-path lookup, and attach the resulting dst to the skb.  For
 * ICMPv6 packets a multipath hash is precomputed so errors follow the
 * same nexthop as the flow they refer to.
 */
void ip6_route_input(struct sk_buff *skb)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct ip_tunnel_info *tun_info;
        struct flowi6 fl6 = {
                .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_mark = skb->mark,
                .flowi6_proto = iph->nexthdr,
        };
        struct flow_keys *flkeys = NULL, _flkeys;

        /* Packets decapsulated from a collect-md tunnel carry the tunnel
         * id in their metadata dst; make it part of the flow key.
         */
        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

        if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
                flkeys = &_flkeys;

        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
                fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
        /* Replace any metadata dst with the real route. */
        skb_dst_drop(skb);
        skb_dst_set(skb,
                    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2072
2073 static struct rt6_info *ip6_pol_route_output(struct net *net,
2074                                              struct fib6_table *table,
2075                                              struct flowi6 *fl6,
2076                                              const struct sk_buff *skb,
2077                                              int flags)
2078 {
2079         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2080 }
2081
/* Output-path route lookup entry point.
 *
 * Multicast/link-local destinations are first offered to an L3 master
 * device (VRF) link-scope lookup.  Lookup flags are then derived from
 * the socket binding and the flow's source address before the policy
 * lookup runs.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
                                         struct flowi6 *fl6, int flags)
{
        bool any_src;

        if (ipv6_addr_type(&fl6->daddr) &
            (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
                struct dst_entry *dst;

                dst = l3mdev_link_scope_lookup(net, fl6);
                if (dst)
                        return dst;
        }

        /* Locally generated traffic nominally enters via loopback. */
        fl6->flowi6_iif = LOOPBACK_IFINDEX;

        any_src = ipv6_addr_any(&fl6->saddr);
        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
            (fl6->flowi6_oif && any_src))
                flags |= RT6_LOOKUP_F_IFACE;

        if (!any_src)
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        else if (sk)
                /* Honour the socket's IPV6_ADDR_PREFERENCES settings. */
                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

        return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2111
/* Convert @dst_orig into a blackhole route: a copy that keeps the
 * original's addressing/metrics but discards every packet.  Used e.g.
 * by xfrm when a route must be temporarily unusable.  Consumes the
 * caller's reference on @dst_orig; returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
        struct net_device *loopback_dev = net->loopback_dev;
        struct dst_entry *new = NULL;

        rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
                       DST_OBSOLETE_DEAD, 0);
        if (rt) {
                rt6_info_init(rt);
                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

                new = &rt->dst;
                new->__use = 1;
                /* Silently drop everything in both directions. */
                new->input = dst_discard;
                new->output = dst_discard_out;

                dst_copy_metrics(new, &ort->dst);

                rt->rt6i_idev = in6_dev_get(loopback_dev);
                rt->rt6i_gateway = ort->rt6i_gateway;
                /* Not a per-cpu copy even if the original was one. */
                rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
        }

        dst_release(dst_orig);
        return new ? new : ERR_PTR(-ENOMEM);
}
2144
2145 /*
2146  *      Destination cache support functions
2147  */
2148
2149 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2150 {
2151         u32 rt_cookie = 0;
2152
2153         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2154                 return false;
2155
2156         if (fib6_check_expired(f6i))
2157                 return false;
2158
2159         return true;
2160 }
2161
2162 static struct dst_entry *rt6_check(struct rt6_info *rt,
2163                                    struct fib6_info *from,
2164                                    u32 cookie)
2165 {
2166         u32 rt_cookie = 0;
2167
2168         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2169             rt_cookie != cookie)
2170                 return NULL;
2171
2172         if (rt6_check_expired(rt))
2173                 return NULL;
2174
2175         return &rt->dst;
2176 }
2177
2178 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2179                                             struct fib6_info *from,
2180                                             u32 cookie)
2181 {
2182         if (!__rt6_check_expired(rt) &&
2183             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2184             fib6_check(from, cookie))
2185                 return &rt->dst;
2186         else
2187                 return NULL;
2188 }
2189
/* dst_ops->check() for IPv6: decide whether a cached dst is still
 * valid for @cookie.  Dispatches to the pcpu/uncached validator or the
 * plain one depending on how the rt6_info is owned.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct dst_entry *dst_ret;
        struct fib6_info *from;
        struct rt6_info *rt;

        rt = container_of(dst, struct rt6_info, dst);

        rcu_read_lock();

        /* All IPV6 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         */

        from = rcu_dereference(rt->from);

        if (from && (rt->rt6i_flags & RTF_PCPU ||
            unlikely(!list_empty(&rt->rt6i_uncached))))
                dst_ret = rt6_dst_from_check(rt, from, cookie);
        else
                dst_ret = rt6_check(rt, from, cookie);

        rcu_read_unlock();

        return dst_ret;
}
2217
/* dst_ops->negative_advice(): the caller reports the dst performed
 * badly.  Expired cached exceptions are unlinked; non-cached dsts are
 * simply released.  Returns the dst if it should be kept, NULL if the
 * caller must drop it.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *) dst;

        if (rt) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        rcu_read_lock();
                        if (rt6_check_expired(rt)) {
                                /* Unlink from the exception table; the
                                 * caller's reference is dropped via NULL.
                                 */
                                rt6_remove_exception_rt(rt);
                                dst = NULL;
                        }
                        rcu_read_unlock();
                } else {
                        dst_release(dst);
                        dst = NULL;
                }
        }
        return dst;
}
2237
/* dst_ops->link_failure(): the nexthop is unreachable.  Send an ICMPv6
 * address-unreachable error back, drop a cached exception clone, or —
 * for a default route — invalidate the fib node's serial number so the
 * next lookup re-selects a router.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
        struct rt6_info *rt;

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

        rt = (struct rt6_info *) skb_dst(skb);
        if (rt) {
                rcu_read_lock();
                if (rt->rt6i_flags & RTF_CACHE) {
                        rt6_remove_exception_rt(rt);
                } else {
                        struct fib6_info *from;
                        struct fib6_node *fn;

                        from = rcu_dereference(rt->from);
                        if (from) {
                                fn = rcu_dereference(from->fib6_node);
                                /* Force cookie mismatch on cached dsts so
                                 * they are revalidated.
                                 */
                                if (fn && (rt->rt6i_flags & RTF_DEFAULT))
                                        fn->fn_sernum = -1;
                        }
                }
                rcu_read_unlock();
        }
}
2263
/* Arm (or re-arm) the expiry timer on @rt0.  If the route did not
 * already expire on its own, inherit the parent fib entry's expiry
 * first so dst_set_expires() only ever shortens the deadline.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
        if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
                struct fib6_info *from;

                rcu_read_lock();
                from = rcu_dereference(rt0->from);
                if (from)
                        rt0->dst.expires = from->expires;
                rcu_read_unlock();
        }

        dst_set_expires(&rt0->dst, timeout);
        rt0->rt6i_flags |= RTF_EXPIRES;
}
2279
2280 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2281 {
2282         struct net *net = dev_net(rt->dst.dev);
2283
2284         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2285         rt->rt6i_flags |= RTF_MODIFIED;
2286         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2287 }
2288
2289 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2290 {
2291         return !(rt->rt6i_flags & RTF_CACHE) &&
2292                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2293 }
2294
2295 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2296                                  const struct ipv6hdr *iph, u32 mtu)
2297 {
2298         const struct in6_addr *daddr, *saddr;
2299         struct rt6_info *rt6 = (struct rt6_info *)dst;
2300
2301         if (dst_metric_locked(dst, RTAX_MTU))
2302                 return;
2303
2304         if (iph) {
2305                 daddr = &iph->daddr;
2306                 saddr = &iph->saddr;
2307         } else if (sk) {
2308                 daddr = &sk->sk_v6_daddr;
2309                 saddr = &inet6_sk(sk)->saddr;
2310         } else {
2311                 daddr = NULL;
2312                 saddr = NULL;
2313         }
2314         dst_confirm_neigh(dst, daddr);
2315         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2316         if (mtu >= dst_mtu(dst))
2317                 return;
2318
2319         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2320                 rt6_do_update_pmtu(rt6, mtu);
2321                 /* update rt6_ex->stamp for cache */
2322                 if (rt6->rt6i_flags & RTF_CACHE)
2323                         rt6_update_exception_stamp_rt(rt6);
2324         } else if (daddr) {
2325                 struct fib6_info *from;
2326                 struct rt6_info *nrt6;
2327
2328                 rcu_read_lock();
2329                 from = rcu_dereference(rt6->from);
2330                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2331                 if (nrt6) {
2332                         rt6_do_update_pmtu(nrt6, mtu);
2333                         if (rt6_insert_exception(nrt6, from))
2334                                 dst_release_immediate(&nrt6->dst);
2335                 }
2336                 rcu_read_unlock();
2337         }
2338 }
2339
2340 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2341                                struct sk_buff *skb, u32 mtu)
2342 {
2343         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2344 }
2345
/* Apply a learned path MTU for the flow described by the IPv6 header
 * embedded in @skb (e.g. from an ICMPv6 Packet Too Big).  @mtu is in
 * network byte order, hence the ntohl() before use.  Looks the route
 * up fresh so the update lands on the right dst.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
                     int oif, u32 mark, kuid_t uid)
{
        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
        struct dst_entry *dst;
        struct flowi6 fl6 = {
                .flowi6_oif = oif,
                .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_uid = uid,
        };

        dst = ip6_route_output(net, NULL, &fl6);
        if (!dst->error)
                __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
        dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2366
/* Socket-aware PMTU update: apply the new MTU for @sk's flow, then, if
 * the socket's cached dst was invalidated by the update, refresh the
 * datagram socket's route (v4-mapped destinations are handled by the
 * IPv4 path and skipped here).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
        int oif = sk->sk_bound_dev_if;
        struct dst_entry *dst;

        /* Unbound socket: fall back to the L3 master device, if any. */
        if (!oif && skb->dev)
                oif = l3mdev_master_ifindex(skb->dev);

        ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

        dst = __sk_dst_get(sk);
        if (!dst || !dst->obsolete ||
            dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
                return;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
                ip6_datagram_dst_update(sk, false);
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2388
/* Cache @dst on @sk, also recording which flow addresses the dst was
 * keyed on: an address is passed to ip6_dst_store() only when the
 * flow's address matches the socket's, i.e. the dst is valid for the
 * socket's own flow (source matching only with subtree routing).
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
                           const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
        struct ipv6_pinfo *np = inet6_sk(sk);
#endif

        ip6_dst_store(sk, dst,
                      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
                      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
                      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
                      &np->saddr :
#endif
                      NULL);
}
2405
/* Handle redirects */

/* Flow descriptor for redirect processing: extends flowi6 with the
 * address of the router that sent the redirect, so the lookup can
 * verify the redirect came from the next hop in use.  Passed to
 * fib6_rule_lookup() as a flowi6 and downcast in __ip6_route_redirect().
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;
};
2411
2412 static struct rt6_info *__ip6_route_redirect(struct net *net,
2413                                              struct fib6_table *table,
2414                                              struct flowi6 *fl6,
2415                                              const struct sk_buff *skb,
2416                                              int flags)
2417 {
2418         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2419         struct rt6_info *ret = NULL, *rt_cache;
2420         struct fib6_info *rt;
2421         struct fib6_node *fn;
2422
2423         /* Get the "current" route for this destination and
2424          * check if the redirect has come from appropriate router.
2425          *
2426          * RFC 4861 specifies that redirects should only be
2427          * accepted if they come from the nexthop to the target.
2428          * Due to the way the routes are chosen, this notion
2429          * is a bit fuzzy and one might need to check all possible
2430          * routes.
2431          */
2432
2433         rcu_read_lock();
2434         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2435 restart:
2436         for_each_fib6_node_rt_rcu(fn) {
2437                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2438                         continue;
2439                 if (fib6_check_expired(rt))
2440                         continue;
2441                 if (rt->fib6_flags & RTF_REJECT)
2442                         break;
2443                 if (!(rt->fib6_flags & RTF_GATEWAY))
2444                         continue;
2445                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2446                         continue;
2447                 /* rt_cache's gateway might be different from its 'parent'
2448                  * in the case of an ip redirect.
2449                  * So we keep searching in the exception table if the gateway
2450                  * is different.
2451                  */
2452                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2453                         rt_cache = rt6_find_cached_rt(rt,
2454                                                       &fl6->daddr,
2455                                                       &fl6->saddr);
2456                         if (rt_cache &&
2457                             ipv6_addr_equal(&rdfl->gateway,
2458                                             &rt_cache->rt6i_gateway)) {
2459                                 ret = rt_cache;
2460                                 break;
2461                         }
2462                         continue;
2463                 }
2464                 break;
2465         }
2466
2467         if (!rt)
2468                 rt = net->ipv6.fib6_null_entry;
2469         else if (rt->fib6_flags & RTF_REJECT) {
2470                 ret = net->ipv6.ip6_null_entry;
2471                 goto out;
2472         }
2473
2474         if (rt == net->ipv6.fib6_null_entry) {
2475                 fn = fib6_backtrack(fn, &fl6->saddr);
2476                 if (fn)
2477                         goto restart;
2478         }
2479
2480 out:
2481         if (ret)
2482                 ip6_hold_safe(net, &ret);
2483         else
2484                 ret = ip6_create_rt_rcu(rt);
2485
2486         rcu_read_unlock();
2487
2488         trace_fib6_table_lookup(net, rt, table, fl6);
2489         return ret;
2490 };
2491
2492 static struct dst_entry *ip6_route_redirect(struct net *net,
2493                                             const struct flowi6 *fl6,
2494                                             const struct sk_buff *skb,
2495                                             const struct in6_addr *gateway)
2496 {
2497         int flags = RT6_LOOKUP_F_HAS_SADDR;
2498         struct ip6rd_flowi rdfl;
2499
2500         rdfl.fl6 = *fl6;
2501         rdfl.gateway = *gateway;
2502
2503         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2504                                 flags, __ip6_route_redirect);
2505 }
2506
/* Process an ICMPv6 redirect for the flow described by the IPv6
 * header embedded at skb->data; the packet's outer source address is
 * the redirecting router.  Looks up the matching route and applies the
 * redirect to it.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
                  kuid_t uid)
{
        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
        struct dst_entry *dst;
        struct flowi6 fl6 = {
                .flowi6_iif = LOOPBACK_IFINDEX,
                .flowi6_oif = oif,
                .flowi6_mark = mark,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_uid = uid,
        };

        dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
        rt6_do_redirect(dst, NULL, skb);
        dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2527
/* Process a redirect whose ICMPv6 message lacks an embedded packet
 * header: take the target from the rd_msg and use our own address
 * (the outer header's destination) as the lookup source.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
        struct dst_entry *dst;
        struct flowi6 fl6 = {
                .flowi6_iif = LOOPBACK_IFINDEX,
                .flowi6_oif = oif,
                .daddr = msg->dest,
                .saddr = iph->daddr,
                .flowi6_uid = sock_net_uid(net, NULL),
        };

        dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
        rt6_do_redirect(dst, NULL, skb);
        dst_release(dst);
}
2545
2546 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2547 {
2548         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2549                      sk->sk_uid);
2550 }
2551 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2552
2553 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2554 {
2555         struct net_device *dev = dst->dev;
2556         unsigned int mtu = dst_mtu(dst);
2557         struct net *net = dev_net(dev);
2558
2559         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2560
2561         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2562                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2563
2564         /*
2565          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2566          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2567          * IPV6_MAXPLEN is also valid and means: "any MSS,
2568          * rely only on pmtu discovery"
2569          */
2570         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2571                 mtu = IPV6_MAXPLEN;
2572         return mtu;
2573 }
2574
2575 static unsigned int ip6_mtu(const struct dst_entry *dst)
2576 {
2577         struct inet6_dev *idev;
2578         unsigned int mtu;
2579
2580         mtu = dst_metric_raw(dst, RTAX_MTU);
2581         if (mtu)
2582                 goto out;
2583
2584         mtu = IPV6_MIN_MTU;
2585
2586         rcu_read_lock();
2587         idev = __in6_dev_get(dst->dev);
2588         if (idev)
2589                 mtu = idev->cnf.mtu6;
2590         rcu_read_unlock();
2591
2592 out:
2593         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2594
2595         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2596 }
2597
2598 /* MTU selection:
2599  * 1. mtu on route is locked - use it
2600  * 2. mtu from nexthop exception
2601  * 3. mtu from egress device
2602  *
2603  * based on ip6_dst_mtu_forward and exception logic of
2604  * rt6_find_cached_rt; called with rcu_read_lock
2605  */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* 1. a locked MTU metric on the route wins unconditionally */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* the source address participates in the exception lookup only
	 * for source-specific (subtree) routes
	 */
	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* 2. MTU from a matching, non-expired PMTU exception entry */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* 3. fall back to the nexthop device's IPv6 MTU */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* account for lightweight-tunnel encapsulation headroom */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2645
/* Allocate an uncached host-route dst for sending an ICMPv6 packet to
 * fl6->daddr via @dev.  The result is run through xfrm_lookup() and must
 * be released by the caller; on failure an ERR_PTR is returned.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the idev reference taken above; rt never owned it */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* ownership of the idev reference transfers to rt here */
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2684
/* dst_ops garbage collector for IPv6.  Triggers fib6 GC once the entry
 * count exceeds ip6_rt_max_size or the minimum interval has elapsed;
 * returns nonzero while still over the limit so the dst core rejects
 * new allocations.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* each round under pressure makes the next GC pass more aggressive */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponentially decay aggressiveness between invocations */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2709
2710 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2711                                             struct fib6_config *cfg,
2712                                             const struct in6_addr *gw_addr,
2713                                             u32 tbid, int flags)
2714 {
2715         struct flowi6 fl6 = {
2716                 .flowi6_oif = cfg->fc_ifindex,
2717                 .daddr = *gw_addr,
2718                 .saddr = cfg->fc_prefsrc,
2719         };
2720         struct fib6_table *table;
2721         struct rt6_info *rt;
2722
2723         table = fib6_get_table(net, tbid);
2724         if (!table)
2725                 return NULL;
2726
2727         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2728                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2729
2730         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2731         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2732
2733         /* if table lookup failed, fall back to full lookup */
2734         if (rt == net->ipv6.ip6_null_entry) {
2735                 ip6_rt_put(rt);
2736                 rt = NULL;
2737         }
2738
2739         return rt;
2740 }
2741
/* Validate an RTNH_F_ONLINK nexthop: the gateway must not resolve (in
 * @dev's FIB table) to a local/anycast/reject route or to a different
 * egress device.  A match on the default route is ignored.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		/* grt->from is RCU-protected */
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2774
/* Resolve the egress device for a gateway nexthop configured without
 * RTNH_F_ONLINK.  On success *_dev/*idev are filled in (with references
 * held) when they were not supplied; returns 0 only if the gateway is
 * directly reachable (not itself behind another gateway).
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* prefer a lookup confined to the user-specified table */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard matches behind a gateway or on the
			 * wrong device; fall through to a full lookup
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device/idev from the resolved route, taking refs */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2825
/* Validate cfg->fc_gateway for a new route: reject local gateway
 * addresses, restrict non-link-local nexthops, and resolve/verify the
 * egress device.  *_dev and *idev may be updated (with references held)
 * by ip6_route_check_nh().
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* link-local gateways are checked across all devices */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2898
/* Build a fib6_info from a netlink/ioctl route configuration.  Validates
 * the config, resolves the egress device and gateway, and returns the
 * new (not yet inserted) fib6_info, or an ERR_PTR on failure.  Device
 * and idev references are transferred to the returned route.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		/* take device + inet6_dev references; released on error */
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may update dev/idev (with references) */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* dev reference is handed over to the route's nexthop */
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3121
3122 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3123                   struct netlink_ext_ack *extack)
3124 {
3125         struct fib6_info *rt;
3126         int err;
3127
3128         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3129         if (IS_ERR(rt))
3130                 return PTR_ERR(rt);
3131
3132         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3133         fib6_info_release(rt);
3134
3135         return err;
3136 }
3137
3138 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3139 {
3140         struct net *net = info->nl_net;
3141         struct fib6_table *table;
3142         int err;
3143
3144         if (rt == net->ipv6.fib6_null_entry) {
3145                 err = -ENOENT;
3146                 goto out;
3147         }
3148
3149         table = rt->fib6_table;
3150         spin_lock_bh(&table->tb6_lock);
3151         err = fib6_del(rt, info);
3152         spin_unlock_bh(&table->tb6_lock);
3153
3154 out:
3155         fib6_info_release(rt);
3156         return err;
3157 }
3158
3159 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3160 {
3161         struct nl_info info = { .nl_net = net };
3162
3163         return __ip6_del_rt(rt, &info);
3164 }
3165
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings, sending a single RTM_DELROUTE notification covering every
 * hop when possible.  Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* combined skb replaces per-hop notifies */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* notification is sent outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3217
3218 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3219 {
3220         int rc = -ESRCH;
3221
3222         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3223                 goto out;
3224
3225         if (cfg->fc_flags & RTF_GATEWAY &&
3226             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3227                 goto out;
3228
3229         rc = rt6_remove_exception_rt(rt);
3230 out:
3231         return rc;
3232 }
3233
/* Delete the route(s) matching @cfg.  With RTF_CACHE set, only cached
 * exception entries are targeted; otherwise the first fib6_info matching
 * the ifindex/gateway/metric/protocol filters is removed.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* exact_match only when not targeting the exception cache */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* take a reference before leaving the RCU section */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3298
3299 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3300 {
3301         struct netevent_redirect netevent;
3302         struct rt6_info *rt, *nrt = NULL;
3303         struct ndisc_options ndopts;
3304         struct inet6_dev *in6_dev;
3305         struct neighbour *neigh;
3306         struct fib6_info *from;
3307         struct rd_msg *msg;
3308         int optlen, on_link;
3309         u8 *lladdr;
3310
3311         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3312         optlen -= sizeof(*msg);
3313
3314         if (optlen < 0) {
3315                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3316                 return;
3317         }
3318
3319         msg = (struct rd_msg *)icmp6_hdr(skb);
3320
3321         if (ipv6_addr_is_multicast(&msg->dest)) {
3322                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3323                 return;
3324         }
3325
3326         on_link = 0;
3327         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3328                 on_link = 1;
3329         } else if (ipv6_addr_type(&msg->target) !=
3330                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3331                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3332                 return;
3333         }
3334
3335         in6_dev = __in6_dev_get(skb->dev);
3336         if (!in6_dev)
3337                 return;
3338         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3339                 return;
3340
3341         /* RFC2461 8.1:
3342          *      The IP source address of the Redirect MUST be the same as the current
3343          *      first-hop router for the specified ICMP Destination Address.
3344          */
3345
3346         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3347                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3348                 return;
3349         }
3350
3351         lladdr = NULL;
3352         if (ndopts.nd_opts_tgt_lladdr) {
3353                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3354                                              skb->dev);
3355                 if (!lladdr) {
3356                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3357                         return;
3358                 }
3359         }
3360
3361         rt = (struct rt6_info *) dst;
3362         if (rt->rt6i_flags & RTF_REJECT) {
3363                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3364                 return;
3365         }
3366
3367         /* Redirect received -> path was valid.
3368          * Look, redirects are sent only in response to data packets,
3369          * so that this nexthop apparently is reachable. --ANK
3370          */
3371         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3372
3373         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3374         if (!neigh)
3375                 return;
3376
3377         /*
3378          *      We have finally decided to accept it.
3379          */
3380
3381         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3382                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3383                      NEIGH_UPDATE_F_OVERRIDE|
3384                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3385                                      NEIGH_UPDATE_F_ISROUTER)),
3386                      NDISC_REDIRECT, &ndopts);
3387
3388         rcu_read_lock();
3389         from = rcu_dereference(rt->from);
3390         /* This fib6_info_hold() is safe here because we hold reference to rt
3391          * and rt already holds reference to fib6_info.
3392          */
3393         fib6_info_hold(from);
3394         rcu_read_unlock();
3395
3396         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3397         if (!nrt)
3398                 goto out;
3399
3400         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3401         if (on_link)
3402                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3403
3404         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3405
3406         /* No need to remove rt from the exception table if rt is
3407          * a cached route because rt6_insert_exception() will
3408          * takes care of it
3409          */
3410         if (rt6_insert_exception(nrt, from)) {
3411                 dst_release_immediate(&nrt->dst);
3412                 goto out;
3413         }
3414
3415         netevent.old = &rt->dst;
3416         netevent.new = &nrt->dst;
3417         netevent.daddr = &msg->dest;
3418         netevent.neigh = neigh;
3419         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3420
3421 out:
3422         fib6_info_release(from);
3423         neigh_release(neigh);
3424 }
3425
3426 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a route learned from an RA Route Information option
 * (RTF_ROUTEINFO) for prefix/prefixlen via gateway @gwaddr on @dev.
 *
 * Returns the matching entry with a reference held, or NULL if the
 * table/node does not exist or no entry matches.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* route-info routes live in RT6_TABLE_INFO unless the device is
	 * enslaved to an l3mdev (VRF), whose table takes precedence
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* exact-match locate of the prefix node; no source subtree */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* skip entries whose refcount already hit zero */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3462
3463 static struct fib6_info *rt6_add_route_info(struct net *net,
3464                                            const struct in6_addr *prefix, int prefixlen,
3465                                            const struct in6_addr *gwaddr,
3466                                            struct net_device *dev,
3467                                            unsigned int pref)
3468 {
3469         struct fib6_config cfg = {
3470                 .fc_metric      = IP6_RT_PRIO_USER,
3471                 .fc_ifindex     = dev->ifindex,
3472                 .fc_dst_len     = prefixlen,
3473                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3474                                   RTF_UP | RTF_PREF(pref),
3475                 .fc_protocol = RTPROT_RA,
3476                 .fc_type = RTN_UNICAST,
3477                 .fc_nlinfo.portid = 0,
3478                 .fc_nlinfo.nlh = NULL,
3479                 .fc_nlinfo.nl_net = net,
3480         };
3481
3482         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3483         cfg.fc_dst = *prefix;
3484         cfg.fc_gateway = *gwaddr;
3485
3486         /* We should treat it as a default route if prefix length is 0. */
3487         if (!prefixlen)
3488                 cfg.fc_flags |= RTF_DEFAULT;
3489
3490         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3491
3492         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3493 }
3494 #endif
3495
/* Find the RA-learned (RTF_ADDRCONF) default route via gateway @addr
 * on @dev.  Returns the entry with a reference held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* VRF slave devices keep their default routes in the l3mdev table */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* rt is NULL if the walk ran off the list; otherwise take a
	 * reference unless the entry is already being released
	 */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3520
3521 struct fib6_info *rt6_add_dflt_router(struct net *net,
3522                                      const struct in6_addr *gwaddr,
3523                                      struct net_device *dev,
3524                                      unsigned int pref)
3525 {
3526         struct fib6_config cfg = {
3527                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3528                 .fc_metric      = IP6_RT_PRIO_USER,
3529                 .fc_ifindex     = dev->ifindex,
3530                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3531                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3532                 .fc_protocol = RTPROT_RA,
3533                 .fc_type = RTN_UNICAST,
3534                 .fc_nlinfo.portid = 0,
3535                 .fc_nlinfo.nlh = NULL,
3536                 .fc_nlinfo.nl_net = net,
3537         };
3538
3539         cfg.fc_gateway = *gwaddr;
3540
3541         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3542                 struct fib6_table *table;
3543
3544                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3545                 if (table)
3546                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3547         }
3548
3549         return rt6_get_dflt_router(net, gwaddr, dev);
3550 }
3551
/* Delete every RA-learned default router from @table.
 *
 * ip6_del_rt() cannot be called under rcu_read_lock(), so each hit
 * drops the lock and restarts the walk from the top; the restart is
 * required because the tree may change while the lock is dropped.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		/* accept_ra == 2 honours RAs even with forwarding on;
		 * keep those routers
		 */
		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3575
3576 void rt6_purge_dflt_routers(struct net *net)
3577 {
3578         struct fib6_table *table;
3579         struct hlist_head *head;
3580         unsigned int h;
3581
3582         rcu_read_lock();
3583
3584         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3585                 head = &net->ipv6.fib_table_hash[h];
3586                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3587                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3588                                 __rt6_purge_dflt_routers(net, table);
3589                 }
3590         }
3591
3592         rcu_read_unlock();
3593 }
3594
3595 static void rtmsg_to_fib6_config(struct net *net,
3596                                  struct in6_rtmsg *rtmsg,
3597                                  struct fib6_config *cfg)
3598 {
3599         *cfg = (struct fib6_config){
3600                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3601                          : RT6_TABLE_MAIN,
3602                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3603                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3604                 .fc_expires = rtmsg->rtmsg_info,
3605                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3606                 .fc_src_len = rtmsg->rtmsg_src_len,
3607                 .fc_flags = rtmsg->rtmsg_flags,
3608                 .fc_type = rtmsg->rtmsg_type,
3609
3610                 .fc_nlinfo.nl_net = net,
3611
3612                 .fc_dst = rtmsg->rtmsg_dst,
3613                 .fc_src = rtmsg->rtmsg_src,
3614                 .fc_gateway = rtmsg->rtmsg_gateway,
3615         };
3616 }
3617
3618 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3619 {
3620         struct fib6_config cfg;
3621         struct in6_rtmsg rtmsg;
3622         int err;
3623
3624         switch (cmd) {
3625         case SIOCADDRT:         /* Add a route */
3626         case SIOCDELRT:         /* Delete a route */
3627                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3628                         return -EPERM;
3629                 err = copy_from_user(&rtmsg, arg,
3630                                      sizeof(struct in6_rtmsg));
3631                 if (err)
3632                         return -EFAULT;
3633
3634                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3635
3636                 rtnl_lock();
3637                 switch (cmd) {
3638                 case SIOCADDRT:
3639                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3640                         break;
3641                 case SIOCDELRT:
3642                         err = ip6_route_del(&cfg, NULL);
3643                         break;
3644                 default:
3645                         err = -EINVAL;
3646                 }
3647                 rtnl_unlock();
3648
3649                 return err;
3650         }
3651
3652         return -EINVAL;
3653 }
3654
3655 /*
3656  *      Drop the packet on the floor
3657  */
3658
/* Drop @skb with ICMPv6 destination-unreachable @code, bumping the
 * appropriate SNMP MIB counter for the in/out "no route" direction.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as an address
			 * error, not a missing route
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	/* notify the sender, then free the packet */
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3682
/* dst input handler for blackhole/unreachable routes on the RX path */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3687
/* dst output handler for blackhole/unreachable routes on the TX path */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3693
/* dst input handler for RTN_PROHIBIT routes on the RX path */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3698
/* dst output handler for RTN_PROHIBIT routes on the TX path */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3704
3705 /*
3706  *      Allocate a dst for local (unicast / anycast) address.
3707  */
3708
3709 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3710                                      struct inet6_dev *idev,
3711                                      const struct in6_addr *addr,
3712                                      bool anycast, gfp_t gfp_flags)
3713 {
3714         struct fib6_config cfg = {
3715                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3716                 .fc_ifindex = idev->dev->ifindex,
3717                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3718                 .fc_dst = *addr,
3719                 .fc_dst_len = 128,
3720                 .fc_protocol = RTPROT_KERNEL,
3721                 .fc_nlinfo.nl_net = net,
3722                 .fc_ignore_dev_down = true,
3723         };
3724
3725         if (anycast) {
3726                 cfg.fc_type = RTN_ANYCAST;
3727                 cfg.fc_flags |= RTF_ANYCAST;
3728         } else {
3729                 cfg.fc_type = RTN_LOCAL;
3730                 cfg.fc_flags |= RTF_LOCAL;
3731         }
3732
3733         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3734 }
3735
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict the match to this device; NULL matches any */
	struct net *net;
	struct in6_addr *addr;	/* the preferred-source address being removed */
};
3742
3743 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3744 {
3745         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3746         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3747         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3748
3749         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3750             rt != net->ipv6.fib6_null_entry &&
3751             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3752                 spin_lock_bh(&rt6_exception_lock);
3753                 /* remove prefsrc entry */
3754                 rt->fib6_prefsrc.plen = 0;
3755                 spin_unlock_bh(&rt6_exception_lock);
3756         }
3757         return 0;
3758 }
3759
/* An address @ifp is going away: scrub it from every route's prefsrc */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3770
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host.
 * fib6_clean_all() callback: returning -1 requests deletion of @rt.
 */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* RA-learned default route via this gateway -> delete it */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3791
/* A neighbour stopped being a router: drop routes using it as gateway */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3796
/* Argument for the netdev-event fib walkers below */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* used by fib6_ifup (RTNH_F_* to clear) */
		unsigned long event;	/* used by fib6_ifdown (NETDEV_* event) */
	};
};
3804
/* Return the first ECMP-eligible sibling of @rt with the same metric,
 * scanning the fib6_node's leaf list from the head.  Called with the
 * table lock held (the lockdep_is_held() annotations document that).
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3824
3825 static bool rt6_is_dead(const struct fib6_info *rt)
3826 {
3827         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3828             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3829              fib6_ignore_linkdown(rt)))
3830                 return true;
3831
3832         return false;
3833 }
3834
3835 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3836 {
3837         struct fib6_info *iter;
3838         int total = 0;
3839
3840         if (!rt6_is_dead(rt))
3841                 total += rt->fib6_nh.nh_weight;
3842
3843         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3844                 if (!rt6_is_dead(iter))
3845                         total += iter->fib6_nh.nh_weight;
3846         }
3847
3848         return total;
3849 }
3850
/* Assign the hash upper bound for one nexthop.  *weight accumulates
 * the running weight across siblings; the bound is the cumulative
 * fraction of @total scaled to 31-bit fixed point.  Dead nexthops get
 * -1 so they never match a hash.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	/* atomic: read concurrently by the datapath hash selection */
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3862
3863 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3864 {
3865         struct fib6_info *iter;
3866         int weight = 0;
3867
3868         rt6_upper_bound_set(rt, &weight, total);
3869
3870         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3871                 rt6_upper_bound_set(iter, &weight, total);
3872 }
3873
/* Recompute the ECMP hash-threshold bounds of the multipath route
 * containing @rt after a nexthop changed state.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3897
3898 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3899 {
3900         const struct arg_netdev_event *arg = p_arg;
3901         struct net *net = dev_net(arg->dev);
3902
3903         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3904                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3905                 fib6_update_sernum_upto_root(net, rt);
3906                 rt6_multipath_rebalance(rt);
3907         }
3908
3909         return 0;
3910 }
3911
/* Device came (back) up: clear @nh_flags on all its routes */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	/* reviving a dead nexthop while the carrier is up should also
	 * clear its LINKDOWN mark (fib6_ifup clears all bits in nh_flags)
	 */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3926
3927 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3928                                    const struct net_device *dev)
3929 {
3930         struct fib6_info *iter;
3931
3932         if (rt->fib6_nh.nh_dev == dev)
3933                 return true;
3934         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3935                 if (iter->fib6_nh.nh_dev == dev)
3936                         return true;
3937
3938         return false;
3939 }
3940
3941 static void rt6_multipath_flush(struct fib6_info *rt)
3942 {
3943         struct fib6_info *iter;
3944
3945         rt->should_flush = 1;
3946         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3947                 iter->should_flush = 1;
3948 }
3949
3950 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3951                                              const struct net_device *down_dev)
3952 {
3953         struct fib6_info *iter;
3954         unsigned int dead = 0;
3955
3956         if (rt->fib6_nh.nh_dev == down_dev ||
3957             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3958                 dead++;
3959         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3960                 if (iter->fib6_nh.nh_dev == down_dev ||
3961                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3962                         dead++;
3963
3964         return dead;
3965 }
3966
3967 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3968                                        const struct net_device *dev,
3969                                        unsigned int nh_flags)
3970 {
3971         struct fib6_info *iter;
3972
3973         if (rt->fib6_nh.nh_dev == dev)
3974                 rt->fib6_nh.nh_flags |= nh_flags;
3975         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976                 if (iter->fib6_nh.nh_dev == dev)
3977                         iter->fib6_nh.nh_flags |= nh_flags;
3978 }
3979
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for rt6_sync_down_dev().  Return values:
 *   -1: delete this route
 *   -2: delete this route but skip the delete notification
 *    0: keep it
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away: remove every route through it */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* single-nexthop route: delete iff it uses this device */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* if this leaves no live nexthop, flush the whole
			 * multipath route; otherwise just mark the affected
			 * nexthops dead and rebalance
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: flag link-down, but local/anycast routes
		 * are never marked down
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4023
/* Propagate a NETDEV_* down-type @event for @dev into the FIB */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	/* sysctl choice: suppress per-route delete notifications on
	 * device down to avoid netlink storms
	 */
	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}
4039
/* Tear down all IPv6 state attached to @dev: FIB routes, uncached
 * (pcpu/exception) dsts, and neighbour entries, in that order.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4046
/* Argument for rt6_mtu_change_route(): device whose MTU changed and
 * its new MTU value.
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};
4051
/* fib6_clean_all() callback: propagate a device MTU change into the
 * route's RTAX_MTU metric and its cached exception routes.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink to the new device MTU, or grow a PMTU that was
		 * tracking the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* cached exception routes carry their own PMTU */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4086
4087 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4088 {
4089         struct rt6_mtu_change_arg arg = {
4090                 .dev = dev,
4091                 .mtu = mtu,
4092         };
4093
4094         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4095 }
4096
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE;
 * consumed by nlmsg_parse() in rtm_to_fib6_config() below.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4116
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config.  Returns 0 on success or a negative errno, with
 * parse failures reported through @extack where available.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	/* start from the rtmsg header fields; attributes below override */
	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* error route types all map onto RTF_REJECT internally */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* attribute may carry only prefix-len worth of bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* validated later; keep a pointer into the message */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values degrade to medium (RFC 4191) */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4247
/* Per-nexthop bookkeeping used while building a multipath route from an
 * RTA_MULTIPATH netlink attribute (see ip6_route_multipath_add()).
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* config copy used to delete on unwind */
	struct list_head next;		/* link in the local rt6_nh_list */
};
4253
4254 static int ip6_route_info_append(struct net *net,
4255                                  struct list_head *rt6_nh_list,
4256                                  struct fib6_info *rt,
4257                                  struct fib6_config *r_cfg)
4258 {
4259         struct rt6_nh *nh;
4260         int err = -EEXIST;
4261
4262         list_for_each_entry(nh, rt6_nh_list, next) {
4263                 /* check if fib6_info already exists */
4264                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4265                         return err;
4266         }
4267
4268         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4269         if (!nh)
4270                 return -ENOMEM;
4271         nh->fib6_info = rt;
4272         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4273         list_add_tail(&nh->next, rt6_nh_list);
4274
4275         return 0;
4276 }
4277
4278 static void ip6_route_mpath_notify(struct fib6_info *rt,
4279                                    struct fib6_info *rt_last,
4280                                    struct nl_info *info,
4281                                    __u16 nlflags)
4282 {
4283         /* if this is an APPEND route, then rt points to the first route
4284          * inserted and rt_last points to last route inserted. Userspace
4285          * wants a consistent dump of the route which starts at the first
4286          * nexthop. Since sibling routes are always added at the end of
4287          * the list, find the first sibling of the last route appended
4288          */
4289         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4290                 rt = list_first_entry(&rt_last->fib6_siblings,
4291                                       struct fib6_info,
4292                                       fib6_siblings);
4293         }
4294
4295         if (rt)
4296                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4297 }
4298
/* Install a multipath route described by cfg->fc_mp (an RTA_MULTIPATH
 * attribute payload): build one fib6_info per rtnexthop entry, insert them
 * all, and send a single RTM_NEWROUTE notification covering every nexthop.
 * On a mid-insertion failure, routes already added are deleted again so the
 * operation is all-or-nothing from userspace's point of view.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	/* Mirror the request's semantics in the notification flags. */
	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* Each nexthop starts from the common config and overrides
		 * ifindex/gateway/encap with its own per-hop attributes.
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops is the weight minus one on the wire. */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		/* On success the list owns the reference to rt. */
		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* Release any fib6_info still owned by the list (entries after the
	 * failure point never had their reference dropped above).
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4446
4447 static int ip6_route_multipath_del(struct fib6_config *cfg,
4448                                    struct netlink_ext_ack *extack)
4449 {
4450         struct fib6_config r_cfg;
4451         struct rtnexthop *rtnh;
4452         int remaining;
4453         int attrlen;
4454         int err = 1, last_err = 0;
4455
4456         remaining = cfg->fc_mp_len;
4457         rtnh = (struct rtnexthop *)cfg->fc_mp;
4458
4459         /* Parse a Multipath Entry */
4460         while (rtnh_ok(rtnh, remaining)) {
4461                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4462                 if (rtnh->rtnh_ifindex)
4463                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4464
4465                 attrlen = rtnh_attrlen(rtnh);
4466                 if (attrlen > 0) {
4467                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4468
4469                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4470                         if (nla) {
4471                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4472                                 r_cfg.fc_flags |= RTF_GATEWAY;
4473                         }
4474                 }
4475                 err = ip6_route_del(&r_cfg, extack);
4476                 if (err)
4477                         last_err = err;
4478
4479                 rtnh = rtnh_next(rtnh, &remaining);
4480         }
4481
4482         return last_err;
4483 }
4484
4485 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4486                               struct netlink_ext_ack *extack)
4487 {
4488         struct fib6_config cfg;
4489         int err;
4490
4491         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4492         if (err < 0)
4493                 return err;
4494
4495         if (cfg.fc_mp)
4496                 return ip6_route_multipath_del(&cfg, extack);
4497         else {
4498                 cfg.fc_delete_all_nh = 1;
4499                 return ip6_route_del(&cfg, extack);
4500         }
4501 }
4502
4503 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4504                               struct netlink_ext_ack *extack)
4505 {
4506         struct fib6_config cfg;
4507         int err;
4508
4509         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4510         if (err < 0)
4511                 return err;
4512
4513         if (cfg.fc_metric == 0)
4514                 cfg.fc_metric = IP6_RT_PRIO_USER;
4515
4516         if (cfg.fc_mp)
4517                 return ip6_route_multipath_add(&cfg, extack);
4518         else
4519                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4520 }
4521
/* Upper-bound estimate of the skb space needed to dump @rt as an
 * RTM_NEWROUTE message (used to size the skb in inet6_rt_notify()).
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		/* NOTE(review): every sibling's encap size is estimated
		 * from rt's own nh_lwtstate; assumes siblings never need
		 * more encap space than the first nexthop — confirm.
		 */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	/* Fixed-size attributes rt6_fill_node() may emit for any route. */
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}
4551
/* Emit the nexthop attributes of @rt into @skb and accumulate the RTNH_F_*
 * state into *@flags.  @skip_oif suppresses RTA_OIF for multipath encoding,
 * where the ifindex already lives in the rtnexthop header.
 * Returns 0 on success, -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* A linkdown nexthop is also reported dead when the table
		 * is configured to ignore routes on linkdown devices.
		 */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4590
/* add multipath next hop: append one rtnexthop record (header plus nested
 * attributes) for @rt to the open RTA_MULTIPATH nest in @skb.
 * Returns 0 on success, -EMSGSIZE if the skb ran out of room.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* On the wire rtnh_hops carries the weight minus one. */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	/* skip_oif=true: the ifindex is already in the rtnexthop header. */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4618
/* Build one RTM route message for @rt into @skb.  When @dst is non-NULL the
 * message describes that cached/cloned rt6_info (route-get output); otherwise
 * it describes the FIB entry @rt itself (dump/notify output).  @dest/@src,
 * when set, pin the message to a specific /128 destination/source.
 * Returns 0 on success, -EMSGSIZE on overflow (message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Prefer the cached clone's keys/flags when one was handed in. */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* rtm_table is 8 bits; large ids go in RTA_TABLE below. */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* An explicit destination overrides the FIB prefix (/128). */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast destinations are resolved via the mroute
		 * table; ip6mr_get_route() fills the message itself.
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* Cached routes carry their own metrics; FIB entries share one set. */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4769
4770 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4771                                const struct net_device *dev)
4772 {
4773         if (f6i->fib6_nh.nh_dev == dev)
4774                 return true;
4775
4776         if (f6i->fib6_nsiblings) {
4777                 struct fib6_info *sibling, *next_sibling;
4778
4779                 list_for_each_entry_safe(sibling, next_sibling,
4780                                          &f6i->fib6_siblings, fib6_siblings) {
4781                         if (sibling->fib6_nh.nh_dev == dev)
4782                                 return true;
4783                 }
4784         }
4785
4786         return false;
4787 }
4788
/* Per-route callback for RTM_GETROUTE dumps.  Applies the dump filter to
 * @rt and, if it passes, emits it into the dump skb via rt6_fill_node().
 * Returns 1 when the route is intentionally skipped, 0 or a negative
 * rt6_fill_node() result otherwise.
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	/* Never expose the internal null entry. */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		/* Skip routes that fail any of the requested type/device/
		 * protocol filters; mark emitted entries as filtered.
		 */
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}
4817
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 * Sockets that opted into strict checking get full header/attribute
 * validation; legacy sockets fall back to a plain parse for
 * backward compatibility.  Returns 0 on success or a negative errno
 * with @extack describing the rejection.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* Legacy sockets: parse leniently, no header field checks. */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	/* Strict mode: src/dst lengths must be 0 or a full /128, and all
	 * header fields the kernel ignores on get must be zero.
	 */
	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* An address attribute is only valid with its matching /128 len. */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* Reject any attribute outside the supported get-route set. */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4884
/* RTM_GETROUTE handler: build a flow from the request attributes, perform
 * an input (iif set) or output route lookup, and unicast the resulting
 * route back to the requester.  With RTM_F_FIB_MATCH the matched FIB entry
 * is reported instead of the resolved dst.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* Populate the flow from the optional request attributes. */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		/* Input path: emulate reception on the given interface. */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		/* Output path: locally originated lookup. */
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* The skb now owns the dst reference taken by the lookup. */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5024
/* Notify RTNLGRP_IPV6_ROUTE listeners that @rt was added/deleted/changed.
 * @event: RTM_NEWROUTE or RTM_DELROUTE; @nlm_flags: extra nlmsg flags.
 * Best effort: on allocation or fill failure the multicast group's error
 * state is set via rtnl_set_sk_err() instead of returning an error.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	/* echo the requester's sequence number when driven by a request */
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* gfp_any() picks GFP_ATOMIC/GFP_KERNEL based on current context */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
5055
5056 static int ip6_route_dev_notify(struct notifier_block *this,
5057                                 unsigned long event, void *ptr)
5058 {
5059         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5060         struct net *net = dev_net(dev);
5061
5062         if (!(dev->flags & IFF_LOOPBACK))
5063                 return NOTIFY_OK;
5064
5065         if (event == NETDEV_REGISTER) {
5066                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5067                 net->ipv6.ip6_null_entry->dst.dev = dev;
5068                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5069 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5070                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5071                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5072                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5073                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5074 #endif
5075          } else if (event == NETDEV_UNREGISTER &&
5076                     dev->reg_state != NETREG_UNREGISTERED) {
5077                 /* NETDEV_UNREGISTER could be fired for multiple times by
5078                  * netdev_wait_allrefs(). Make sure we only call this once.
5079                  */
5080                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5081 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5082                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5083                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5084 #endif
5085         }
5086
5087         return NOTIFY_OK;
5088 }
5089
5090 /*
5091  *      /proc
5092  */
5093
5094 #ifdef CONFIG_PROC_FS
5095 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5096 {
5097         struct net *net = (struct net *)seq->private;
5098         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5099                    net->ipv6.rt6_stats->fib_nodes,
5100                    net->ipv6.rt6_stats->fib_route_nodes,
5101                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5102                    net->ipv6.rt6_stats->fib_rt_entries,
5103                    net->ipv6.rt6_stats->fib_rt_cache,
5104                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5105                    net->ipv6.rt6_stats->fib_discarded_routes);
5106
5107         return 0;
5108 }
5109 #endif  /* CONFIG_PROC_FS */
5110
5111 #ifdef CONFIG_SYSCTL
5112
5113 static
5114 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5115                               void __user *buffer, size_t *lenp, loff_t *ppos)
5116 {
5117         struct net *net;
5118         int delay;
5119         int ret;
5120         if (!write)
5121                 return -EINVAL;
5122
5123         net = (struct net *)ctl->extra1;
5124         delay = net->ipv6.sysctl.flush_delay;
5125         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5126         if (ret)
5127                 return ret;
5128
5129         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5130         return 0;
5131 }
5132
/* Clamp bounds (extra1/extra2) for the skip_notify_on_dev_down sysctl. */
static int zero;
static int one = 1;
5135
/* Template for the per-netns /proc/sys/net/ipv6/route/ table.
 * NOTE: entry order is significant -- ipv6_route_sysctl_init() rewrites
 * the .data pointers by array index when cloning this template.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,	/* write-only trigger */
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same backing field as gc_min_interval, in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};
5218
/* Clone the route sysctl template for @net, repointing each entry's
 * .data at the corresponding per-netns field.  The numeric indices
 * below must stay in sync with ipv6_route_table_template[].
 * Returns NULL on allocation failure; the caller owns the table and
 * frees it after unregistering.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler reads netns from extra1 */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;	/* hides only "flush" */
	}

	return table;
}
5248 #endif
5249
/* Per-netns setup of the IPv6 routing engine: clone the dst_ops
 * template, allocate the always-present null (and, with multiple
 * tables, prohibit/blackhole) entries, and seed GC/PMTU sysctl
 * defaults.  On failure, unwinds the allocations in reverse order
 * via the goto chain at the bottom.  Returns 0 or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	/* templates carry no per-netns ops/metrics; fix them up here */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* GC/PMTU tuning defaults; all overridable through sysctl */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5325
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and release the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5336
5337 static int __net_init ip6_route_net_init_late(struct net *net)
5338 {
5339 #ifdef CONFIG_PROC_FS
5340         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5341                         sizeof(struct ipv6_route_iter));
5342         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5343                         rt6_stats_seq_show, NULL);
5344 #endif
5345         return 0;
5346 }
5347
/* Late per-netns teardown: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5355
/* Core per-netns lifecycle hooks for the IPv6 routing engine. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5360
5361 static int __net_init ipv6_inetpeer_init(struct net *net)
5362 {
5363         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5364
5365         if (!bp)
5366                 return -ENOMEM;
5367         inet_peer_base_init(bp);
5368         net->ipv6.peers = bp;
5369         return 0;
5370 }
5371
/* Tear down the per-netns inetpeer base: detach the pointer, then
 * invalidate the peer tree and free the base.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	/* clear before invalidating -- nothing may reach a dying tree */
	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
5380
/* Per-netns lifecycle hooks for the IPv6 inetpeer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

/* Late-stage per-netns hooks (proc entries). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

/* Runs after addrconf's notifier (lower priority number runs later). */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5395
5396 void __init ip6_route_init_special_entries(void)
5397 {
5398         /* Registering of the loopback is done before this portion of code,
5399          * the loopback reference in rt6_info will not be taken, do it
5400          * manually for init_net */
5401         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5402         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5403         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5404   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5405         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5406         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5407         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5408         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5409   #endif
5410 }
5411
/* Module init for the IPv6 routing subsystem.  Sets up the dst slab
 * cache, pernet subsystems, FIB, xfrm and policy rules, the rtnetlink
 * route handlers, the device notifier and the per-cpu uncached lists.
 * Each failure unwinds everything registered so far (reverse order).
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts share the same slab as regular rt6_info */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink route message handlers; one shared unwind label --
	 * rtnl_unregister_all() removes whatever registered successfully.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* per-cpu lists of routes not attached to any fib node */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5503
/* Module teardown: undo ip6_route_init() in reverse registration
 * order.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}